Add linux-mips32be target for new platform

Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Andy Polyakov <appro@openssl.org> (Merged from https://github.com/openssl/openssl/pull/3300) (cherry picked from commit d674242a88)
c6x/* "facelift":
2017-08-30 21:45:26 +01:00 · 2017-08-30 21:27:46 +01:00 · 2017-08-30 21:26:43 +01:00 · 2016-11-14 17:00:41 -05:00 · 2016-11-14 21:32:05 +01:00 · 2016-06-21 23:44:54 +02:00
148 changed files with 18915 additions and 3473 deletions
--- a/41
+++ b/41
@ -4,6 +4,47 @@

 Changes between 1.0.1 and 1.1.0  [xx XXX xxxx]

+  *) Add perl scripts to calculate FIPS signatures for Windows
+     executables including WinCE.
+     [Andy Polyakov]
+
+  *) Don't attempt to insert current time into AES/3DES tests, we should
+     be just copying input line across and this breaks some systems lacking
+     ctime. 
+     [Steve Henson]
+
+  *) Update Windows build system for FIPS. Don't compile algorithm test
+     utilities by default: the target build_tests is needed for that. Add
+     support for building fips_algvs with the build_algvs target.
+     [Steve Henson]
+
+  *) Add initial cross compilation support for Windows build. The following
+     environment variables should be set:
+
+     FIPS_SHA1_PATH: path to fips_standalone_sha1 executable which will
+     be used explicitly and not built.
+     FIPS_SIG: similar to other builds: path to a "get signature" script
+     which is used to obtain the signature of the target instead of
+     executing it on the host.
+     [Steve Henson]
+
+  *) Add flag to EC_KEY to use cofactor ECDH if set.
+     [Steve Henson]
+
+  *) Update fips_test_suite to support multiple command line options. New
+     test to induce all self test errors in sequence and check expected
+     failures.
+     [Steve Henson]
+
+  *) Add FIPS_{rsa,dsa,ecdsa}_{sign,verify} functions which digest and
+     sign or verify all in one operation.
+     [Steve Henson]
+
+  *) Add fips_algvs: a multicall fips utility incorporating all the algorithm
+     test programs and fips_test_suite. Includes functionality to parse
+     the minimal script output of fipsalgtest.pl directly.
+     [Steve Henson]
+
  *) Add authorisation parameter to FIPS_module_mode_set().
     [Steve Henson]

--- a/37
+++ b/37
@ -132,14 +132,17 @@ my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
-my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+# EXTREME: original asm spec was missing colon and final term.
+#my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o::::::::";
+my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::::void";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";
 my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o s390x-gf2m.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:";
 my $armv4_asm="armcap.o armv4cpuid.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::void";
+my $aarch64_asm="armcap.o arm64cpuid.o mem_clr.o:::aes_core.o aes_cbc.o aesv8-armx.o:::sha1-armv8.o sha256-armv8.o sha512-armv8.o:::::::ghashv8-armx.o:";
 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::32";
 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o::64";
-my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o::::::::";
-my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o::::::::";
+my $ppc32_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
+my $ppc64_asm="ppccpuid.o ppccap.o:bn-ppc.o ppc-mont.o ppc64-mont.o::aes_core.o aes_cbc.o aes-ppc.o aesp8-ppc.o:::sha1-ppc.o sha256-ppc.o sha512-ppc.o sha256p8-ppc.o sha512p8-ppc.o:::::::ghashp8-ppc.o:";
 my $no_asm=":::::::::::::::void";

 # As for $BSDthreads. Idea is to maintain "collective" set of flags,
@ -341,6 +344,8 @@ my %table=(
 # *-generic* is endian-neutral target, but ./config is free to
 # throw in -D[BL]_ENDIAN, whichever appropriate...
 "linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+#### Extreme add linux-mips32be
+"linux-mips32be","gcc:-DB_ENDIAN -DTERMIO -O3 -march=mips32 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${mips32_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ppc",	"gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc32_asm}:linux32:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # It's believed that majority of ARM toolchains predefine appropriate -march.
 # If you compiler does not, do complement config command line with one!
@ -356,6 +361,8 @@ my %table=(
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+"linux-x86_64-cross",  "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DFIPS_REF_POINT_IS_CROSS_COMPILER_AWARE::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+"linux-i686-cross",	"gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DFIPS_REF_POINT_IS_CROSS_COMPILER_AWARE::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux64-s390x",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
 #### So called "highgprs" target for z/Architecture CPUs
 # "Highgprs" is kernel feature first implemented in Linux 2.6.32, see
@ -397,11 +404,14 @@ my %table=(
 "linux-alpha+bwx-gcc","gcc:-O3 -DL_ENDIAN -DTERMIO::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
 "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
+# eCos ARMv4/5
+"ecos-armv4", "gcc:-D__ECOS__ -I\$(ECOSCFG)/include -Wall -Wpointer-arith -Wstrict-prototypes -Wundef -Wno-write-strings -mno-thumb-interwork -mcpu=arm926ej-s -g -O2 -fno-exceptions::-D_REENTRANT::-nostartfiles -L\$(ECOSCFG)/lib -Ttarget.ld::".eval{my $asm=$armv4_asm;$asm=~s/armcap.o//;$asm},

 # Android: linux-* but without -DTERMIO and pointers to headers and libs.
 "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "android-x86","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:".eval{my $asm=${x86_elf_asm};$asm=~s/:elf/:android/;$asm}.":dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
-"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-pie%-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"android64-aarch64","gcc:-mandroid -fPIC -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -Wall::-D_REENTRANT::-pie%-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${aarch64_asm}:linux64:dlfcn:linux-shared:::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

 #### *BSD [do see comment about ${BSDthreads} above!]
 "BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
@ -409,6 +419,8 @@ my %table=(
 "BSD-x86-elf",	"gcc:-DL_ENDIAN -DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "debug-BSD-x86-elf",	"gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall -g::${BSDthreads}:::BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "BSD-sparcv8",	"gcc:-DB_ENDIAN -DTERMIOS -O3 -mv8 -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${sparcv8_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"BSD-ppc85xx","gcc:-DTERMIOS -O3 -fomit-frame-pointer -msoft-float -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"debug-BSD-ppc85xx","gcc:-DTERMIOS -O0 -fomit-frame-pointer -msoft-float -Wall -g::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

 "BSD-generic64","gcc:-DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 # -DMD32_REG_T=int doesn't actually belong in sparc64 target, it
@ -429,6 +441,7 @@ my %table=(
 # QNX
 "qnx4",	"cc:-DL_ENDIAN -DTERMIO::(unknown):::${x86_gcc_des} ${x86_gcc_opts}:",
 "QNX6",       "gcc:-DTERMIOS::::-lsocket::${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
+"QNX6-armv4",	"gcc:-DTERMIOS -O2 -Wall:::::BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "QNX6-i386",  "gcc:-DL_ENDIAN -DTERMIOS -O2 -Wall::::-lsocket:${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

 # BeOS
@ -578,6 +591,10 @@ my %table=(
 "debug-darwin-i386-cc","cc:-arch i386 -g3 -DL_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:BN_LLONG RC4_INT RC4_CHUNK DES_UNROLL BF_PTR:${x86_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch i386 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "darwin64-x86_64-cc","cc:-arch x86_64 -O3 -DL_ENDIAN -Wall::-D_REENTRANT:MACOSX:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:macosx:dlfcn:darwin-shared:-fPIC -fno-common:-arch x86_64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
 "debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${ppc32_asm}:osx32:dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+# iPhoneOS/iOS
+"iphoneos-cross","llvm-gcc:-O3 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fomit-frame-pointer -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"ios-cross","cc:-O3 -arch armv7 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:armcap.o armv4cpuid_ios.o:bn_asm.o armv4-mont.o armv4-gf2m.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o::ios32:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
+"ios64-cross","cc:-O3 -arch arm64 -mios-version-min=7.0.0 -isysroot \$(CROSS_TOP)/SDKs/\$(CROSS_SDK) -fno-common::-D_REENTRANT:iOS:-Wl,-search_paths_first%:SIXTY_FOUR_BIT_LONG RC4_CHAR -RC4_CHUNK DES_INT DES_UNROLL -BF_PTR:${aarch64_asm}:ios64:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",

 ##### A/UX
 "aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
@ -594,12 +611,14 @@ my %table=(
 ##### VxWorks for various targets
 "vxworks-ppc60x","ccppc:-D_REENTRANT -mrtp -mhard-float -mstrict-align -fno-implicit-fp -DPPC32_fp60x -O2 -fstrength-reduce -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/usr/h/wrn/coreip:::VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/ppc/PPC32/common:::::",
 "vxworks-ppcgen","ccppc:-D_REENTRANT -mrtp -msoft-float -mstrict-align -O1 -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/usr/h/wrn/coreip:::VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/ppc/PPC32/sfcommon:::::",
+"vxworks-ppcgen-kernel","ccppc:-D_REENTRANT -msoft-float -mstrict-align -O1 -fno-builtin -fno-strict-aliasing -Wall -DCPU=PPC32 -DTOOL_FAMILY=gnu -DTOOL=gnu -I\$(WIND_BASE)/target/h -I\$(WIND_BASE)/target/h/wrn/coreip:::VXWORKS::::::",
 "vxworks-ppc405","ccppc:-g -msoft-float -mlongcall -DCPU=PPC405 -I\$(WIND_BASE)/target/h:::VXWORKS:-r:::::",
 "vxworks-ppc750","ccppc:-ansi -nostdinc -DPPC750 -D_REENTRANT -fvolatile -fno-builtin -fno-for-scope -fsigned-char -Wall -msoft-float -mlongcall -DCPU=PPC604 -I\$(WIND_BASE)/target/h \$(DEBUG_FLAG):::VXWORKS:-r:::::",
 "vxworks-ppc750-debug","ccppc:-ansi -nostdinc -DPPC750 -D_REENTRANT -fvolatile -fno-builtin -fno-for-scope -fsigned-char -Wall -msoft-float -mlongcall -DCPU=PPC604 -I\$(WIND_BASE)/target/h -DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -DDEBUG -g:::VXWORKS:-r:::::",
 "vxworks-ppc860","ccppc:-nostdinc -msoft-float -DCPU=PPC860 -DNO_STRINGS_H -I\$(WIND_BASE)/target/h:::VXWORKS:-r:::::",
 "vxworks-simlinux","ccpentium:-B\$(WIND_BASE)/host/\$(WIND_HOST_TYPE)/lib/gcc-lib/ -D_VSB_CONFIG_FILE=\"\$(WIND_BASE)/target/lib/h/config/vsbConfig.h\" -DL_ENDIAN -DCPU=SIMLINUX -DTOOL_FAMILY=gnu -DTOOL=gnu -fno-builtin -fno-defer-pop -DNO_STRINGS_H -I\$(WIND_BASE)/target/h -I\$(WIND_BASE)/target/h/wrn/coreip -DOPENSSL_NO_HW_PADLOCK:::VXWORKS:-r::${no_asm}::::::ranlibpentium:",
 "vxworks-mips","ccmips:-mrtp -mips2 -O -G 0 -B\$(WIND_BASE)/host/\$(WIND_HOST_TYPE)/lib/gcc-lib/ -D_VSB_CONFIG_FILE=\"\$(WIND_BASE)/target/lib/h/config/vsbConfig.h\" -DCPU=MIPS32 -msoft-float -mno-branch-likely -DTOOL_FAMILY=gnu -DTOOL=gnu -fno-builtin -fno-defer-pop -DNO_STRINGS_H -I\$(WIND_BASE)/target/usr/h -I\$(WIND_BASE)/target/h/wrn/coreip::-D_REENTRANT:VXWORKS:-Wl,--defsym,__wrs_rtp_base=0xe0000000 -L \$(WIND_BASE)/target/usr/lib/mips/MIPSI32/sfcommon::${mips32_asm}:o32::::::ranlibmips:",
+"vxworks-pentium","ccpentium:-Os -B\$(WIND_BASE)/host/\$(WIND_HOST_TYPE)/lib/gcc-lib/ -D_VSB_CONFIG_FILE=\"\$(WIND_BASE)/target/lib/h/config/vsbConfig.h\" -DL_ENDIAN -DCPU=PENTIUM4 -DTOOL_FAMILY=gnu -DTOOL=gnu -fno-builtin -fno-defer-pop -D_WRS_KERNEL -D_WRS_VX_SMP -I\$(WIND_BASE)/target/h -I\$(WIND_BASE)/target/h/wrn/coreip -DOPENSSL_NO_HW_PADLOCK:::VXWORKS:-r::${no_asm}::::::ranlibpentium:",

 ##### Compaq Non-Stop Kernel (Tandem)
 "tandem-c89","c89:-Ww -D__TANDEM -D_XOPEN_SOURCE -D_XOPEN_SOURCE_EXTENDED=1 -D_TANDEM_SOURCE -DB_ENDIAN::(unknown):::THIRTY_TWO_BIT:::",
@ -608,12 +627,15 @@ my %table=(
 "uClinux-dist","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):BN_LLONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",
 "uClinux-dist64","$ENV{'CC'}:\$(CFLAGS)::-D_REENTRANT::\$(LDFLAGS) \$(LDLIBS):SIXTY_FOUR_BIT_LONG:${no_asm}:$ENV{'LIBSSL_dlfcn'}:linux-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):$ENV{'RANLIB'}::",

+"c64xplus","cl6x:-mv6400+ -o2 -ox -ms -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:",
+"c64x","cl6x:-mv6400 -o2 -ox -ms -as -pden -DNO_SYS_TYPES_H -DGETPID_IS_MEANINGLESS -DMD32_REG_T=int -DOPENSSL_SMALL_FOOTPRINT:<c6x.h>::DSPBIOS:::c64xcpuid.o:::aes-c64x.o aes_cbc.o aes_ctr.o:::sha1-c64x.o sha256-c64x.o sha512-c64x.o:::::::::void:",
+
 );

 my @MK1MF_Builds=qw(VC-WIN64I VC-WIN64A
 		    debug-VC-WIN64I debug-VC-WIN64A
 		    VC-NT VC-CE VC-WIN32 debug-VC-WIN32
-		    BC-32 
+		    BC-32 c64xplus c64x
 		    netware-clib netware-clib-bsdsock
 		    netware-libc netware-libc-bsdsock);

@ -906,6 +928,7 @@ EOF
 				}
 			elsif (/^-[^-]/ or /^\+/)
 				{
+				$_ =~ s/%([0-9a-f]{1,2})/chr(hex($1))/gei;
 				$flags.=$_." ";
 				}
 			elsif (/^--prefix=(.*)$/)
@ -1553,7 +1576,7 @@ if ($rmd160_obj =~ /\.o$/)
 	}
 if ($aes_obj =~ /\.o$/)
 	{
-	$cflags.=" -DAES_ASM";
+	 $cflags.=" -DAES_ASM" if ($aes_obj =~ m/\baes\-/);
 	# aes_ctr.o is not a real file, only indication that assembler
 	# module implements AES_ctr32_encrypt...
 	$cflags.=" -DAES_CTR_ASM" if ($aes_obj =~ s/\s*aes_ctr\.o//);
@ -1574,7 +1597,7 @@ else	{
 	$wp_obj="wp_block.o";
 	}
 $cmll_obj=$cmll_enc	unless ($cmll_obj =~ /.o$/);
-if ($modes_obj =~ /ghash/)
+if ($modes_obj =~ /ghash\-/)
 	{
 	$cflags.=" -DGHASH_ASM";
 	}
--- a/Makefile.fips
+++ b/Makefile.fips
@ -186,7 +186,7 @@ SHARED_LDFLAGS=
 GENERAL=        Makefile
 BASENAME=       openssl
 NAME=           $(BASENAME)-$(VERSION)
-TARFILE=        openssl-fips-2.0-test.tar
+TARFILE=        openssl-fips-2.0.tar
 WTARFILE=       $(NAME)-win.tar
 EXHEADER=       e_os2.h
 HEADER=         e_os.h
@ -387,6 +387,8 @@ build_apps:
 	@dir=apps; target=all; $(BUILD_ONE_CMD)
 build_tests:
 	@dir=test; target=fipsexe; $(BUILD_ONE_CMD)
+build_algvs:
+	@dir=test; target=fipsalgvs; $(BUILD_ONE_CMD)
 build_tools:
 	@dir=tools; target=all; $(BUILD_ONE_CMD)

@ -522,8 +524,8 @@ files:
 links:
 	@$(PERL) $(TOP)/util/mkdir-p.pl include/openssl
 	@$(PERL) $(TOP)/util/mklink.pl include/openssl $(EXHEADER)
-	@set -e; dir=fips target=links; $(RECURSIVE_BUILD_CMD)
-	@(cd crypto ; SDIRS='$(LINKDIRS)' $(MAKE) -e links)
+	@set -e; dir=fips target=links; $(BUILD_ONE_CMD)
+	@(cd crypto ; TEST='' SDIRS='$(LINKDIRS)' $(MAKE) -e links)

 gentests:
 	@(cd test && echo "generating dummy tests (if needed)..." && \
@ -536,9 +538,7 @@ dclean:
 test:   tests

 tests:
-	@(cd test && echo "testing..." && \
-	$(CLEARENV) && $(MAKE) -e $(BUILDENV) TOP=.. TESTS='$(TESTS)' OPENSSL_DEBUG_MEMORY=on OPENSSL_CONF=../apps/openssl.cnf tests );
-	OPENSSL_CONF=apps/openssl.cnf util/opensslwrap.sh version -a
+	@echo "Not implemented in FIPS build" ; false

 report:
 	@$(PERL) util/selftest.pl
--- a/README.FIPS
+++ b/README.FIPS
@ -1,4 +1,4 @@
-Preliminary status and build information for FIPS module v2.0
+Preliminary status and build information for FIPS module v2.0 

 NB: if you are cross compiling you now need to use the latest "incore" script
 this can be found at util/incore in the tarballs.
--- a/33
+++ b/33
@ -3465,6 +3465,39 @@ $ranlib       =
 $arflags      = 
 $multilib     = 

+*** iphoneos-cross
+$cc           = llvm-gcc
+$cflags       = -O3 -isysroot $(CROSS_TOP)/SDKs/$(CROSS_SDK) -fomit-frame-pointer -fno-common
+$unistd       = 
+$thread_cflag = -D_REENTRANT
+$sys_id       = iOS
+$lflags       = -Wl,-search_paths_first%
+$bn_ops       = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
+$cpuid_obj    = 
+$bn_obj       = 
+$des_obj      = 
+$aes_obj      = 
+$bf_obj       = 
+$md5_obj      = 
+$sha1_obj     = 
+$cast_obj     = 
+$rc4_obj      = 
+$rmd160_obj   = 
+$rc5_obj      = 
+$wp_obj       = 
+$cmll_obj     = 
+$modes_obj    = 
+$engines_obj  = 
+$perlasm_scheme = void
+$dso_scheme   = dlfcn
+$shared_target= darwin-shared
+$shared_cflag = -fPIC -fno-common
+$shared_ldflag = -dynamiclib
+$shared_extension = .$(SHLIB_MAJOR).$(SHLIB_MINOR).dylib
+$ranlib       = 
+$arflags      = 
+$multilib     = 
+
 *** irix-cc
 $cc           = cc
 $cflags       = -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN
--- a/c6x/do_fips
+++ b/c6x/do_fips
@ -0,0 +1,12 @@
+#!/bin/sh
+# Build the FIPS canister and fips_algvs with TI cl6x; stops at the first failing step.
+if ! command -v cl6x > /dev/null 2>&1; then
+	echo 'fatal: cl6x is not on $PATH' >&2
+	exit 1
+fi
+# Configure target defaults to c64xplus; override it through C6XPLATFORM.
+perl Configure ${C6XPLATFORM:-c64xplus} fipscanisteronly no-engine || exit 1
+perl util/mkfiles.pl > MINFO || exit 1
+perl util/mk1mf.pl auto > c6x/fips.mak || exit 1
+make -f c6x/fips.mak || exit 1
+make -f c6x/fips_algvs.mak
--- a/c6x/env
+++ b/c6x/env
@ -0,0 +1,7 @@
+# MSYS-style PATH: prepend the CCStudio v3.3 c6000 cgtools and ActivePerl58 bin directories
+export PATH=/c/CCStudio_v3.3/c6000/cgtools/bin:/c/Program\ Files/ActivePerl58/bin:$PATH
+
+# Windows-style variables: C6X_C_DIR lists the cgtools include and lib directories, ';'-separated
+export C6X_C_DIR='C:\CCStudio_v3.3\c6000\cgtools\include;C:\CCStudio_v3.3\c6000\cgtools\lib'
+# Perl module search path; presumably needed for the CCStudio scripting modules -- TODO confirm
+export PERL5LIB=C:/CCStudio_v3.3/bin/utilities/ccs_scripting
--- a/c6x/fips_algvs.mak
+++ b/c6x/fips_algvs.mak
@ -0,0 +1,14 @@
+CC=cl6x
+CFLAGS=-mv$${C6XSILICON:-6400+} -o2 -I. -Ic6x/inc -Ifips -DNO_SYS_TYPES_H
+OBJ_D=c6x/tmp
+OUT_D=c6x
+# Default goal: the linked fips_algvs.out, post-processed by incore6x.
+all:	$(OUT_D)/fips_algvs.out
+# Compile the test driver; the shell expands C6XSILICON (default 6400+) inside CFLAGS at recipe time.
+$(OBJ_D)/fips_algvs.obj:	test/fips_algvs.c
+	$(CC) --obj_directory=$(OBJ_D) $(CFLAGS) -c $<
+# Verify the canister signature, link, then run incore6x; if incore6x fails the output is removed AND the build fails.
+$(OUT_D)/fips_algvs.out:	$(OBJ_D)/fips_algvs.obj $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+	$(OUT_D)/fips_standalone_sha1 -verify $(OUT_D)/fipscanister.obj
+	$(CC) -z -o $@ -m $(OUT_D)/fips_algvs.map $< $(OUT_D)/fipscanister.obj c6x/fips_algvs.cmd
+	$(OUT_D)/incore6x $@ || { rm -f $@; exit 1; }
--- a/c6x/fips_standalone_sha1
+++ b/c6x/fips_standalone_sha1
@ -0,0 +1,32 @@
+#!/usr/bin/env perl
+# Usage: fips_standalone_sha1 [-verify] file -- print the file's HMAC-SHA1; -verify checks it against file.sha1.
+$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "./";	# script directory; "./" when $0 has no path part
+
+unshift(@INC,$dir);
+require "hmac_sha1.pl";
+# The last argument must name an existing file; -verify may precede it.
+(@ARGV && -f $ARGV[$#ARGV]) || die "usage: $0 [-verify] file";
+
+$verify=shift	if ($ARGV[0] eq "-verify");
+
+sysopen(FD,$ARGV[0],0) || die "$!";	# mode 0: read-only
+binmode(FD);
+
+my $ctx = HMAC->Init("etaonrishdlcupfm");
+
+while (read(FD,$blob,4*1024)) { $ctx->Update($blob); }
+
+close(FD);
+
+my $signature = unpack("H*",$ctx->Final());
+
+print "HMAC-SHA1($ARGV[0])= $signature\n";
+# With -verify: exit 0 when the signature recorded in <file>.sha1 matches, otherwise die.
+if ($verify) {
+	open(FD,"<","$ARGV[0].sha1") || die "$!";	# 3-arg open: filename is never parsed as a mode
+	$line = <FD>;
+	close(FD);
+	exit(0)	if ($line =~ /HMAC\-SHA1\([^\)]*\)=\s*([0-9a-f]+)/i &&
+				lc($1) eq $signature);	# match is /i, unpack("H*") is lower case
+	die "signature mismatch";
+}
--- a/c6x/fipscanister.cmd
+++ b/c6x/fipscanister.cmd
@ -0,0 +1,19 @@
+SECTIONS
+{
+    .text:
+    {
+	*(.fips_text:start)
+	*(.text)
+	*(.const:aes_asm)
+	*(.const:sha_asm)
+	*(.const:des_sptrans)
+	*(.switch)
+	*(.fips_text:end)
+    }
+    .const:
+    {
+	*(.fips_const:start)
+	*(.const)
+	*(.fips_const:end)
+    }
+}
--- a/c6x/hmac_sha1.pl
+++ b/c6x/hmac_sha1.pl
@ -0,0 +1,196 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2011 The OpenSSL Project.
+#
+######################################################################
+#
+# SHA1 and HMAC in Perl by <appro@openssl.org>.
+#
+{ package SHA1;
+  use integer;
+  # Pure-Perl SHA1; 'use integer' applies to all arithmetic inside this package block.
+    {
+    ################################### SHA1 block code generator
+    my @V = ('$A','$B','$C','$D','$E');	# names of the working variables, rotated once per round
+    my $i;				# round counter read by the generator subs below
+    # Source text that recomputes $W[$i%16] from four earlier schedule words, rotated left by one bit.
+    sub XUpdate {
+      my $ret;
+	$ret="(\$T=\$W[($i-16)%16]^\$W[($i-14)%16]^\$W[($i-8)%16]^\$W[($i-3)%16],\n\t";
+	if ((1<<31)<<1) {	# non-zero only when perl integers are wider than 32 bits
+	    $ret.="    \$W[$i%16]=((\$T<<1)|(\$T>>31))&0xffffffff)\n\t  ";
+	} else {
+	    $ret.="    \$W[$i%16]=(\$T<<1)|((\$T>>31)&1))\n\t  ";
+	}
+    }
+    sub tail {	# source text ending every round: adds $a rotated left by 5, then rotates $b left by 30
+      my ($a,$b,$c,$d,$e)=@V;
+      my $ret;
+	if ((1<<31)<<1) {
+	    $ret.="(($a<<5)|($a>>27));\n\t";
+	    $ret.="$b=($b<<30)|($b>>2);	$e&=0xffffffff;	#$b&=0xffffffff;\n\t";
+	} else {
+	    $ret.="(($a<<5)|($a>>27)&0x1f);\n\t";
+	    $ret.="$b=($b<<30)|($b>>2)&0x3fffffff;\n\t";
+	}
+      $ret;
+    }
+    sub BODY_00_15 {	# rounds 0..15: message word read directly from $W[$i]
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=\$W[$i]+0x5a827999+((($c^$d)&$b)^$d)+".tail();
+    }
+    sub BODY_16_19 {	# rounds 16..19: same round function, word supplied by XUpdate()
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x5a827999+((($c^$d)&$b)^$d)+".tail();
+    }
+    sub BODY_20_39 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x6ed9eba1+($b^$c^$d)+".tail();
+    }
+    sub BODY_40_59 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0x8f1bbcdc+(($b&$c)|(($b|$c)&$d))+".tail();
+    }
+    sub BODY_60_79 {
+	my ($a,$b,$c,$d,$e)=@V;
+	"$e+=".XUpdate()."+0xca62c1d6+($b^$c^$d)+".tail();
+    }
+    # Assemble the text of SHA1::block, which consumes sixteen big-endian 32-bit words (one 64-byte block).
+    my $sha1_impl =
+    'sub block {
+	my $self = @_[0];
+	my @W    = unpack("N16",@_[1]);
+	my ($A,$B,$C,$D,$E,$T) = @{$self->{H}};
+	';
+	# mask $A and $B to 32 bits when perl integers are wider than that
+	$sha1_impl.='
+	$A &= 0xffffffff;
+	$B &= 0xffffffff;
+	' if ((1<<31)<<1);
+	# append the 80 rounds; rotating @V after each one renames the working variables for the next round
+	for($i=0;$i<16;$i++){ $sha1_impl.=BODY_00_15(); unshift(@V,pop(@V)); }
+	for(;$i<20;$i++)    { $sha1_impl.=BODY_16_19(); unshift(@V,pop(@V)); }
+	for(;$i<40;$i++)    { $sha1_impl.=BODY_20_39(); unshift(@V,pop(@V)); }
+	for(;$i<60;$i++)    { $sha1_impl.=BODY_40_59(); unshift(@V,pop(@V)); }
+	for(;$i<80;$i++)    { $sha1_impl.=BODY_60_79(); unshift(@V,pop(@V)); }
+	# add the working variables back into the chaining state and close the generated sub
+	$sha1_impl.='
+	$self->{H}[0]+=$A;	$self->{H}[1]+=$B;	$self->{H}[2]+=$C;
+	$self->{H}[3]+=$D;	$self->{H}[4]+=$E;	}';
+
+    #print $sha1_impl,"\n";
+    eval($sha1_impl);		# generate code
+    }
+    # Constructor: returns a new object with the five-word chaining state in {H} and byte count {N} = 0.
+    sub Init {
+	my $class = shift;	# multiple instances...
+	my $self  = {};
+
+	bless $self,$class;
+	$self->{H} = [0x67452301,0xefcdab89,0x98badcfe,0x10325476,0xc3d2e1f0];
+	$self->{N} = 0;
+	return $self;
+    }
+    # Absorb the given strings: block() is called per full 64 bytes, the remainder stays in {buf}. Returns $self.
+    sub Update {
+	my $self = shift;
+	my $msg;
+
+	foreach $msg (@_) {
+	    my $len  = length($msg);
+	    my $num  = length($self->{buf});
+	    my $off  = 0;
+
+	    $self->{N} += $len;	# running total of message bytes
+
+	    if (($num+$len)<64)	# still short of a full block: only buffer
+	    {	$self->{buf} .= $msg; next;	}
+	    elsif ($num)		# top up and process the partially filled buffer first
+	    {	$self->{buf} .= substr($msg,0,($off=64-$num));
+		$self->block($self->{buf});
+	    }
+
+	    while(($off+64) <= $len)
+	    {	$self->block(substr($msg,$off,64));
+		$off += 64;
+	    }
+
+	    $self->{buf} = substr($msg,$off);
+	}
+	return $self;
+    }
+    # Pad the buffered tail, append the message bit length, and return the digest as a 20-byte binary string.
+    sub Final {
+	my $self = shift;
+	my $num  = length($self->{buf});
+
+	$self->{buf} .= chr(0x80); $num++;
+	if ($num>56)		# no room left for the 8-byte length: flush an extra block
+	{   $self->{buf} .= chr(0)x(64-$num);
+	    $self->block($self->{buf});
+	    $self->{buf}=undef;
+	    $num=0;
+	}
+	$self->{buf} .= chr(0)x(56-$num);
+	$self->{buf} .= pack("N2",($self->{N}>>29)&0x7,$self->{N}<<3);	# bit count as two 32-bit words
+	$self->block($self->{buf});
+
+	return pack("N*",@{$self->{H}});
+    }
+    # Known-answer tests; dies with "SHA1 test#N" when a computed digest differs from the expected hex string.
+    sub Selftest {
+	my $hash;
+
+	$hash=SHA1->Init()->Update('abc')->Final();
+	die "SHA1 test#1" if (unpack("H*",$hash) ne 'a9993e364706816aba3e25717850c26c9cd0d89d');
+
+	$hash=SHA1->Init()->Update('abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq')->Final();
+	die "SHA1 test#2" if (unpack("H*",$hash) ne '84983e441c3bd26ebaae4aa1f95129e5e54670f1');
+
+	#$hash=SHA1->Init()->Update('a'x1000000)->Final();
+	#die "SHA1 test#3" if (unpack("H*",$hash) ne '34aa973cd4c4daa4f61eeb2bdbad27316534016f');
+    }
+}
+
+{ package HMAC;
+    # HMAC-SHA1 on top of the SHA1 package; Init and Update return the object so calls can be chained.
+    sub Init {
+	my ($class,$key) = @_;
+	my $self = bless {},$class;
+
+	# a key longer than one 64-byte block is replaced by its SHA1 digest,
+	# after which the key is zero-padded to exactly 64 bytes
+	$key = SHA1->Init()->Update($key)->Final()	if (length($key)>64);
+	$key .= chr(0x00)x(64-length($key));
+
+	my @kbytes = unpack("C*",$key);
+
+	# the inner hash is primed with key^0x36; key^0x5c is saved for Final
+	$self->{hash} = SHA1->Init();
+	$self->{hash}->Update(pack("C*",map($_^0x36,@kbytes)));
+	$self->{okey} = pack("C*",map($_^0x5c,@kbytes));
+
+	return $self;
+    }
+
+    sub Update {	# feed message strings to the inner hash
+	my ($self,@chunks) = @_;
+	$self->{hash}->Update(@chunks);
+	return $self;
+    }
+
+    sub Final {		# returns the 20-byte MAC as a binary string
+	my ($self) = @_;
+	my $outer = SHA1->Init();
+	return $outer->Update($self->{okey},$self->{hash}->Final())->Final();
+    }
+
+    sub Selftest {	# known-answer check; dies with "HMAC test" on mismatch
+	my $expected = '0922d3405faa3d194f82a45830737d5cc6c75d24';
+	my $mac = HMAC->Init('0123456789:;<=>?@ABC')->Update('Sample #2')->Final();
+
+	(unpack("H*",$mac) ne $expected) and die "HMAC test";
+    }
+}
+
+1;
--- a/c6x/incore6x
+++ b/c6x/incore6x
@ -0,0 +1,241 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2011 The OpenSSL Project.
+#
+# The script embeds fingerprint into TI-COFF executable object.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+unshift(@INC,$dir);
+require "hmac_sha1.pl";
+
+######################################################################
+#
+# COFF symbol table parser by <appro@openssl.org>. The table entries
+# are extended with offset within executable file...
+#
+{ package COFF;
+  # Minimal TI-COFF reader. Load() parses the file header, the strings
+  # table, the section headers and the symbol table, and keeps selected
+  # symbols (extended with their offset within the file) in
+  # $self->{symbols}, keyed by symbol name.
+  use FileHandle;
+
+    # return a reference to a fresh hash built from the key/value list
+    sub dup  { my %copy=map {$_} @_; return \%copy; }
+
+    # COFF->Load($filename): open the file read-only, parse it and return
+    # the populated object. Dies on any failed open/seek/read and when the
+    # header's version field is not 0xC2.
+    sub Load {
+	my $class = shift;
+	my $self  = {};
+	my $FD    = FileHandle->new();	# autoclose
+
+	bless $self,$class;
+
+	sysopen($FD,shift,0) or die "$!";
+	binmode($FD);
+
+	#################################################
+	# read and parse COFF header...
+	#
+	# the header is 22 bytes, all fields little-endian
+	read($FD,my $coff,22) or die "$!";
+
+	my %coff_header;
+	@coff_header{version,nsects,date,syms_off,nsyms,opt,flags,magic}=
+		unpack("v2V3v3",$coff);
+
+	# die derives the exit status from $!, hence the assignment
+	$!=42;		# signal fipsld to revert to two-step link
+	die "not TI-COFF file" if ($coff_header{version} != 0xC2);
+
+	# not referenced again in this routine
+	my $big_endian = ($coff_header{flags}>>9)&1;	# 0 or 1
+
+	my $strings;
+	my $symsize;
+
+	#################################################
+	# load strings table
+	#
+	# the table follows the 18-byte symbol entries; its first 4 bytes
+	# hold its size, and the rest is read in at buffer offset 4, so
+	# that offsets found in name fields index $strings directly
+	seek($FD,$coff_header{syms_off}+18*$coff_header{nsyms},0) or die "$!";
+	read($FD,$strings,4) or die "$!";
+	$symsize = unpack("V",$strings);
+	read($FD,$strings,$symsize,4) or die "$!";
+
+	#################################################
+	# read sections
+	#
+	my $i;
+	my @sections;
+
+	# seek to section headers
+	# (they start after the 22-byte header plus {opt} more bytes)
+	seek($FD,22+@coff_header{opt},0) or die "$!";
+	for ($i=0;$i<$coff_header{nsects};$i++) {
+	    my %coff_shdr;
+	    my $name;
+
+	    # each section header is 48 bytes, of which 44 are unpacked
+	    read($FD,my $section,48) or die "$!";
+
+	    @coff_shdr{sh_name,sh_phaddr,sh_vaddr,
+			sh_size,sh_offset,sh_relocs,sh_reserved,
+			sh_relocoff,sh_lines,sh_flags} =
+		unpack("a8V9",$section);
+
+	    $name = $coff_shdr{sh_name};
+	    # see if sh_name is an offset in $strings
+	    # ($hi is the first 32-bit word of the field, $lo the second)
+	    my ($hi,$lo) = unpack("V2",$name);
+	    if ($hi==0 && $lo<$symsize) {
+		$name = substr($strings,$lo,64);
+	    }
+	    # keep the name up to the first NUL byte
+	    $coff_shdr{sh_name} = (split(chr(0),$name))[0];
+
+	    push(@sections,dup(%coff_shdr));
+	}
+
+	#################################################
+	# load symbols table
+	#
+	seek($FD,$coff_header{syms_off},0) or die "$!";
+	for ($i=0;$i<$coff_header{nsyms};$i++) {
+	    my %coff_sym;
+	    my $name;
+
+	    read($FD,my $blob,18) or die "$!";
+
+	    @coff_sym{st_name,st_value,st_shndx,reserved,class,aux} =
+		unpack("a8Vv2C2",$blob);
+
+	    # skip aux entries
+	    # (they are counted in nsyms, so the loop index advances too)
+	    if ($coff_sym{aux}) {
+		seek($FD,18*$coff_sym{aux},1) or die "$!";
+		$i+=$coff_sym{aux};
+	    }
+
+	    $name = $coff_sym{st_name};
+	    # see if st_name is an offset in $strings
+	    my ($hi,$lo) = unpack("V2",$name);
+	    if ($hi==0 && $lo<$symsize) {
+		$name = substr($strings,$lo,64);
+	    }
+	    $coff_sym{st_name} = $name = (split(chr(0),$name))[0];
+
+	    # st_shndx counts sections from 1; only symbols that belong to
+	    # a section with non-zero file offset and whose name starts
+	    # with an underscore followed by a letter are recorded
+	    my $st_secn = $coff_sym{st_shndx}-1;
+	    if ($st_secn>=0 && $st_secn<=$#sections
+		&& @sections[$st_secn]->{sh_offset}
+		&& $name =~ m/^_[a-z]+/i) {
+		# synthesize st_offset, ...
+		$coff_sym{st_offset} = $coff_sym{st_value}
+				- @sections[$st_secn]->{sh_vaddr}
+				+ @sections[$st_secn]->{sh_offset};
+		$coff_sym{st_section} = @sections[$st_secn]->{sh_name};
+		# ... and add to lookup table
+		$self->{symbols}{$name} = dup(%coff_sym);
+	    }
+	}
+
+	return $self;
+    }
+
+    # Lookup($name): return the recorded symbol "_$name", or undef
+    sub Lookup {
+	my $self = shift;
+	my $name = shift;
+	return $self->{symbols}{"_$name"};
+    }
+
+    # Traverse($code): call $code once per recorded symbol, passing the
+    # symbol's hash reference; anything but a code reference is ignored
+    sub Traverse {
+	my $self = shift;
+	my $code = shift;
+
+	if (ref($code) eq 'CODE') {
+	    for (keys(%{$self->{symbols}})) { &$code($self->{symbols}{$_}); }
+	}
+    }
+}
+
+######################################################################
+#
+# main()
+#
+my $legacy_mode;
+
+# With a single argument the binary is processed in place. With more, the
+# first one has to be -dso or -exe, which selects legacy mode; the binary
+# is always the last argument.
+if (@ARGV==0 || (@ARGV>1 && !($legacy_mode=($ARGV[0] =~ /^\-(dso|exe)$/)))) {
+	print STDERR "usage: $0 [-dso|-exe] ti-coff-binary\n";
+	exit(1);
+}
+
+$exe = COFF->Load($ARGV[$#ARGV]);
+
+# boundaries of the fingerprinted regions and the signature slot, all five
+# symbols are mandatory
+$FIPS_text_start	= $exe->Lookup("FIPS_text_start")		or die;
+$FIPS_text_end		= $exe->Lookup("FIPS_text_end")			or die;
+$FIPS_rodata_start	= $exe->Lookup("FIPS_rodata_start")		or die;
+$FIPS_rodata_end	= $exe->Lookup("FIPS_rodata_end")		or die;
+$FIPS_signature		= $exe->Lookup("FIPS_signature")		or die;
+
+# new cross-compile support
+$FIPS_text_startX	= $exe->Lookup("FIPS_text_startX");
+$FIPS_text_endX		= $exe->Lookup("FIPS_text_endX");
+my $have_textX		= ($FIPS_text_startX && $FIPS_text_endX);
+
+unless ($legacy_mode) {
+    unless ($have_textX) {
+	print STDERR "@ARGV[$#ARGV] is not cross-compiler aware.\n";
+	exit(42);	# signal fipsld to revert to two-step link
+    }
+
+    $FINGERPRINT_ascii_value
+			= $exe->Lookup("FINGERPRINT_ascii_value");
+}
+# when both are present, the X-suffixed symbols replace the text boundaries
+($FIPS_text_start,$FIPS_text_end) = ($FIPS_text_startX,$FIPS_text_endX)
+						if ($have_textX);
+
+# legacy mode only reads the binary, otherwise it gets patched in place
+sysopen(FD,$ARGV[$#ARGV],$legacy_mode?0:2) or die "$!";	# 2 is read/write
+binmode(FD);
+
+# HMAC_Update(\$ctx,$off,$len): feed $len bytes found at offset $off of
+# the file open on FD into the HMAC object referenced by the first
+# argument. Dies if the seek or read fails, or if the file ends before
+# $len bytes could be read.
+sub HMAC_Update {
+  my ($hmac,$off,$len) = @_;
+  my $blob;
+
+    # an empty region contributes nothing; read() returns 0 for it, which
+    # must not be mistaken for a failure
+    return if ($len==0);
+
+    seek(FD,$off,0)	or die "$!";
+    my $got = read(FD,$blob,$len);
+    defined($got)	or die "$!";
+    # hashing a truncated region would yield a wrong fingerprint silently
+    $got==$len		or die "short read at offset $off: wanted $len, got $got";
+    $$hmac->Update($blob);
+}
+
+# fips/fips.c:FIPS_incore_fingerprint's Perl twin
+#
+sub FIPS_incore_fingerprint {
+  # Returns the HMAC computed over the text and rodata regions of the file
+  # open on FD, leaving out the 20 bytes at the signature's offset.
+  my ($p1,$p2,$p3,$p4,$sig) = map { $_->{st_offset} }
+	($FIPS_text_start,$FIPS_text_end,
+	 $FIPS_rodata_start,$FIPS_rodata_end,$FIPS_signature);
+  my $ctx = HMAC->Init("etaonrishdlcupfm");
+
+    # detect overlapping regions and merge them into single [p3,p4) one
+    my $text_leads = ($p1<=$p3 && $p2>=$p3);
+    if ($text_leads || ($p3<=$p1 && $p4>=$p1)) {
+	$p3 = $p1 if ($text_leads);
+	$p4 = $p2 if ($p2>$p4);
+	$p1 = $p2 = 0;
+    }
+
+    # text region, unless it was merged away above
+    HMAC_Update(\$ctx,$p1,$p2-$p1) if ($p1);
+
+    if ($sig>=$p3 && $sig<$p4) {
+	# "punch" hole: hash up to the signature, resume right after it
+	HMAC_Update(\$ctx,$p3,$sig-$p3);
+	$p3 = $sig+20;
+    }
+    HMAC_Update(\$ctx,$p3,$p4-$p3);
+
+    return $ctx->Final();
+}
+
+$fingerprint = FIPS_incore_fingerprint();
+
+# Legacy mode prints the fingerprint in hex on standard output. Otherwise
+# it is written into the binary: in hex at FINGERPRINT_ascii_value if that
+# symbol was found, else in binary form at FIPS_signature.
+if ($legacy_mode) {
+    print unpack("H*",$fingerprint);
+} elsif ($FINGERPRINT_ascii_value) {
+    seek(FD,$FINGERPRINT_ascii_value->{st_offset},0)	or die "$!";
+    print FD unpack("H*",$fingerprint)			or die "$!";
+} else {
+    seek(FD,$FIPS_signature->{st_offset},0)		or die "$!";
+    print FD $fingerprint				or die "$!";
+}
+
+# print is buffered, so a write error may only surface here; an ignored
+# failure would leave the binary without fingerprint yet report success
+close(FD) or die "$!";
--- a/c6x/run6x
+++ b/c6x/run6x
@ -0,0 +1,43 @@
+#!/usr/bin/env perl
+
+# The program to run is the first argument; if no such file exists, the
+# name is tried with ".out" appended, and the script dies if that doesn't
+# exist either.
+$exe  = @ARGV[0];
+$exe .= ".out" if (! -f $exe);
+die if (! -f $exe);
+
+use CCS_SCRIPTING_PERL;
+
+my $studio=new CCS_SCRIPTING_PERL::CCS_Scripting();
+
+$studio->CCSOpenNamed("*","*",1);	# connect to board
+$studio->TargetReset();
+
+print "loading $exe\n";
+$studio->ProgramLoad($exe);
+
+# write_string($studio,$addr,$str): store $str byte by byte at $addr in the
+# data page, followed by a zero byte. Returns the number of bytes written,
+# terminator included.
+sub write_string {
+    my ($studio,$addr,$str) = @_;
+    my $len = length($str);
+
+    foreach my $pos (0 .. $len-1) {
+	$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+$pos,8,vec($str,$pos,8));
+    }
+    # terminating NUL goes right behind the last character
+    $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+$len,8,0);
+
+    return $len+1;
+}
+
+# Lay out the argument block at __c_args: a 32-bit argument count, one
+# 32-bit pointer per argument, a terminating null pointer, and behind that
+# table the NUL-terminated strings themselves.
+$addr= $studio->SymbolGetAddress("__c_args");
+printf "setting up __c_args at 0x%X\n",$addr;#\n";
+
+$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr,32,$#ARGV+1);
+
+# strings start past the count, the pointers and the null pointer
+for ($i=0,$strings=$addr+($#ARGV+3)*4; $i<=$#ARGV; $i++) {
+    $off = write_string($studio,$strings,@ARGV[$i]);
+    $studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+4*($i+1),32,$strings);
+    $strings += $off;
+}
+# terminating null pointer; it has to name the same data page as every
+# other write, i.e. the variable from the CCS_SCRIPTING_PERL package
+$studio->MemoryWrite($CCS_SCRIPTING_PERL::PAGE_DATA,$addr+4*($i+1),32,0);
+
+print "running...\n";
+$studio->TargetRun();
--- a/c6x/run6x.js
+++ b/c6x/run6x.js
@ -0,0 +1,91 @@
+#!/usr/bin/env dss.sh
+//
+// Debug Server Scripting C6x launcher.
+//
+
+importPackage(Packages.com.ti.debug.engine.scripting);
+importPackage(Packages.com.ti.ccstudio.scripting.environment);
+importPackage(Packages.java.lang);
+
+if (arguments.length == 0) {
+    // Extract script name from eclipse
+    var regex = new RegExp("-dss\\.rhinoArgs\n(.*)");
+    var matches = regex.exec(environment["eclipse.commands"]);
+
+    System.err.println("Usage: " + matches[1] + " executable [args]");
+    System.err.println();
+    System.err.println("You're also required to set CCSTARGETCONFIG " +
+                       "environment variable to appoint");
+    System.err.println("proper .ccxml file, customarily one of " +
+                       "$HOME/ti/CCSTargetConfigurations/*.ccxml");
+    quit(1);
+}
+
+try {
+    // Program to load is the first argument.
+    var prog = arguments[0];
+    var script = ScriptingEnvironment.instance();
+
+    var debugServer = script.getServer("DebugServer.1");
+
+    // CCSTARGETCONFIG environment variable should point at proper .ccxml,
+    // customarily one of $HOME/ti/CCSTargetConfigurations/*.ccxml.
+    debugServer.setConfig(System.getenv("CCSTARGETCONFIG"));
+
+    var debugSession = debugServer.openSession("*", "*");
+
+    // Redirect GEL output to |prog|.gel file, so that it doesn't clobber
+    // standard output from the program...
+    // (an existing extension, i.e. everything from the last dot on, is
+    // replaced with ".gel"; a name without dot gets ".gel" appended)
+    var dot = prog.lastIndexOf(".");
+    var gel_out = prog + ".gel";
+    if (dot > 0) {
+        gel_out = prog.substr(0,dot) + ".gel";
+    }
+    debugSession.expression.evaluate('GEL_EnableFileOutput("'
+                                      + gel_out + '", 0, 0)');
+
+    debugSession.target.connect();
+
+    // It should be noted that "current working directory" for program
+    // executed on the target system is one where |prog| resides, and
+    // not where script executed [as one would expect]...
+    // The complete argument array, |prog| included, is handed over.
+    debugSession.memory.loadProgram(prog, arguments);
+
+    // Pull exit()'s address and set breakpoint, then just execute till
+    // it's reached...
+    var exitAddr = debugSession.symbol.getAddress("exit");
+    debugSession.breakpoint.add(exitAddr);
+
+    // Resume for as long as the target stops anywhere but at exit().
+    while (1) {
+        debugSession.target.run();
+
+        var PC = debugSession.expression.evaluate("PC");
+        if (PC == exitAddr) {
+            break;
+        }
+    }
+
+    // Snatch value passed to exit(), so that it can be passed down to
+    // shell as exit code from this script...
+    // NOTE(review): presumably A4 is the register carrying the first
+    // function argument on C6x - confirm against the calling convention.
+    var exitCode = debugSession.expression.evaluate("A4");
+
+    // Last run to termination...
+    debugSession.target.run();
+    // Clean up...
+    debugSession.terminate();
+    debugServer.stop();
+
+    // It should be noted that there is kind of a bug in C6x run-time.
+    // Return value from main() is not passed to last implicit exit()
+    // call [as it would on other systems], but instead constant 1 is
+    // passed, which conventionally indicates an error. So that if one
+    // wants to pass specific exit code, or even 0 indicating "success",
+    // one has to call exit() explicitly instead of relying on value
+    // returned by main()...
+    quit(exitCode);
+
+} catch (e) {
+    // We catch everything, because default handler terminates script with
+    // "success" exit code upon exception...
+    System.err.println(e.rhinoException);
+    quit(139);
+}
--- a/93
+++ b/93
@ -134,6 +134,10 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
 	echo "${MACHINE}-dg-dgux"; exit 0
 	;;

+    ecos:*)
+	echo "${MACHINE}-whatever-ecos"; exit 0
+	;;
+
    HI-UX:*)
 	echo "${MACHINE}-hi-hiux"; exit 0
 	;;
@ -162,6 +166,14 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
 	echo "mips4-sgi-irix64"; exit 0
 	;;

+    Linux:*:cross:i686)
+	echo "${MACHINE}-cross-linux"; exit 0
+	;;
+
+    Linux:[2-9].*:cross:x86_64)
+	echo "${MACHINE}-cross-linux"; exit 0
+	;;
+
    Linux:[2-9].*)
 	echo "${MACHINE}-whatever-linux2"; exit 0
 	;;
@ -219,7 +231,11 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
 	;;

    NetBSD:*:*:*386*)
-        echo "`(/usr/sbin/sysctl -n hw.model || /sbin/sysctl -n hw.model) | sed 's,.*\(.\)86-class.*,i\186,'`-whatever-netbsd"; exit 0
+	# Query the build host's CPU model only for native builds; when
+	# cross-compiling rely on ${MACHINE}. The expansion is quoted so
+	# that the test gets exactly one operand whatever the value is.
+	if [ -z "${CROSS_COMPILE}" ]; then
+           echo "`(/usr/sbin/sysctl -n hw.model || /sbin/sysctl -n hw.model) | sed 's,.*\(.\)86-class.*,i\186,'`-whatever-netbsd"; exit 0
+        else
+           echo "${MACHINE}-whatever-netbsd"; exit 0
+	fi
 	;;

    NetBSD:*)
@ -371,6 +387,10 @@ case "${SYSTEM}:${RELEASE}:${VERSION}:${MACHINE}" in
       echo "nsr-tandem-nsk"; exit 0;
       ;;

+    vxworks:kernel*)
+       echo "${MACHINE}-kernel-vxworks"; exit 0;
+       ;;
+
    vxworks*)
       echo "${MACHINE}-whatever-vxworks"; exit 0;
       ;;
@ -535,10 +555,14 @@ case "$GUESSOS" in
        #fi
 	OUT="irix-mips3-$CC"
 	;;
+  mips32be-*-linux2)
+	OUT=linux-mips32be
+	options="$options threads shared zlib-dynamic"
+	;;
  ppc-apple-rhapsody) OUT="rhapsody-ppc-cc" ;;
  ppc-apple-darwin*)
 	ISA64=`(sysctl -n hw.optional.64bitops) 2>/dev/null`
-	if [ "$ISA64" = "1" ]; then
+	if [ "$ISA64" = "1" -a -z "$KERNEL_BITS" ]; then
 	    echo "WARNING! If you wish to build 64-bit library, then you have to"
 	    echo "         invoke './Configure darwin64-ppc-cc' *manually*."
 	    if [ "$TEST" = "false" -a -t 1 ]; then
@ -546,10 +570,14 @@ case "$GUESSOS" in
 	      (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
 	    fi
 	fi
-	OUT="darwin-ppc-cc" ;;
+	if [ "$ISA64" = "1" -a "$KERNEL_BITS" = "64" ]; then
+	    OUT="darwin64-ppc-cc"
+	else
+	    OUT="darwin-ppc-cc"
+	fi ;;
  i?86-apple-darwin*)
 	ISA64=`(sysctl -n hw.optional.x86_64) 2>/dev/null`
-	if [ "$ISA64" = "1" ]; then
+	if [ "$ISA64" = "1" -a -z "$KERNEL_BITS" ]; then
 	    echo "WARNING! If you wish to build 64-bit library, then you have to"
 	    echo "         invoke './Configure darwin64-x86_64-cc' *manually*."
 	    if [ "$TEST" = "false" -a -t 1 ]; then
@ -557,7 +585,21 @@ case "$GUESSOS" in
 	      (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
 	    fi
 	fi
-	OUT="darwin-i386-cc" ;;
+	if [ "$ISA64" = "1" -a "$KERNEL_BITS" = "64" ]; then
+	    OUT="darwin64-x86_64-cc"
+	else
+	    OUT="darwin-i386-cc"
+	fi ;;
+  armv6+7-*-iphoneos)
+	options="$options -arch%20armv6 -arch%20armv7"
+	OUT="iphoneos-cross" ;;
+  *-*-iphoneos)
+	options="$options -arch%20${MACHINE}"
+	OUT="iphoneos-cross" ;;
+  armv7-*-ios)
+	OUT="ios-cross" ;;
+  arm64-*-ios*)
+	OUT="ios64-cross" ;;
  alpha-*-linux2)
        ISA=`awk '/cpu model/{print$4;exit(0);}' /proc/cpuinfo`
 	case ${ISA:-generic} in
@ -583,6 +625,7 @@ case "$GUESSOS" in
 	;;
  ppc-*-linux2) OUT="linux-ppc" ;;
  ppc60x-*-vxworks*) OUT="vxworks-ppc60x" ;;
+  ppcgen-kernel-vxworks*) OUT="vxworks-ppcgen-kernel" ;;
  ppcgen-*-vxworks*) OUT="vxworks-ppcgen" ;;
  pentium-*-vxworks*) OUT="vxworks-pentium" ;;
  simlinux-*-vxworks*) OUT="vxworks-simlinux" ;;
@ -627,6 +670,7 @@ case "$GUESSOS" in

 	options="$options -DB_ENDIAN -mschedule=$CPUSCHEDULE -march=$CPUARCH"
 	OUT="linux-generic32" ;;
+  armv[45]*-*-ecos) OUT="ecos-armv4" ;;
  armv[1-3]*-*-linux2) OUT="linux-generic32" ;;
  armv[7-9]*-*-linux2) OUT="linux-armv4"; options="$options -march=armv7-a" ;;
  arm*-*-linux2) OUT="linux-armv4" ;;
@ -661,10 +705,12 @@ case "$GUESSOS" in
        fi ;;
  *-*-linux1) OUT="linux-aout" ;;
  *-*-linux2) OUT="linux-generic32" ;;
+  i686-cross-linux) OUT="linux-i686-cross" ;;
+  *-cross-linux) OUT="linux-x86_64-cross" ;;
  sun4[uv]*-*-solaris2)
 	OUT="solaris-sparcv9-$CC"
 	ISA64=`(isalist) 2>/dev/null | grep sparcv9`
-	if [ "$ISA64" != "" ]; then
+	if [ "$ISA64" != "" -a "$KERNEL_BITS" = "" ]; then
 	    if [ "$CC" = "cc" -a $CCVER -ge 50 ]; then
 		echo "WARNING! If you wish to build 64-bit library, then you have to"
 		echo "         invoke './Configure solaris64-sparcv9-cc' *manually*."
@ -694,13 +740,16 @@ case "$GUESSOS" in
 		fi
 	    fi
 	fi
+	if [ "$ISA64" != "" -a "$KERNEL_BITS" = "64" ]; then
+	    OUT="solaris64-sparcv9-$CC"
+	fi
 	;;
  sun4m-*-solaris2)	OUT="solaris-sparcv8-$CC" ;;
  sun4d-*-solaris2)	OUT="solaris-sparcv8-$CC" ;;
  sun4*-*-solaris2)	OUT="solaris-sparcv7-$CC" ;;
  *86*-*-solaris2)
 	ISA64=`(isalist) 2>/dev/null | grep amd64`
-	if [ "$ISA64" != "" ]; then
+	if [ "$ISA64" != "" -a ${KERNEL_BITS:-64} -eq 64 ]; then
 	    OUT="solaris64-x86_64-$CC"
 	else
 	    OUT="solaris-x86-$CC"
@ -717,17 +766,23 @@ case "$GUESSOS" in
  sparc64-*-*bsd*)	OUT="BSD-sparc64" ;;
  ia64-*-*bsd*)		OUT="BSD-ia64" ;;
  amd64-*-*bsd*)	OUT="BSD-x86_64" ;;
-  *86*-*-*bsd*)		# mimic ld behaviour when it's looking for libc...
-			if [ -L /usr/lib/libc.so ]; then	# [Free|Net]BSD
-			    libc=/usr/lib/libc.so
-			else					# OpenBSD
-			    # ld searches for highest libc.so.* and so do we
-			    libc=`(ls /usr/lib/libc.so.* | tail -1) 2>/dev/null`
-			fi
-			case "`(file -L $libc) 2>/dev/null`" in
-			*ELF*)	OUT="BSD-x86-elf" ;;
-			*)	OUT="BSD-x86"; options="$options no-sse2" ;;
-			esac ;;
+  *86*-*-*bsd*)		# The build host's libc is inspected only for native
+			# builds; cross builds get BSD-x86-elf. The expansion
+			# is quoted so that the test gets exactly one operand.
+			if [ -z "${CROSS_COMPILE}" ]; then
+			   # mimic ld behaviour when it's looking for libc...
+			   if [ -L /usr/lib/libc.so ]; then	# [Free|Net]BSD
+			       libc=/usr/lib/libc.so
+			   else					# OpenBSD
+			       # ld searches for highest libc.so.* and so do we
+			       libc=`(ls /usr/lib/libc.so.* | tail -1) 2>/dev/null`
+			   fi
+			   echo "libc = $libc"
+			   case "`(file -L $libc) 2>/dev/null`" in
+			   *ELF*)	OUT="BSD-x86-elf" ;;
+			   *)	OUT="BSD-x86"; options="$options no-sse2" ;;
+			   esac
+			else
+			   OUT="BSD-x86-elf"
+			fi;;
+  ppc85xx-*-*bsd*)      OUT="BSD-ppc85xx" ;;  # MPC85XX has no hardware FP accelerator
  *-*-*bsd*)		OUT="BSD-generic32" ;;

  *-*-osf)		OUT="osf1-alpha-cc" ;;
@ -821,10 +876,12 @@ case "$GUESSOS" in
  j90-cray-unicos) OUT="cray-j90" ;;
  nsr-tandem-nsk) OUT="tandem-c89" ;;
  beos-*) OUT="$GUESSOS" ;;
+  armv4-*-qnx6) OUT="QNX6-armv4" ;;
  x86pc-*-qnx6) OUT="QNX6-i386" ;;
  *-*-qnx6) OUT="QNX6" ;;
  x86-*-android|i?86-*-android) OUT="android-x86" ;;
  armv[7-9]*-*-android) OUT="android-armv7" ;;
+  aarch64-*-android) OUT="android64-aarch64" ;;
  *) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;;
 esac

--- a/crypto/Makefile
+++ b/crypto/Makefile
@ -87,6 +87,7 @@ ppccpuid.s:	ppccpuid.pl;	$(PERL) ppccpuid.pl $(PERLASM_SCHEME) $@
 pariscid.s:	pariscid.pl;	$(PERL) pariscid.pl $(PERLASM_SCHEME) $@
 alphacpuid.s:	alphacpuid.pl
 	$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
+arm64cpuid.S:	arm64cpuid.pl;	$(PERL) arm64cpuid.pl $(PERLASM_SCHEME) > $@

 subdirs:
 	@target=all; $(RECURSIVE_MAKE)
--- a/crypto/aes/Makefile
+++ b/crypto/aes/Makefile
@ -71,6 +71,8 @@ aes-sparcv9.s: asm/aes-sparcv9.pl

 aes-ppc.s:	asm/aes-ppc.pl
 	$(PERL) asm/aes-ppc.pl $(PERLASM_SCHEME) $@
+aesp8-ppc.s:	asm/aesp8-ppc.pl
+	$(PERL) asm/aesp8-ppc.pl $(PERLASM_SCHEME) $@

 aes-parisc.s:	asm/aes-parisc.pl
 	$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
@ -78,6 +80,10 @@ aes-parisc.s:	asm/aes-parisc.pl
 aes-mips.S:	asm/aes-mips.pl
 	$(PERL) asm/aes-mips.pl $(PERLASM_SCHEME) $@

+aesv8-armx.S:	asm/aesv8-armx.pl
+	$(PERL) asm/aesv8-armx.pl $(PERLASM_SCHEME) $@
+aesv8-armx.o:	aesv8-armx.S
+
 # GNU make "catch all"
 aes-%.S:	asm/aes-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 aes-armv4.o:	aes-armv4.S
--- a/crypto/aes/asm/aes-armv4.pl
+++ b/crypto/aes/asm/aes-armv4.pl
@ -32,8 +32,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 16%
 # improvement on Cortex A8 core and ~21.5 cycles per byte.

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+# First argument is either the flavour or, if it looks like a file name,
+# the output file itself; in the former case the output file is the first
+# later argument that looks like a file name.
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    # pipe the output through arm-xlate.pl, looked up next to this script
+    # or in ../../perlasm
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    # quote the path for the shell and don't carry on without translator
+    open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
+	or die "can't call $xlate: $!";
+} else {
+    open STDOUT,">$output";
+}

 $s0="r0";
 $s1="r1";
@ -171,7 +183,12 @@ AES_encrypt:
 	stmdb   sp!,{r1,r4-r12,lr}
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
+#ifdef	__APPLE__
+	mov	$tbl,#AES_encrypt-AES_Te
+	sub	$tbl,r3,$tbl			@ Te
+#else
 	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
+#endif
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
@ -425,7 +442,12 @@ AES_set_encrypt_key:
 	bne	.Labrt

 .Lok:	stmdb   sp!,{r4-r12,lr}
+#ifdef	__APPLE__
+	mov	$tbl,#AES_set_encrypt_key-AES_Te-1024
+	sub	$tbl,r3,$tbl					@ Te4
+#else
 	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
+#endif

 	mov	$rounds,r0		@ inp
 	mov	lr,r1			@ bits
@ -886,7 +908,12 @@ AES_decrypt:
 	stmdb   sp!,{r1,r4-r12,lr}
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
+#ifdef	__APPLE__
+	mov	$tbl,#AES_decrypt-AES_Td
+	sub	$tbl,r3,$tbl				@ Td
+#else
 	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td
+#endif
 #if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
--- a/crypto/aes/asm/aes-c64x.pl
+++ b/crypto/aes/asm/aes-c64x.pl
--- a/crypto/aes/asm/aes-c64xplus.pl
+++ b/crypto/aes/asm/aes-c64xplus.pl
--- a/crypto/aes/asm/aes-mips.pl
+++ b/crypto/aes/asm/aes-mips.pl
@ -47,7 +47,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@ -70,7 +70,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
 #
 ######################################################################

-$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;

 for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
 open STDOUT,">$output";
@ -89,7 +89,7 @@ $code.=<<___;
 # include <openssl/fipssyms.h>
 #endif

-#if !defined(__vxworks) || defined(__pic__)
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
 .option	pic2
 #endif
 .set	noat
--- a/crypto/aes/asm/aes-ppc.pl
+++ b/crypto/aes/asm/aes-ppc.pl
@ -548,7 +548,7 @@ Lenc_loop:
 	xor	$s2,$t2,$acc14
 	xor	$s3,$t3,$acc15
 	addi	$key,$key,16
-	bdnz-	Lenc_loop
+	bdnz	Lenc_loop

 	addi	$Tbl2,$Tbl0,2048
 	nop
@ -982,7 +982,7 @@ Ldec_loop:
 	xor	$s2,$t2,$acc14
 	xor	$s3,$t3,$acc15
 	addi	$key,$key,16
-	bdnz-	Ldec_loop
+	bdnz	Ldec_loop

 	addi	$Tbl2,$Tbl0,2048
 	nop
--- a/crypto/aes/asm/aesp8-ppc.pl
+++ b/crypto/aes/asm/aesp8-ppc.pl
--- a/crypto/aes/asm/aesv8-armx.pl
+++ b/crypto/aes/asm/aesv8-armx.pl
@ -0,0 +1,968 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for ARMv8 AES instructions. The
+# module is endian-agnostic in the sense that it supports both big- and
+# little-endian cases. It also supports both 32- and 64-bit modes
+# of operation. The latter is achieved by limiting the number of utilized
+# registers to 16, which implies additional NEON load and integer
+# instructions. This has no effect on mighty Apple A7, where results
+# are literally equal to the theoretical estimates based on AES
+# instruction latencies and issue rates. On Cortex-A53, an in-order
+# execution core, this costs up to 10-15%, which is partially
+# compensated by implementing a dedicated code path for the 128-bit
+# CBC encrypt case. On Cortex-A57 parallelizable mode performance
+# seems to be limited by the sheer amount of NEON instructions...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+#		CBC enc		CBC dec		CTR
+# Apple A7	2.39		1.20		1.20
+# Cortex-A53	2.45		1.87		1.94
+# Cortex-A57	3.64		1.34		1.32
+
+# Arguments: flavour, then output file name.
+$flavour = shift;
+$output  = shift;
+
+# arm-xlate.pl is looked up next to this script or in ../../perlasm
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Everything printed to STDOUT is piped through arm-xlate.pl. The path is
+# quoted for the shell, and failure to start the pipe is fatal.
+open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
+# NEON is mostly 32-bit mnemonics, integer - mostly 64. The goal is to
+# maintain both 32- and 64-bit code within a single module and
+# transliterate common code to either flavour with regex voodoo.
+#
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align	5
+.Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.globl	${prefix}_set_encrypt_key
+.type	${prefix}_set_encrypt_key,%function
+.align	5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+___
+$code.=<<___;
+	mov	$ptr,#-1
+	cmp	$inp,#0
+	b.eq	.Lenc_key_abort
+	cmp	$out,#0
+	b.eq	.Lenc_key_abort
+	mov	$ptr,#-2
+	cmp	$bits,#128
+	b.lt	.Lenc_key_abort
+	cmp	$bits,#256
+	b.gt	.Lenc_key_abort
+	tst	$bits,#0x3f
+	b.ne	.Lenc_key_abort
+
+	adr	$ptr,.Lrcon
+	cmp	$bits,#192
+
+	veor	$zero,$zero,$zero
+	vld1.8	{$in0},[$inp],#16
+	mov	$bits,#8		// reuse $bits
+	vld1.32	{$rcon,$mask},[$ptr],#32
+
+	b.lt	.Loop128
+	b.eq	.L192
+	b	.L256
+
+.align	4
+.Loop128:
+	vtbl.8	$key,{$in0},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in0},[$out],#16
+	aese	$key,$zero
+	subs	$bits,$bits,#1
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in0,$in0,$key
+	b.ne	.Loop128
+
+	vld1.32	{$rcon},[$ptr]
+
+	vtbl.8	$key,{$in0},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in0},[$out],#16
+	aese	$key,$zero
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in0,$in0,$key
+
+	vtbl.8	$key,{$in0},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in0},[$out],#16
+	aese	$key,$zero
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	veor	$in0,$in0,$key
+	vst1.32	{$in0},[$out]
+	add	$out,$out,#0x50
+
+	mov	$rounds,#10
+	b	.Ldone
+
+.align	4
+.L192:
+	vld1.8	{$in1},[$inp],#8
+	vmov.i8	$key,#8			// borrow $key
+	vst1.32	{$in0},[$out],#16
+	vsub.i8	$mask,$mask,$key	// adjust the mask
+
+.Loop192:
+	vtbl.8	$key,{$in1},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in1},[$out],#8
+	aese	$key,$zero
+	subs	$bits,$bits,#1
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+
+	vdup.32	$tmp,${in0}[3]
+	veor	$tmp,$tmp,$in1
+	 veor	$key,$key,$rcon
+	vext.8	$in1,$zero,$in1,#12
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in1,$in1,$tmp
+	veor	$in0,$in0,$key
+	veor	$in1,$in1,$key
+	vst1.32	{$in0},[$out],#16
+	b.ne	.Loop192
+
+	mov	$rounds,#12
+	add	$out,$out,#0x20
+	b	.Ldone
+
+.align	4
+.L256:
+	vld1.8	{$in1},[$inp]
+	mov	$bits,#7
+	mov	$rounds,#14
+	vst1.32	{$in0},[$out],#16
+
+.Loop256:
+	vtbl.8	$key,{$in1},$mask
+	vext.8	$tmp,$zero,$in0,#12
+	vst1.32	{$in1},[$out],#16
+	aese	$key,$zero
+	subs	$bits,$bits,#1
+
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in0,$in0,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	 veor	$key,$key,$rcon
+	veor	$in0,$in0,$tmp
+	vshl.u8	$rcon,$rcon,#1
+	veor	$in0,$in0,$key
+	vst1.32	{$in0},[$out],#16
+	b.eq	.Ldone
+
+	vdup.32	$key,${in0}[3]		// just splat
+	vext.8	$tmp,$zero,$in1,#12
+	aese	$key,$zero
+
+	veor	$in1,$in1,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in1,$in1,$tmp
+	vext.8	$tmp,$zero,$tmp,#12
+	veor	$in1,$in1,$tmp
+
+	veor	$in1,$in1,$key
+	b	.Loop256
+
+.Ldone:
+	str	$rounds,[$out]
+	mov	$ptr,#0
+
+.Lenc_key_abort:
+	mov	x0,$ptr			// return value
+	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
+	ret
+.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl	${prefix}_set_decrypt_key
+.type	${prefix}_set_decrypt_key,%function
+.align	5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+___
+$code.=<<___	if ($flavour !~ /64/);
+	stmdb	sp!,{r4,lr}
+___
+$code.=<<___;
+	bl	.Lenc_key
+
+	cmp	x0,#0
+	b.ne	.Ldec_key_abort
+
+	sub	$out,$out,#240		// restore original $out
+	mov	x4,#-16
+	add	$inp,$out,x12,lsl#4	// end of key schedule
+
+	vld1.32	{v0.16b},[$out]
+	vld1.32	{v1.16b},[$inp]
+	vst1.32	{v0.16b},[$inp],x4
+	vst1.32	{v1.16b},[$out],#16
+
+.Loop_imc:
+	vld1.32	{v0.16b},[$out]
+	vld1.32	{v1.16b},[$inp]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	vst1.32	{v0.16b},[$inp],x4
+	vst1.32	{v1.16b},[$out],#16
+	cmp	$inp,$out
+	b.hi	.Loop_imc
+
+	vld1.32	{v0.16b},[$out]
+	aesimc	v0.16b,v0.16b
+	vst1.32	{v0.16b},[$inp]
+
+	eor	x0,x0,x0		// return value
+.Ldec_key_abort:
+___
+$code.=<<___	if ($flavour !~ /64/);
+	ldmia	sp!,{r4,pc}
+___
+$code.=<<___	if ($flavour =~ /64/);
+	ldp	x29,x30,[sp],#16
+	ret
+___
+$code.=<<___;
+.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+{{{
+# gen_block($dir): append the single-block routine ${prefix}_${dir}crypt to
+# $code. $dir is "en" or "de" and selects the aese/aesmc or aesd/aesimc
+# instruction pair. No prototype is declared, because the routine does
+# take an argument.
+sub gen_block {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));
+
+$code.=<<___;
+.globl	${prefix}_${dir}crypt
+.type	${prefix}_${dir}crypt,%function
+.align	5
+${prefix}_${dir}crypt:
+	ldr	$rounds,[$key,#240]
+	vld1.32	{$rndkey0},[$key],#16
+	vld1.8	{$inout},[$inp]
+	sub	$rounds,$rounds,#2
+	vld1.32	{$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+	aes$e	$inout,$rndkey0
+	vld1.32	{$rndkey0},[$key],#16
+	aes$mc	$inout,$inout
+	subs	$rounds,$rounds,#2
+	aes$e	$inout,$rndkey1
+	vld1.32	{$rndkey1},[$key],#16
+	aes$mc	$inout,$inout
+	b.gt	.Loop_${dir}c
+
+	aes$e	$inout,$rndkey0
+	vld1.32	{$rndkey0},[$key]
+	aes$mc	$inout,$inout
+	aes$e	$inout,$rndkey1
+	veor	$inout,$inout,$rndkey0
+
+	vst1.8	{$inout},[$out]
+	ret
+.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q8-q15	preloaded key schedule
+
+$code.=<<___;
+.globl	${prefix}_cbc_encrypt
+.type	${prefix}_cbc_encrypt,%function
+.align	5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___	if ($flavour =~ /64/);
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+___
+$code.=<<___	if ($flavour !~ /64/);
+	mov	ip,sp
+	stmdb	sp!,{r4-r8,lr}
+	vstmdb	sp!,{d8-d15}            @ ABI specification says so
+	ldmia	ip,{r4-r5}		@ load remaining args
+___
+$code.=<<___;
+	subs	$len,$len,#16
+	mov	$step,#16
+	b.lo	.Lcbc_abort
+	cclr	$step,eq
+
+	cmp	$enc,#0			// en- or decrypting?
+	ldr	$rounds,[$key,#240]
+	and	$len,$len,#-16
+	vld1.8	{$ivec},[$ivp]
+	vld1.8	{$dat},[$inp],$step
+
+	vld1.32	{q8-q9},[$key]		// load key schedule...
+	sub	$rounds,$rounds,#6
+	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
+	sub	$rounds,$rounds,#2
+	vld1.32	{q10-q11},[$key_],#32
+	vld1.32	{q12-q13},[$key_],#32
+	vld1.32	{q14-q15},[$key_],#32
+	vld1.32	{$rndlast},[$key_]
+
+	add	$key_,$key,#32
+	mov	$cnt,$rounds
+	b.eq	.Lcbc_dec
+
+	cmp	$rounds,#2
+	veor	$dat,$dat,$ivec
+	veor	$rndzero_n_last,q8,$rndlast
+	b.eq	.Lcbc_enc128
+
+.Loop_cbc_enc:
+	aese	$dat,q8
+	vld1.32	{q8},[$key_],#16
+	aesmc	$dat,$dat
+	subs	$cnt,$cnt,#2
+	aese	$dat,q9
+	vld1.32	{q9},[$key_],#16
+	aesmc	$dat,$dat
+	b.gt	.Loop_cbc_enc
+
+	aese	$dat,q8
+	aesmc	$dat,$dat
+	 subs	$len,$len,#16
+	aese	$dat,q9
+	aesmc	$dat,$dat
+	 cclr	$step,eq
+	aese	$dat,q10
+	aesmc	$dat,$dat
+	 add	$key_,$key,#16
+	aese	$dat,q11
+	aesmc	$dat,$dat
+	 vld1.8	{q8},[$inp],$step
+	aese	$dat,q12
+	aesmc	$dat,$dat
+	 veor	q8,q8,$rndzero_n_last
+	aese	$dat,q13
+	aesmc	$dat,$dat
+	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
+	aese	$dat,q14
+	aesmc	$dat,$dat
+	aese	$dat,q15
+
+	 mov	$cnt,$rounds
+	veor	$ivec,$dat,$rndlast
+	vst1.8	{$ivec},[$out],#16
+	b.hs	.Loop_cbc_enc
+
+	b	.Lcbc_done
+
+.align	5
+.Lcbc_enc128:
+	vld1.32	{$in0-$in1},[$key_]
+	aese	$dat,q8
+	aesmc	$dat,$dat
+	b	.Lenter_cbc_enc128
+.Loop_cbc_enc128:
+	aese	$dat,q8
+	aesmc	$dat,$dat
+	 vst1.8	{$ivec},[$out],#16
+.Lenter_cbc_enc128:
+	aese	$dat,q9
+	aesmc	$dat,$dat
+	 subs	$len,$len,#16
+	aese	$dat,$in0
+	aesmc	$dat,$dat
+	 cclr	$step,eq
+	aese	$dat,$in1
+	aesmc	$dat,$dat
+	aese	$dat,q10
+	aesmc	$dat,$dat
+	aese	$dat,q11
+	aesmc	$dat,$dat
+	 vld1.8	{q8},[$inp],$step
+	aese	$dat,q12
+	aesmc	$dat,$dat
+	aese	$dat,q13
+	aesmc	$dat,$dat
+	aese	$dat,q14
+	aesmc	$dat,$dat
+	 veor	q8,q8,$rndzero_n_last
+	aese	$dat,q15
+	veor	$ivec,$dat,$rndlast
+	b.hs	.Loop_cbc_enc128
+
+	vst1.8	{$ivec},[$out],#16
+	b	.Lcbc_done
+___
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+$code.=<<___;
+.align	5
+.Lcbc_dec:
+	vld1.8	{$dat2},[$inp],#16
+	subs	$len,$len,#32		// bias
+	add	$cnt,$rounds,#2
+	vorr	$in1,$dat,$dat
+	vorr	$dat1,$dat,$dat
+	vorr	$in2,$dat2,$dat2
+	b.lo	.Lcbc_dec_tail
+
+	vorr	$dat1,$dat2,$dat2
+	vld1.8	{$dat2},[$inp],#16
+	vorr	$in0,$dat,$dat
+	vorr	$in1,$dat1,$dat1
+	vorr	$in2,$dat2,$dat2
+
+.Loop3x_cbc_dec:
+	aesd	$dat0,q8
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	vld1.32	{q8},[$key_],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	subs	$cnt,$cnt,#2
+	aesd	$dat0,q9
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	vld1.32	{q9},[$key_],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	b.gt	.Loop3x_cbc_dec
+
+	aesd	$dat0,q8
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	 veor	$tmp0,$ivec,$rndlast
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 veor	$tmp1,$in0,$rndlast
+	aesd	$dat0,q9
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	 veor	$tmp2,$in1,$rndlast
+	 subs	$len,$len,#0x30
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 vorr	$ivec,$in2,$in2
+	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
+	aesd	$dat0,q12
+	aesd	$dat1,q12
+	aesd	$dat2,q12
+	 add	$inp,$inp,x6		// $inp is adjusted in such way that
+					// at exit from the loop $dat1-$dat2
+					// are loaded with last "words"
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 mov	$key_,$key
+	aesd	$dat0,q13
+	aesd	$dat1,q13
+	aesd	$dat2,q13
+	 vld1.8	{$in0},[$inp],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 vld1.8	{$in1},[$inp],#16
+	aesd	$dat0,q14
+	aesd	$dat1,q14
+	aesd	$dat2,q14
+	 vld1.8	{$in2},[$inp],#16
+	aesimc	$dat0,$dat0
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
+	aesd	$dat0,q15
+	aesd	$dat1,q15
+	aesd	$dat2,q15
+
+	 add	$cnt,$rounds,#2
+	veor	$tmp0,$tmp0,$dat0
+	veor	$tmp1,$tmp1,$dat1
+	veor	$dat2,$dat2,$tmp2
+	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
+	 vorr	$dat0,$in0,$in0
+	vst1.8	{$tmp0},[$out],#16
+	 vorr	$dat1,$in1,$in1
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$dat2},[$out],#16
+	 vorr	$dat2,$in2,$in2
+	b.hs	.Loop3x_cbc_dec
+
+	cmn	$len,#0x30
+	b.eq	.Lcbc_done
+	nop
+
+.Lcbc_dec_tail:
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	vld1.32	{q8},[$key_],#16
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	subs	$cnt,$cnt,#2
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	vld1.32	{q9},[$key_],#16
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	b.gt	.Lcbc_dec_tail
+
+	aesd	$dat1,q8
+	aesd	$dat2,q8
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	aesd	$dat1,q9
+	aesd	$dat2,q9
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	aesd	$dat1,q12
+	aesd	$dat2,q12
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 cmn	$len,#0x20
+	aesd	$dat1,q13
+	aesd	$dat2,q13
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 veor	$tmp1,$ivec,$rndlast
+	aesd	$dat1,q14
+	aesd	$dat2,q14
+	aesimc	$dat1,$dat1
+	aesimc	$dat2,$dat2
+	 veor	$tmp2,$in1,$rndlast
+	aesd	$dat1,q15
+	aesd	$dat2,q15
+	b.eq	.Lcbc_dec_one
+	veor	$tmp1,$tmp1,$dat1
+	veor	$tmp2,$tmp2,$dat2
+	 vorr	$ivec,$in2,$in2
+	vst1.8	{$tmp1},[$out],#16
+	vst1.8	{$tmp2},[$out],#16
+	b	.Lcbc_done
+
+.Lcbc_dec_one:
+	veor	$tmp1,$tmp1,$dat2
+	 vorr	$ivec,$in2,$in2
+	vst1.8	{$tmp1},[$out],#16
+
+.Lcbc_done:
+	vst1.8	{$ivec},[$ivp]
+.Lcbc_abort:
+___
+}
+# 32-bit epilogue: pop d8-d15, then r4-r8, returning by loading the saved
+# lr straight into pc.
+$code.=<<___	if ($flavour !~ /64/);
+	vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4-r8,pc}
+___
+# 64-bit epilogue: reload x29 and release the 16-byte frame; x30 is not
+# reloaded before ret.
+$code.=<<___	if ($flavour =~ /64/);
+	ldr	x29,[sp],#16
+	ret
+___
+# Record the size of the routine for the symbol table.
+$code.=<<___;
+.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+{{{
+# Register map for ${prefix}_ctr32_encrypt_blocks, again in the 64-bit
+# spelling.  $inp..$ivp are x0-x4.
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_)=("w5","w6","x7");
+# Counter word and three scratch copies of it: w8, w9, w10 and w12.
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my $step="x12";		# aliases with $tctr2
+
+# q0-q7 as in the CBC code; the third lane uses q10/q11/q9.
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15	preloaded key schedule
+
+# Symbol, type and alignment directives followed by the entry label.
+$code.=<<___;
+.globl	${prefix}_ctr32_encrypt_blocks
+.type	${prefix}_ctr32_encrypt_blocks,%function
+.align	5
+${prefix}_ctr32_encrypt_blocks:
+___
+# 64-bit prologue: push x29/x30 and point x29 at the new frame.
+$code.=<<___	if ($flavour =~ /64/);
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+___
+# 32-bit prologue: save r4-r10,lr and d8-d15, then load one further
+# argument from the caller's stack (address kept in ip) into r4.
+$code.=<<___	if ($flavour !~ /64/);
+	mov		ip,sp
+	stmdb		sp!,{r4-r10,lr}
+	vstmdb		sp!,{d8-d15}            @ ABI specification says so
+	ldr		r4, [ip]		@ load remaining arg
+___
+$code.=<<___;
+	ldr		$rounds,[$key,#240]
+
+	ldr		$ctr, [$ivp, #12]
+	vld1.32		{$dat0},[$ivp]
+
+	vld1.32		{q8-q9},[$key]		// load key schedule...
+	sub		$rounds,$rounds,#4
+	mov		$step,#16
+	cmp		$len,#2
+	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
+	sub		$rounds,$rounds,#2
+	vld1.32		{q12-q13},[$key_],#32
+	vld1.32		{q14-q15},[$key_],#32
+	vld1.32		{$rndlast},[$key_]
+	add		$key_,$key,#32
+	mov		$cnt,$rounds
+	cclr		$step,lo
+#ifndef __ARMEB__
+	rev		$ctr, $ctr
+#endif
+	vorr		$dat1,$dat0,$dat0
+	add		$tctr1, $ctr, #1
+	vorr		$dat2,$dat0,$dat0
+	add		$ctr, $ctr, #2
+	vorr		$ivec,$dat0,$dat0
+	rev		$tctr1, $tctr1
+	vmov.32		${dat1}[3],$tctr1
+	b.ls		.Lctr32_tail
+	rev		$tctr2, $ctr
+	sub		$len,$len,#3		// bias
+	vmov.32		${dat2}[3],$tctr2
+	b		.Loop3x_ctr32
+
+.align	4
+.Loop3x_ctr32:
+	aese		$dat0,q8
+	aese		$dat1,q8
+	aese		$dat2,q8
+	vld1.32		{q8},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aesmc		$dat2,$dat2
+	subs		$cnt,$cnt,#2
+	aese		$dat0,q9
+	aese		$dat1,q9
+	aese		$dat2,q9
+	vld1.32		{q9},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aesmc		$dat2,$dat2
+	b.gt		.Loop3x_ctr32
+
+	aese		$dat0,q8
+	aese		$dat1,q8
+	aese		$dat2,q8
+	 mov		$key_,$key
+	aesmc		$tmp0,$dat0
+	 vld1.8		{$in0},[$inp],#16
+	aesmc		$tmp1,$dat1
+	aesmc		$dat2,$dat2
+	 vorr		$dat0,$ivec,$ivec
+	aese		$tmp0,q9
+	 vld1.8		{$in1},[$inp],#16
+	aese		$tmp1,q9
+	aese		$dat2,q9
+	 vorr		$dat1,$ivec,$ivec
+	aesmc		$tmp0,$tmp0
+	 vld1.8		{$in2},[$inp],#16
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$dat2
+	 vorr		$dat2,$ivec,$ivec
+	 add		$tctr0,$ctr,#1
+	aese		$tmp0,q12
+	aese		$tmp1,q12
+	aese		$tmp2,q12
+	 veor		$in0,$in0,$rndlast
+	 add		$tctr1,$ctr,#2
+	aesmc		$tmp0,$tmp0
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$tmp2
+	 veor		$in1,$in1,$rndlast
+	 add		$ctr,$ctr,#3
+	aese		$tmp0,q13
+	aese		$tmp1,q13
+	aese		$tmp2,q13
+	 veor		$in2,$in2,$rndlast
+	 rev		$tctr0,$tctr0
+	aesmc		$tmp0,$tmp0
+	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$tmp2
+	 vmov.32	${dat0}[3], $tctr0
+	 rev		$tctr1,$tctr1
+	aese		$tmp0,q14
+	aese		$tmp1,q14
+	aese		$tmp2,q14
+	 vmov.32	${dat1}[3], $tctr1
+	 rev		$tctr2,$ctr
+	aesmc		$tmp0,$tmp0
+	aesmc		$tmp1,$tmp1
+	aesmc		$tmp2,$tmp2
+	 vmov.32	${dat2}[3], $tctr2
+	 subs		$len,$len,#3
+	aese		$tmp0,q15
+	aese		$tmp1,q15
+	aese		$tmp2,q15
+
+	 mov		$cnt,$rounds
+	veor		$in0,$in0,$tmp0
+	veor		$in1,$in1,$tmp1
+	veor		$in2,$in2,$tmp2
+	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
+	vst1.8		{$in0},[$out],#16
+	vst1.8		{$in1},[$out],#16
+	vst1.8		{$in2},[$out],#16
+	b.hs		.Loop3x_ctr32
+
+	adds		$len,$len,#3
+	b.eq		.Lctr32_done
+	cmp		$len,#1
+	mov		$step,#16
+	cclr		$step,eq
+
+.Lctr32_tail:
+	aese		$dat0,q8
+	aese		$dat1,q8
+	vld1.32		{q8},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	subs		$cnt,$cnt,#2
+	aese		$dat0,q9
+	aese		$dat1,q9
+	vld1.32		{q9},[$key_],#16
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	b.gt		.Lctr32_tail
+
+	aese		$dat0,q8
+	aese		$dat1,q8
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aese		$dat0,q9
+	aese		$dat1,q9
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	 vld1.8		{$in0},[$inp],$step
+	aese		$dat0,q12
+	aese		$dat1,q12
+	 vld1.8		{$in1},[$inp]
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aese		$dat0,q13
+	aese		$dat1,q13
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	aese		$dat0,q14
+	aese		$dat1,q14
+	 veor		$in0,$in0,$rndlast
+	aesmc		$dat0,$dat0
+	aesmc		$dat1,$dat1
+	 veor		$in1,$in1,$rndlast
+	aese		$dat0,q15
+	aese		$dat1,q15
+
+	cmp		$len,#1
+	veor		$in0,$in0,$dat0
+	veor		$in1,$in1,$dat1
+	vst1.8		{$in0},[$out],#16
+	b.eq		.Lctr32_done
+	vst1.8		{$in1},[$out]
+
+.Lctr32_done:
+___
+# 32-bit epilogue: pop d8-d15, then r4-r10, returning by loading the saved
+# lr straight into pc.
+$code.=<<___	if ($flavour !~ /64/);
+	vldmia		sp!,{d8-d15}
+	ldmia		sp!,{r4-r10,pc}
+___
+# 64-bit epilogue: reload x29 and release the 16-byte frame; x30 is not
+# reloaded before ret.
+$code.=<<___	if ($flavour =~ /64/);
+	ldr		x29,[sp],#16
+	ret
+___
+# Record the size of the routine for the symbol table.
+$code.=<<___;
+.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
+########################################
+if ($flavour =~ /64/) {			######## 64-bit code
+    # Base encodings of the four AES instructions with both register
+    # fields left at zero.
+    my %opcode = (
+	"aesd"	=>	0x4e285800,
+	"aese"	=>	0x4e284800,
+	"aesimc"=>	0x4e287800,
+	"aesmc"	=>	0x4e286800,
+    );
+
+    # unaes(MNEMONIC, OPERANDS): hand-assemble one AES instruction as a
+    # .inst directive, keeping the original text as a trailing comment.
+    # The first register number is OR-ed in at bit 0, the second at bit 5.
+    # Yields a false value when OPERANDS does not name two q/v registers.
+    local *unaes = sub {
+	my ($insn,$operands)=@_;
+
+	my $hit = $operands =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o;
+	return $hit unless $hit;
+
+	my $word = $opcode{$insn}|$1|($2<<5);
+	return sprintf(".inst\t0x%08x\t//%s %s", $word, $insn, $operands);
+    };
+
+    # Rewrite the accumulated listing line by line into AArch64 syntax
+    # and print it.
+    foreach(split("\n",$code)) {
+	# Replace `...` spans with the result of evaluating them as Perl.
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	# q0-q7 become v0-v7 and q8-q15 become v16-v23, all as .16b.
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;			# old->new style commentary
+
+	# The substitutions below are chained with "or", so at most one of
+	# them is applied to a given line.  The unaes() rewrite is disabled.
+	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
+	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
+	s/vext\.8/ext/o		or
+	s/vrev32\.8/rev32/o	or
+	s/vtst\.8/cmtst/o	or
+	s/vshr/ushr/o		or
+	s/^(\s+)v/$1/o		or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	# fix up remaining legacy suffixes: drop the element-size suffix
+	# from the mnemonic and adjust the .16b arrangement to match
+	s/\.[ui]?8//o;
+	m/\],#8/o and s/\.16b/\.8b/go;
+	s/\.[ui]?32//o and s/\.16b/\.4s/go;
+	s/\.[ui]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    # Base encodings of the four AES instructions with both register
+    # fields left at zero.
+    my %opcode = (
+	"aesd"	=>	0xf3b00340,
+	"aese"	=>	0xf3b00300,
+	"aesimc"=>	0xf3b003c0,
+	"aesmc"	=>	0xf3b00380,
+    );
+
+    # unaes(MNEMONIC, OPERANDS): hand-assemble one AES instruction.  The
+    # 32-bit word is emitted as four .byte values, least significant byte
+    # first, rather than with .inst, which older assemblers lack.  The
+    # original text is kept as a trailing comment.  Yields a false value
+    # when OPERANDS does not name two q/v registers.
+    local *unaes = sub {
+	my ($insn,$operands)=@_;
+
+	my $hit = $operands =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o;
+	return $hit unless $hit;
+
+	my ($rd,$rm) = ($1,$2);
+	my $word = $opcode{$insn};
+	$word |= ($rd&7)<<13;		# low three bits of first register
+	$word |= ($rd&8)<<19;		# its fourth bit
+	$word |= ($rm&7)<<1;		# low three bits of second register
+	$word |= ($rm&8)<<2;		# its fourth bit
+
+	my @bytes = map { ($word>>$_)&0xff } (0,8,16,24);
+	return sprintf(".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			@bytes,$insn,$operands);
+    };
+
+    # Split a "qD,{qT},qI" table lookup into two lookups on D registers:
+    # one for the low half (d2D from d2I) and one for the high half
+    # (d2D+1 from d2I+1).  Yields a false value if the operands do not
+    # have that shape.
+    sub unvtbl {
+	my ($operands) = @_;
+
+	my $hit = $operands =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o;
+	return $hit unless $hit;
+
+	my ($dst,$tbl,$idx) = ($1,$2,$3);
+	my @halves = map {
+	    sprintf "vtbl.8	d%d,{q%d},d%d", 2*$dst+$_,$tbl,2*$idx+$_
+	} (0,1);
+	return join("\n\t",@halves);
+    }
+
+    # Rewrite "qD,qS[lane]" so the source lane is named through the D
+    # register that contains it: d(2S) for lanes 0-1, d(2S+1) for lanes
+    # 2-3, with the lane index reduced to 0 or 1.  Yields a false value
+    # if the operands do not have that shape.
+    sub unvdup32 {
+	my ($operands) = @_;
+
+	my $hit = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
+	return $hit unless $hit;
+
+	my ($dst,$src,$lane) = ($1,$2,$3);
+	my $dreg = 2*$src+($lane>>1);
+	return sprintf("vdup.32	q%d,d%d[%d]",$dst,$dreg,$lane&1);
+    }
+
+    # Rewrite "qD[lane],SRC" so the destination lane is named through the
+    # D register that contains it: d(2D) for lanes 0-1, d(2D+1) for lanes
+    # 2-3, with the lane index reduced to 0 or 1.  SRC is copied through
+    # unchanged.  Yields a false value if the operands do not have that
+    # shape.
+    sub unvmov32 {
+	my ($operands) = @_;
+
+	my $hit = $operands =~ m/q([0-9]+)\[([0-3])\],(.*)/o;
+	return $hit unless $hit;
+
+	my ($dst,$lane,$src) = ($1,$2,$3);
+	my $dreg = 2*$dst+($lane>>1);
+	return sprintf("vmov.32	d%d[%d],%s",$dreg,$lane&1,$src);
+    }
+
+    # Rewrite the accumulated listing line by line into 32-bit ARM syntax
+    # and print it.
+    foreach(split("\n",$code)) {
+	# Replace `...` spans with the result of evaluating them as Perl.
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	# NOTE(review): the pattern below only matches single-digit vN.
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+	s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remaining new-style suffixes: post-increment written as
+	# "],#imm" becomes "]!"; with #8 the q register is first replaced
+	# by its low d register
+	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
+	s/\],#[0-9]+/]!/o;
+
+	# The substitutions below are chained with "or", so at most one of
+	# them is applied to a given line.
+	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
+	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
+	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
+	s/^(\s+)b\./$1b/o				or
+	s/^(\s+)mov\./$1mov/o				or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+	print $_,"\n";
+    }
+}
+
+# Buffered write errors only surface when the handle is closed, so a failed
+# close must abort the build instead of leaving truncated output behind.
+close STDOUT or die "error closing STDOUT: $!";
--- a/crypto/arm64cpuid.pl
+++ b/crypto/arm64cpuid.pl
@ -0,0 +1,68 @@
+#!/usr/bin/env perl
+
+# Emits the AArch64 capability-probe routines, piping them through
+# arm-xlate.pl.  Arguments: <flavour> <output>, both handed on to the
+# translator unchanged.
+
+$flavour = shift;
+$output  = shift;
+
+# Look for the translator next to this script, then under perlasm/.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Everything printed below goes through the translator; stop at once if it
+# cannot be started.
+open OUT,"| \"$^X\" $xlate $flavour $output"
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Each *_probe routine executes one instruction of the feature it is named
+# after and returns.  _armv7_tick returns a counter register in x0:
+# CNTPCT_EL0 under __APPLE__, CNTVCT_EL0 otherwise.
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+.arch	armv8-a+crypto
+
+.align	5
+.globl	_armv7_neon_probe
+.type	_armv7_neon_probe,%function
+_armv7_neon_probe:
+	orr	v15.16b, v15.16b, v15.16b
+	ret
+.size	_armv7_neon_probe,.-_armv7_neon_probe
+
+.globl	_armv7_tick
+.type	_armv7_tick,%function
+_armv7_tick:
+#ifdef	__APPLE__
+	mrs	x0, CNTPCT_EL0
+#else
+	mrs	x0, CNTVCT_EL0
+#endif
+	ret
+.size	_armv7_tick,.-_armv7_tick
+
+.globl	_armv8_aes_probe
+.type	_armv8_aes_probe,%function
+_armv8_aes_probe:
+	aese	v0.16b, v0.16b
+	ret
+.size	_armv8_aes_probe,.-_armv8_aes_probe
+
+.globl	_armv8_sha1_probe
+.type	_armv8_sha1_probe,%function
+_armv8_sha1_probe:
+	sha1h	s0, s0
+	ret
+.size	_armv8_sha1_probe,.-_armv8_sha1_probe
+
+.globl	_armv8_sha256_probe
+.type	_armv8_sha256_probe,%function
+_armv8_sha256_probe:
+	sha256su0	v0.4s, v0.4s
+	ret
+.size	_armv8_sha256_probe,.-_armv8_sha256_probe
+.globl	_armv8_pmull_probe
+.type	_armv8_pmull_probe,%function
+_armv8_pmull_probe:
+	pmull	v0.1q, v0.1d, v0.1d
+	ret
+.size	_armv8_pmull_probe,.-_armv8_pmull_probe
+___
+
+print $code;
+# Closing the pipe waits for the translator, so this also catches a
+# translator that exited with an error.
+close STDOUT or die "error closing STDOUT: $!";
--- a/crypto/arm_arch.h
+++ b/crypto/arm_arch.h
@ -10,13 +10,22 @@
 #   define __ARMEL__
 #  endif
 # elif defined(__GNUC__)
+#  if  defined(__aarch64__)
+ /*
+  * AArch64 build: the architecture level is set to 8 and the
+  * endianness macro is derived from __BYTE_ORDER__.
+  */
+#   define __ARM_ARCH__ 8
+#   if __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+#    define __ARMEB__
+#   else
+#    define __ARMEL__
+#   endif
  /*
   * Why doesn't gcc define __ARM_ARCH__? Instead it defines
   * bunch of below macros. See all_architectires[] table in
   * gcc/config/arm/arm.c. On a side note it defines
   * __ARMEL__/__ARMEB__ for little-/big-endian.
   */
-#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
+#  elif defined(__ARM_ARCH_8A__)
+#    define __ARM_ARCH__ 8
+#  elif	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
 	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
 	defined(__ARM_ARCH_7EM__)
 #   define __ARM_ARCH__ 7
@ -42,10 +51,14 @@

 #if !__ASSEMBLER__
 extern unsigned int OPENSSL_armcap_P;
+#endif
                                     
 #define ARMV7_NEON      (1<<0)
 #define ARMV7_TICK      (1<<1)
-#endif
+/* OPENSSL_armcap_P bits for the ARMv8 AES, SHA-1, SHA-256 and PMULL instructions */
+#define ARMV8_AES       (1<<2)
+#define ARMV8_SHA1      (1<<3)
+#define ARMV8_SHA256    (1<<4)
+#define ARMV8_PMULL     (1<<5)

 #endif
 #endif
--- a/crypto/armcap.c
+++ b/crypto/armcap.c
@ -20,6 +20,10 @@ static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
 */
 void _armv7_neon_probe(void);
 unsigned int _armv7_tick(void);
+void _armv8_aes_probe(void);
+void _armv8_sha1_probe(void);
+void _armv8_sha256_probe(void);
+void _armv8_pmull_probe(void);

 unsigned int OPENSSL_rdtsc(void)
 	{
@ -30,7 +34,7 @@ unsigned int OPENSSL_rdtsc(void)
 	}

 #if defined(__GNUC__) && __GNUC__>=2
-void OPENSSL_cpuid_setup(void) __attribute__((constructor))
+void OPENSSL_cpuid_setup(void) __attribute__((constructor));
 #endif
 void OPENSSL_cpuid_setup(void)
 	{
@ -68,6 +72,28 @@ void OPENSSL_cpuid_setup(void)
 		{
 		_armv7_neon_probe();
 		OPENSSL_armcap_P |= ARMV7_NEON;
+#ifdef __aarch64__
+		/*
+		 * Each probe executes one instruction of the feature.
+		 * If ill_handler siglongjmp()s back to ill_jmp, sigsetjmp
+		 * returns non-zero and the corresponding bit stays clear.
+		 */
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			/* a working PMULL probe sets the AES bit as well */
+			_armv8_pmull_probe();
+			OPENSSL_armcap_P |= ARMV8_PMULL|ARMV8_AES;
+			}
+		else if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			/* PMULL probe did not complete: test AES alone */
+			_armv8_aes_probe();
+			OPENSSL_armcap_P |= ARMV8_AES;
+			}
+		/* SHA-1 and SHA-256 are probed independently of the above */
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			_armv8_sha1_probe();
+			OPENSSL_armcap_P |= ARMV8_SHA1;
+			}
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			_armv8_sha256_probe();
+			OPENSSL_armcap_P |= ARMV8_SHA256;
+			}
+#endif
 		}
 	if (sigsetjmp(ill_jmp,1) == 0)
 		{
--- a/crypto/armv4cpuid.S
+++ b/crypto/armv4cpuid.S
@ -44,7 +44,7 @@ OPENSSL_atomic_add:
 	bne	.Lspin

 	ldr	r2,[r4]
-	add	r2,r5
+	add	r2,r2,r5
 	str	r2,[r4]
 	str	r0,[r6]		@ release spinlock
 	ldmia	sp!,{r4-r6,lr}
@ -59,26 +59,26 @@ OPENSSL_atomic_add:
 OPENSSL_cleanse:
 	eor	ip,ip,ip
 	cmp	r1,#7
-	subhs	r1,#4
+	subhs	r1,r1,#4
 	bhs	.Lot
 	cmp	r1,#0
 	beq	.Lcleanse_done
 .Little:
 	strb	ip,[r0],#1
-	subs	r1,#1
+	subs	r1,r1,#1
 	bhi	.Little
 	b	.Lcleanse_done

 .Lot:	tst	r0,#3
 	beq	.Laligned
 	strb	ip,[r0],#1
-	sub	r1,#1
+	sub	r1,r1,#1
 	b	.Lot
 .Laligned:
 	str	ip,[r0],#4
-	subs	r1,#4
+	subs	r1,r1,#4
 	bhs	.Laligned
-	adds	r1,#4
+	adds	r1,r1,#4
 	bne	.Little
 .Lcleanse_done:
 	tst	lr,#1
--- a/crypto/armv4cpuid_ios.S
+++ b/crypto/armv4cpuid_ios.S
@ -0,0 +1,210 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.align	5
+.globl	_OPENSSL_atomic_add
+
+@ _OPENSSL_atomic_add: add r1 to the word at [r0].
+_OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+@ ldrex/strex loop, retried until the store succeeds; the new value is
+@ returned in r0.
+Ladd:	ldrex	r2,[r0]
+	add	r3,r2,r1
+	strex	r2,r3,[r0]
+	cmp	r2,#0
+	bne	Ladd
+	mov	r0,r3
+	bx	lr
+#else
+@ Fallback: the update is done while holding a spinlock word that is
+@ taken with swp; sched_yield is called between attempts.
+@ NOTE(review): r0 is 0 on return from this path (it holds the value
+@ written to release the lock), unlike the path above - confirm intended.
+	stmdb	sp!,{r4,r5,r6,lr}
+	ldr	r2,Lspinlock
+	adr	r3,Lspinlock
+	mov	r4,r0
+	mov	r5,r1
+	add	r6,r3,r2	@ &spinlock
+	b	.+8		@ skip the yield on the first attempt
+Lspin:	bl	sched_yield
+	mov	r0,#-1
+	swp	r0,r0,[r6]
+	cmp	r0,#0
+	bne	Lspin
+
+	ldr	r2,[r4]
+	add	r2,r2,r5
+	str	r2,[r4]
+	str	r0,[r6]		@ release spinlock
+	ldmia	sp!,{r4,r5,r6,lr}
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+.globl	_OPENSSL_cleanse
+
+@ _OPENSSL_cleanse: store zero bytes to r1 bytes starting at [r0].
+@ Lengths below 7 are written byte by byte.  Longer ones are written
+@ bytewise until r0 is 4-byte aligned, then a word at a time, and any
+@ remainder bytewise again.
+_OPENSSL_cleanse:
+	eor	ip,ip,ip	@ ip = 0, the value stored
+	cmp	r1,#7
+	subhs	r1,r1,#4	@ bias the count for the word loop
+	bhs	Lot
+	cmp	r1,#0
+	beq	Lcleanse_done
+Little:
+	strb	ip,[r0],#1
+	subs	r1,r1,#1
+	bhi	Little
+	b	Lcleanse_done
+
+Lot:	tst	r0,#3
+	beq	Laligned
+	strb	ip,[r0],#1
+	sub	r1,r1,#1
+	b	Lot
+Laligned:
+	str	ip,[r0],#4
+	subs	r1,r1,#4
+	bhs	Laligned
+	adds	r1,r1,#4	@ undo the bias, leaving the remainder
+	bne	Little
+Lcleanse_done:
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+
+
+@ Probe routines: each one executes a single instruction of the feature
+@ it is named after and returns.  The ARMv8 ones are emitted as raw
+@ bytes; the instruction each encodes is given in its trailing comment.
+.align	5
+.globl	__armv7_neon_probe
+
+__armv7_neon_probe:
+	vorr	q0,q0,q0
+	bx	lr
+
+
+.globl	__armv7_tick
+
+@ __armv7_tick: read a 64-bit counter into r0 (and r1) with mrrc.
+__armv7_tick:
+#ifdef	__APPLE__
+	mrrc	p15,0,r0,r1,c14		@ CNTPCT
+#else
+	mrrc	p15,1,r0,r1,c14		@ CNTVCT
+#endif
+	bx	lr
+
+
+.globl	__armv8_aes_probe
+
+__armv8_aes_probe:
+.byte	0x00,0x03,0xb0,0xf3	@ aese.8	q0,q0
+	bx	lr
+
+
+.globl	__armv8_sha1_probe
+
+__armv8_sha1_probe:
+.byte	0x40,0x0c,0x00,0xf2	@ sha1c.32	q0,q0,q0
+	bx	lr
+
+
+.globl	__armv8_sha256_probe
+
+__armv8_sha256_probe:
+.byte	0x40,0x0c,0x00,0xf3	@ sha256h.32	q0,q0,q0
+	bx	lr
+
+.globl	__armv8_pmull_probe
+
+__armv8_pmull_probe:
+.byte	0x00,0x0e,0xa0,0xf2	@ vmull.p64	q0,d0,d0
+	bx	lr
+
+@ _OPENSSL_wipe_cpu: zero r2, r3 and ip and, when bit 0 of
+@ OPENSSL_armcap_P is set, q0-q3 and q8-q15 as well (q4-q7 are left
+@ untouched).  Returns the current sp in r0.
+.globl	_OPENSSL_wipe_cpu
+
+_OPENSSL_wipe_cpu:
+	ldr	r0,LOPENSSL_armcap	@ offset stored at the literal
+	adr	r1,LOPENSSL_armcap	@ address of the literal
+	ldr	r0,[r1,r0]
+#ifdef	__APPLE__
+	ldr	r0,[r0]			@ one more level of indirection
+#endif
+	eor	r2,r2,r2
+	eor	r3,r3,r3
+	eor	ip,ip,ip
+	tst	r0,#1
+	beq	Lwipe_done
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+	veor	q3, q3, q3
+	veor	q8, q8, q8
+	veor	q9, q9, q9
+	veor	q10, q10, q10
+	veor	q11, q11, q11
+	veor	q12, q12, q12
+	veor	q13, q13, q13
+	veor	q14, q14, q14
+	veor	q15, q15, q15
+Lwipe_done:
+	mov	r0,sp
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+@ _OPENSSL_instrument_bus and _OPENSSL_instrument_bus2 are stubs here:
+@ both only clear r0 and return.
+.globl	_OPENSSL_instrument_bus
+
+_OPENSSL_instrument_bus:
+	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+.globl	_OPENSSL_instrument_bus2
+
+_OPENSSL_instrument_bus2:
+	eor	r0,r0,r0
+#if __ARM_ARCH__>=5
+	bx	lr
+#else
+	tst	lr,#1
+	moveq	pc,lr
+.word	0xe12fff1e	@ bx	lr
+#endif
+
+
+@ Literals used by the code above: each holds the distance from the
+@ literal itself to the object it refers to.
+.align	5
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.
+#if __ARM_ARCH__>=6
+.align	5
+#else
+Lspinlock:
+.word	atomic_add_spinlock-Lspinlock
+.align	5
+
+.data
+.align	2
+@ Lock word for the pre-ARMv6 path of _OPENSSL_atomic_add.  It needs an
+@ explicit operand: a bare .word directive reserves no storage, which
+@ would leave the label without a word of its own.
+atomic_add_spinlock:
+.word	0
+#endif
+
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
--- a/crypto/bn/asm/armv4-gf2m.pl
+++ b/crypto/bn/asm/armv4-gf2m.pl
@ -21,8 +21,20 @@
 # runs in even less cycles, ~30, improvement is measurable only on
 # longer keys. One has to optimize code elsewhere to get NEON glow...

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+# Arguments: [flavour] ... output.  If the first argument already looks
+# like a file name it is the output and there is no flavour; otherwise
+# the first later argument that looks like a file name is the output.
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    # Pipe the generated code through arm-xlate.pl, looked up next to
+    # this script and then under ../../perlasm/.
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output"
+	or die "can't call $xlate: $!";
+} elsif ($output) {
+    # Without an output name STDOUT is left as it is.
+    open STDOUT,">",$output or die "can't open $output: $!";
+}

 sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
 sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
@ -170,11 +182,18 @@ bn_GF2m_mul_2x2:
 #if __ARM_ARCH__>=7
 	ldr	r12,.LOPENSSL_armcap
 .Lpic:	ldr	r12,[pc,r12]
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
 	tst	r12,#1
 	beq	.Lialu

 	veor	$A1,$A1
+#ifdef	__APPLE__
+	vmov	$B1,r3,r3		@ two copies of b1
+#else
 	vmov.32	$B1,r3,r3		@ two copies of b1
+#endif
 	vmov.32	${A1}[0],r1		@ a1

 	veor	$A0,$A0
@ -218,38 +237,38 @@ $code.=<<___;
 	mov	$b,r3			@ $b=b1
 	ldr	r3,[sp,#32]		@ load b0
 	mov	$mask,#7<<2
-	sub	sp,#32			@ allocate tab[8]
+	sub	sp,sp,#32		@ allocate tab[8]

 	bl	mul_1x1_ialu		@ a1·b1
 	str	$lo,[$ret,#8]
 	str	$hi,[$ret,#12]

-	eor	$b,r3			@ flip b0 and b1
-	 eor	$a,r2			@ flip a0 and a1
-	eor	r3,$b
-	 eor	r2,$a
-	eor	$b,r3
-	 eor	$a,r2
+	eor	$b,$b,r3		@ flip b0 and b1
+	 eor	$a,$a,r2		@ flip a0 and a1
+	eor	r3,r3,$b
+	 eor	r2,r2,$a
+	eor	$b,$b,r3
+	 eor	$a,$a,r2
 	bl	mul_1x1_ialu		@ a0·b0
 	str	$lo,[$ret]
 	str	$hi,[$ret,#4]

-	eor	$a,r2
-	eor	$b,r3
+	eor	$a,$a,r2
+	eor	$b,$b,r3
 	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
 ___
@r=map("r$_",(6..9));
 $code.=<<___;
 	ldmia	$ret,{@r[0]-@r[3]}
-	eor	$lo,$hi
-	eor	$hi,@r[1]
-	eor	$lo,@r[0]
-	eor	$hi,@r[2]
-	eor	$lo,@r[3]
-	eor	$hi,@r[3]
+	eor	$lo,$lo,$hi
+	eor	$hi,$hi,@r[1]
+	eor	$lo,$lo,@r[0]
+	eor	$hi,$hi,@r[2]
+	eor	$lo,$lo,@r[3]
+	eor	$hi,$hi,@r[3]
 	str	$hi,[$ret,#8]
-	eor	$lo,$hi
-	add	sp,#32			@ destroy tab[8]
+	eor	$lo,$lo,$hi
+	add	sp,sp,#32		@ destroy tab[8]
 	str	$lo,[$ret,#4]

 #if __ARM_ARCH__>=5
--- a/crypto/bn/asm/armv4-mont.pl
+++ b/crypto/bn/asm/armv4-mont.pl
@ -23,8 +23,20 @@
 # than 1/2KB. Windows CE port would be trivial, as it's exclusively
 # about decorations, ABI and instruction syntax are identical.

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+# Arguments: [flavour] ... output.  If the first argument already looks
+# like a file name it is the output and there is no flavour; otherwise
+# the first later argument that looks like a file name is the output.
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    # Pipe the generated code through arm-xlate.pl, looked up next to
+    # this script and then under ../../perlasm/.
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output"
+	or die "can't call $xlate: $!";
+} elsif ($output) {
+    # Without an output name STDOUT is left as it is.
+    open STDOUT,">",$output or die "can't open $output: $!";
+}

 $num="r0";	# starts as num argument, but holds &tp[num-1]
 $ap="r1";
--- a/crypto/bn/asm/bn-c64xplus.asm
+++ b/crypto/bn/asm/bn-c64xplus.asm
@ -0,0 +1,333 @@
+;;====================================================================
+;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+;; project.
+;;
+;; Rights for redistribution and usage in source and binary forms are
+;; granted according to the OpenSSL license. Warranty of any kind is
+;; disclaimed.
+;;====================================================================
+;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
+;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
+;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
+;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
+;;====================================================================
+	.text
+
+	.asg	B3,RA
+	.asg	A4,ARG0
+	.asg	B4,ARG1
+	.asg	A6,ARG2
+	.asg	B6,ARG3
+	.asg	A8,ARG4
+	.asg	B8,ARG5
+	.asg	A4,RET
+	.asg	A15,FP
+	.asg	B14,DP
+	.asg	B15,SP
+
+;; bn_mul_add_words: rp[i] += ap[i]*w for ARG2 words, with ARG0=rp,
+;; ARG1=ap and ARG3=w.  The final high word of the accumulator is
+;; returned in RET; a zero word count returns 0 at once.
+	.global	_bn_mul_add_words
+_bn_mul_add_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A19		; high part of accumulator
+|| [B0]	MV	ARG0,A2
+|| [B0]	MV	ARG3,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	3
+	LDW	*ARG0++,A7	; rp[i]
+	MPY32U	B7,A3,A17:A16
+	NOP	3		; [2,0] in epilogue
+	ADDU	A16,A7,A21:A20
+	ADDU	A19,A21:A20,A19:A18
+||	MV.S	A17,A23
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*A2++	; rp[i]
+||	ADD	A19,A23,A19
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+
+;; bn_mul_words: rp[i] = low word of ap[i]*w plus the running high
+;; word, for ARG2 words, with ARG0=rp, ARG1=ap and ARG3=w.  The final
+;; high word is returned in RET; a zero word count returns 0 at once.
+	.global	_bn_mul_words
+_bn_mul_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A19		; high part of accumulator
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,A7	; ap[i]
+	NOP	4
+	MPY32U	A7,ARG3,A17:A16
+	NOP	4		; [2,0] in epilogue
+	ADDU	A19,A16,A19:A18
+||	MV.S	A17,A21
+	SPKERNEL 2,1		; leave slot for "return value"
+||	STW	A18,*ARG0++	; rp[i]
+||	ADD.L	A19,A21,A19
+;;====================================================================
+	BNOP	RA,4
+	MV	A19,RET		; return value
+	.endasmfunc
+
+;; bn_sqr_words: rp[2*i] and rp[2*i+1] receive the low and high words
+;; of ap[i]*ap[i], for ARG2 words, with ARG0=rp and ARG1=ap.  Two
+;; pointers, one word apart and stepping by 8 bytes, address the even
+;; and odd result words.  A zero word count returns at once.
+	.global	_bn_sqr_words
+_bn_sqr_words:
+	.asmfunc
+	MV	ARG2,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	MV	ARG0,B2
+|| [B0]	ADD	4,ARG0,ARG0
+	NOP	3
+
+	SPLOOP	2		; 2*n+10
+;;====================================================================
+	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	MPY32U	B7,B7,B1:B0
+	NOP	3		; [2,0] in epilogue
+	STW	B0,*B2++(8)	; rp[2*i]
+	MV	B1,A1
+	SPKERNEL 2,0		; fully overlap BNOP RA,5
+||	STW	A1,*ARG0++(8)	; rp[2*i+1]
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+
+;; bn_add_words: rp[i] = ap[i] + bp[i] + carry for ARG3 words, with
+;; ARG0=rp, ARG1=ap and ARG2=bp.  The carry is kept in A1 and copied
+;; to RET on every iteration; a zero word count returns 0 at once.
+	.global	_bn_add_words
+_bn_add_words:
+	.asmfunc
+	MV	ARG3,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A1		; carry flag
+|| [B0]	MV	ARG0,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	ADDU	A7,B7,A9:A8
+	ADDU	A1,A9:A8,A1:A0
+	SPKERNEL 0,0		; fully overlap BNOP RA,5
+||	STW	A0,*A3++	; write result
+||	MV	A1,RET		; keep carry flag in RET
+;;====================================================================
+	BNOP	RA,5
+	.endasmfunc
+
+;; bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for ARG3 words, with
+;; ARG0=rp, ARG1=ap and ARG2=bp.  The borrow is carried in A2 as bit 0
+;; of the high result word and returned in RET; a zero word count
+;; returns 0 at once.
+	.global	_bn_sub_words
+_bn_sub_words:
+	.asmfunc
+	MV	ARG3,B0
+  [!B0]	BNOP	RA
+||[!B0]	MVK	0,RET
+   [B0]	MVC	B0,ILC
+   [B0]	ZERO	A2		; borrow flag
+|| [B0]	MV	ARG0,A3
+	NOP	3
+
+	SPLOOP	2		; 2*n+6
+;;====================================================================
+	LDW	*ARG2++,A7	; bp[i]
+||	LDW	*ARG1++,B7	; ap[i]
+	NOP	4
+	SUBU	B7,A7,A1:A0
+  [A2]	SUB	A1:A0,1,A1:A0
+	SPKERNEL 0,1		; leave slot for "return borrow flag"
+||	STW	A0,*A3++	; write result
+||	AND	1,A1,A2		; pass on borrow flag
+;;====================================================================
+	BNOP	RA,4
+	AND	1,A1,RET	; return borrow flag
+	.endasmfunc
+
+;; bn_div_words: rearranges its three arguments and calls the library
+;; routine __divull.  ARG0 goes to A5, ARG1 to A4, ARG2 to B4 and B5 is
+;; cleared - presumably forming the pairs A5:A4 and B5:B4 as the 64-bit
+;; dividend and divisor; verify against the __divull interface.
+	.global	_bn_div_words
+	.global	__divull
+_bn_div_words:
+	.asmfunc
+	CALLP	__divull,A3	; jump to rts64plus.lib
+||	MV	ARG0,A5
+||	MV	ARG1,ARG0
+||	MV	ARG2,ARG1
+||	ZERO	B5
+	.endasmfunc
+
+;;====================================================================
+;; Not really Comba algorithm, just straightforward NxM... Dedicated
+;; fully unrolled real Comba implementations are asymptotically 2x
+;; faster, but naturally larger undertaking. Purpose of this exercise
+;; was rather to learn to master nested SPLOOPs...
+;;====================================================================
+;; bn_mul_comba8: 8x8-word multiplication with ARG0=rp, ARG1=ap and
+;; ARG2=bp, done as an outer loop over ap[] around the multiply-and-add
+;; SPLOOP.  bn_sqr_comba8 copies ARG1 to ARG2 and falls through into it.
+	.global	_bn_sqr_comba8
+	.global	_bn_mul_comba8
+_bn_sqr_comba8:
+	MV	ARG1,ARG2
+_bn_mul_comba8:
+	.asmfunc
+	MVK	8,B0		; N, RILC
+||	MVK	8,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; N-2, initial ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+sploopNxM?:			; for best performance arrange M<=N
+   [A0]	SPLOOPD	2		; 2*n+10
+||	MVC	B1,ILC
+||	ADDAW	B4,B0,B5
+||	ZERO	B7
+||	LDW	*A5++,A9	; pre-fetch ap[1]
+||	ZERO	A1
+||	SUB	A0,1,A0
+;;====================================================================
+;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
+;; This is because of Advisory 15 from TI publication SPRZ247I.
+	LDW	*ARG2++,A7	; bp[i]
+	NOP	3
+   [A1]	LDW	*B5++,B7	; rp[i]
+	MPY32U	A7,B6,B17:B16
+	NOP	3
+	ADDU	B16,B7,B21:B20
+	ADDU	B19,B21:B20,B19:B18
+||	MV.S	B17,B23
+	SPKERNEL
+||	STW	B18,*B4++	; rp[i]
+||	ADD.S	B19,B23,B19
+;;====================================================================
+outer?:				; m*2*(n+1)+10
+	SUBAW	ARG2,A3,ARG2	; rewind bp to bp[0]
+	SPMASKR
+||	CMPGT	A0,1,A2		; done pre-fetching ap[i+1]?
+	MVD	A9,B6		; move through .M unit(*)
+   [A2]	LDW	*A5++,A9	; pre-fetch ap[i+1]
+	SUBAW	B5,B2,B5	; rewind rp to rp[1]
+	MVK	1,A1
+   [A0]	BNOP.S1	outer?,4
+|| [A0]	SUB.L	A0,1,A0
+	STW	B19,*B4--[B2]	; rewind rp to rp[1]
+||	ZERO.S	B19		; high part of accumulator
+;; end of outer?
+	BNOP	RA,5		; return
+	.endasmfunc
+;; (*)	It should be noted that B6 is used as input to MPY32U in
+;;	chronologically next cycle in *preceding* SPLOOP iteration.
+;;	Normally such arrangement would require DINT, but at this
+;;	point SPLOOP is draining and interrupts are disabled
+;;	implicitly.
+
+;; bn_mul_comba4: 4x4-word multiplication with ARG0=rp, ARG1=ap and
+;; ARG2=bp; the eight result words are stored to ARG0[0]..ARG0[7].
+;; bn_sqr_comba4 copies ARG1 to ARG2 and falls through into it.
+;; The ".if 0" branch (reuse of the NxM loop) is assembled out; the
+;; fully unrolled ".else" branch is the one in effect.
+	.global	_bn_sqr_comba4
+	.global	_bn_mul_comba4
+_bn_sqr_comba4:
+	MV	ARG1,ARG2
+_bn_mul_comba4:
+	.asmfunc
+	.if	0
+	BNOP	sploopNxM?,3
+	;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
+	;; because of read-after-write penalties, it's rather
+	;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
+	MVK	4,B0		; N, RILC
+||	MVK	4,A0		; M, outer loop counter
+||	MV	ARG1,A5		; copy ap
+||	MV	ARG0,B4		; copy rp
+||	ZERO	B19		; high part of accumulator
+	MVC	B0,RILC
+||	SUB	B0,2,B1		; first ILC
+||	SUB	B0,1,B2		; const B2=N-1
+||	LDW	*A5++,B6	; ap[0]
+||	MV	A0,A3		; const A3=M
+	.else
+	;; This alternative is exercise in fully unrolled Comba
+	;; algorithm implementation that operates at n*(n+1)+12, or
+	;; as little as 32 cycles...
+	LDW	*ARG1[0],B16	; a[0]
+||	LDW	*ARG2[0],A16	; b[0]
+	LDW	*ARG1[1],B17	; a[1]
+||	LDW	*ARG2[1],A17	; b[1]
+	LDW	*ARG1[2],B18	; a[2]
+||	LDW	*ARG2[2],A18	; b[2]
+	LDW	*ARG1[3],B19	; a[3]
+||	LDW	*ARG2[3],A19	; b[3]
+	NOP
+	MPY32U	A16,B16,A1:A0	; a[0]*b[0]
+	MPY32U	A17,B16,A23:A22	; a[0]*b[1]
+	MPY32U	A16,B17,A25:A24	; a[1]*b[0]
+	MPY32U	A16,B18,A27:A26	; a[2]*b[0]
+	STW	A0,*ARG0[0]
+||	MPY32U	A17,B17,A29:A28	; a[1]*b[1]
+	MPY32U	A18,B16,A31:A30	; a[0]*b[2]
+||	ADDU	A22,A1,A1:A0
+	MV	A23,B0
+||	MPY32U	A19,B16,A21:A20	; a[3]*b[0]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B0,B1:B0
+||	STW	A0,*ARG0[1]
+||	MPY32U	A18,B17,A23:A22	; a[2]*b[1]
+||	ADDU	A26,A1,A9:A8
+	ADDU	A27,B1,B9:B8
+||	MPY32U	A17,B18,A25:A24	; a[1]*b[2]
+||	ADDU	A28,A9:A8,A9:A8
+	ADDU	A29,B9:B8,B9:B8
+||	MPY32U	A16,B19,A27:A26	; a[0]*b[3]
+||	ADDU	A30,A9:A8,A9:A8
+	ADDU	A31,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[2]
+||	ADDU	A20,A9,A1:A0
+	ADDU	A21,B9,B1:B0
+||	MPY32U	A19,B17,A21:A20	; a[3]*b[1]
+||	ADDU	A22,A1:A0,A1:A0
+	ADDU	A23,B1:B0,B1:B0
+||	MPY32U	A18,B18,A23:A22	; a[2]*b[2]
+||	ADDU	A24,A1:A0,A1:A0
+	ADDU	A25,B1:B0,B1:B0
+||	MPY32U	A17,B19,A25:A24	; a[1]*b[3]
+||	ADDU	A26,A1:A0,A1:A0
+	ADDU	A27,B1:B0,B1:B0
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[3]
+||	MPY32U	A19,B18,A27:A26	; a[3]*b[2]
+||	ADDU	A20,A1,A9:A8
+	ADDU	A21,B1,B9:B8
+||	MPY32U	A18,B19,A29:A28	; a[2]*b[3]
+||	ADDU	A22,A9:A8,A9:A8
+	ADDU	A23,B9:B8,B9:B8
+||	MPY32U	A19,B19,A31:A30	; a[3]*b[3]
+||	ADDU	A24,A9:A8,A9:A8
+	ADDU	A25,B9:B8,B9:B8
+||	ADDU	B0,A9:A8,A9:A8
+	STW	A8,*ARG0[4]
+||	ADDU	A26,A9,A1:A0
+	ADDU	A27,B9,B1:B0
+||	ADDU	A28,A1:A0,A1:A0
+	ADDU	A29,B1:B0,B1:B0
+||	BNOP	RA
+||	ADDU	B8,A1:A0,A1:A0
+	STW	A0,*ARG0[5]
+||	ADDU	A30,A1,A9:A8
+	ADD	A31,B1,B8
+	ADDU	B0,A9:A8,A9:A8	; removed || to avoid cross-path stall below
+	ADD	B8,A9,A9
+||	STW	A8,*ARG0[6]
+	STW	A9,*ARG0[7]
+	.endif
+	.endasmfunc
--- a/crypto/bn/asm/c64xplus-gf2m.pl
+++ b/crypto/bn/asm/c64xplus-gf2m.pl
@ -0,0 +1,146 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# February 2012
+#
+# The module implements the bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. For the time being it is a low-hanging, mechanical
+# port from C... The subroutine runs in 37 cycles, which is 4.5x faster
+# than compiler-generated code. The comparison is totally unfair,
+# though, because this module utilizes the Galois Field Multiply
+# instruction.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# consume arguments until one looks like a file name (name.ext) or none are left
+open STDOUT,">$output";	# redirect STDOUT to that file; the result of open is not checked
+
+($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
+
+($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));	# A16: low halfword of A; A17..A20: its products with the four bytes of B
+($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));	# B16: high halfword of A; B17..B20: its products with the four bytes of B
+($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");	# registers receiving the individual bytes of B
+($A,$B)=($Alo,$B_1);	# staging registers for the next operand pair; they alias $Alo and $B_1
+$xFF="B1";	# register loaded with the 0xFF byte mask at function entry
+
+sub mul_1x1_upper {	# emit the first half of a 32x32-bit polynomial multiplication
+my ($A,$B)=@_;	# names of the registers holding the two operands
+$code.=<<___;	# split A into halfwords and B into bytes, then issue the eight XORMPY partial products
+	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	AND	$B,$xFF,$B_0
+||	SHRU	$B,24,$B_3
+	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
+||	EXTU	$A,16,16,$Alo
+
+	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
+||	XORMPY	$Ahi,$B_2,$Ahix2
+||	EXTU	$B,16,24,$B_1
+	XORMPY	$Alo,$B_0,$Alox0
+||	XORMPY	$Ahi,$B_0,$Ahix0
+	XORMPY	$Alo,$B_3,$Alox3
+||	XORMPY	$Ahi,$B_3,$Ahix3
+	XORMPY	$Alo,$B_1,$Alox1
+||	XORMPY	$Ahi,$B_1,$Ahix1
+___
+}	# the partial products stay in $Alox0..3/$Ahix0..3 for mul_1x1_merged or mul_1x1_lower to combine
+sub mul_1x1_merged {	# emit the combine step of the previous multiplication interleaved with the split/XORMPY step of the next
+my ($OUTlo,$OUThi,$A,$B)=@_;	# result registers (low, high word) for the previous product; operand registers for the next one
+$code.=<<___;	# instructions with the extra leading space belong to the next multiplication
+	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
+||	 AND	$B,$xFF,$B_0
+||	 SHRU	$B,24,$B_3
+	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
+||	 EXTU	$A,16,16,$Alo
+
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+||	 XORMPY	$Alo,$B_2,$Alox2
+	 XORMPY	$Ahi,$B_2,$Ahix2
+||	 EXTU	$B,16,24,$B_1
+||	 XORMPY	$Alo,$B_0,A1		; $Alox0
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	 XORMPY	$Ahi,$B_0,$Ahix0
+||	 XORMPY	$Alo,$B_3,$Alox3
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	 XORMPY	$Ahi,$B_3,$Ahix3
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+||	 XORMPY	$Alo,$B_1,$Alox1
+||	 XORMPY	$Ahi,$B_1,$Ahix1
+||	 MV	A1,$Alox0
+___
+}	# the new $Alo*$B_0 product is parked in A1 and only moved to $Alox0 in the last packet, after the old $Alox0 has been read
+sub mul_1x1_lower {	# emit the second half of a multiplication: combine the pending partial products into a 64-bit result
+my ($OUTlo,$OUThi)=@_;	# names of the registers receiving the low and high result words
+$code.=<<___;	# same combine sequence as the unindented instructions of mul_1x1_merged, with a NOP where the XORMPY packet was
+	;NOP
+	XOR	$Ahix0,$Alox2,$Ahix0
+||	MV	$Ahix2,$OUThi
+	NOP
+	XOR	$Ahix1,$Alox3,$Ahix1
+||	SHL	$Ahix0,16,$OUTlo
+||	SHRU	$Ahix0,16,$Ahix0
+	XOR	$Alox0,$OUTlo,$OUTlo
+||	XOR	$Ahix0,$OUThi,$OUThi
+||	SHL	$Alox1,8,$Alox1
+||	SHL	$Ahix3,8,$Ahix3
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix3,$OUThi,$OUThi
+||	SHL	$Ahix1,24,$Alox1
+||	SHRU	$Ahix1,8, $Ahix1
+	XOR	$Alox1,$OUTlo,$OUTlo
+||	XOR	$Ahix1,$OUThi,$OUThi
+___
+}	# expects the partial products in $Alox0..3/$Ahix0..3, as left by mul_1x1_upper or mul_1x1_merged
+$code.=<<___;	# function prologue: section, exported symbol, byte mask
+	.text
+
+	.global	_bn_GF2m_mul_2x2
+_bn_GF2m_mul_2x2:
+	.asmfunc
+	MVK	0xFF,$xFF
+___
+	&mul_1x1_upper($a0,$b0);		# a0·b0
+$code.=<<___;	# stage b1 and a1 in the operand registers for the next multiplication
+||	MV	$b1,$B
+	MV	$a1,$A
+___
+	&mul_1x1_merged("A28","B28",$A,$B);	# a0·b0/a1·b1
+$code.=<<___;	# stage b0^b1 and a0^a1 for the middle (Karatsuba) product
+||	XOR	$b0,$b1,$B
+	XOR	$a0,$a1,$A
+___
+	&mul_1x1_merged("A31","B31",$A,$B);	# a1·b1/(a0+a1)·(b0+b1)
+$code.=<<___;	# A29/B29 receive the low/high words of a0·b0 xor a1·b1
+	XOR	A28,A31,A29
+||	XOR	B28,B31,B29			; a0·b0+a1·b1
+___
+	&mul_1x1_lower("A30","B30");		# (a0+a1)·(b0+b1)
+$code.=<<___;	# issue the return branch (BNOP B3), fold the three products together and store four result words at rp[0..3]
+||	BNOP	B3
+	XOR	A29,A30,A30
+||	XOR	B29,B30,B30			; (a0+a1)·(b0+b1)-a0·b0-a1·b1
+	XOR	B28,A30,A30
+||	STW	A28,*${rp}[0]
+	XOR	B30,A31,A31
+||	STW	A30,*${rp}[1]
+	STW	A31,*${rp}[2]
+	STW	B31,*${rp}[3]
+	.endasmfunc
+___
+
+print $code;	# write the accumulated assembly text to STDOUT
+close STDOUT;	# flush the output; the result of close is not checked
--- a/crypto/bn/asm/mips-mont.pl
+++ b/crypto/bn/asm/mips-mont.pl
@ -46,7 +46,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@ -133,7 +133,7 @@ $code.=<<___;
 	bnez	$at,1f
 	li	$t0,0
 	slt	$at,$num,17	# on in-order CPU
-	bnezl	$at,bn_mul_mont_internal
+	bnez	$at,bn_mul_mont_internal
 	nop
 1:	jr	$ra
 	li	$a0,0
--- a/crypto/bn/asm/mips.pl
+++ b/crypto/bn/asm/mips.pl
@ -48,7 +48,7 @@
 # has to content with 40-85% improvement depending on benchmark and
 # key length, more for longer keys.

-$flavour = shift;
+$flavour = shift || "o32";
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

@ -140,10 +140,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$ta0,$a2,$minus4
-	$LD	$t0,0($a1)
 	beqz	$ta0,.L_bn_mul_add_words_tail

 .L_bn_mul_add_words_loop:
+	$LD	$t0,0($a1)
 	$MULTU	$t0,$a3
 	$LD	$t1,0($a0)
 	$LD	$t2,$BNSZ($a1)
@ -200,10 +200,9 @@ $code.=<<___;
 	$ADDU	$v0,$ta2
 	sltu	$at,$ta3,$at
 	$ST	$ta3,-$BNSZ($a0)
-	$ADDU	$v0,$at
 	.set	noreorder
-	bgtzl	$ta0,.L_bn_mul_add_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$ta0,.L_bn_mul_add_words_loop
+	$ADDU	$v0,$at

 	beqz	$a2,.L_bn_mul_add_words_return
 	nop
@ -267,7 +266,7 @@ ___
 $code.=<<___;
 	jr	$ra
 	move	$a0,$v0
-.end	bn_mul_add_words
+.end	bn_mul_add_words_internal

 .align	5
 .globl	bn_mul_words
@ -300,10 +299,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$ta0,$a2,$minus4
-	$LD	$t0,0($a1)
 	beqz	$ta0,.L_bn_mul_words_tail

 .L_bn_mul_words_loop:
+	$LD	$t0,0($a1)
 	$MULTU	$t0,$a3
 	$LD	$t2,$BNSZ($a1)
 	$LD	$ta0,2*$BNSZ($a1)
@ -341,10 +340,9 @@ $code.=<<___;
 	$ADDU	$v0,$at
 	sltu	$ta3,$v0,$at
 	$ST	$v0,-$BNSZ($a0)
-	$ADDU	$v0,$ta3,$ta2
 	.set	noreorder
-	bgtzl	$ta0,.L_bn_mul_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$ta0,.L_bn_mul_words_loop
+	$ADDU	$v0,$ta3,$ta2

 	beqz	$a2,.L_bn_mul_words_return
 	nop
@ -429,10 +427,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$ta0,$a2,$minus4
-	$LD	$t0,0($a1)
 	beqz	$ta0,.L_bn_sqr_words_tail

 .L_bn_sqr_words_loop:
+	$LD	$t0,0($a1)
 	$MULTU	$t0,$t0
 	$LD	$t2,$BNSZ($a1)
 	$LD	$ta0,2*$BNSZ($a1)
@ -463,11 +461,10 @@ $code.=<<___;
 	mflo	$ta3
 	mfhi	$ta2
 	$ST	$ta3,-2*$BNSZ($a0)
-	$ST	$ta2,-$BNSZ($a0)

 	.set	noreorder
-	bgtzl	$ta0,.L_bn_sqr_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$ta0,.L_bn_sqr_words_loop
+	$ST	$ta2,-$BNSZ($a0)

 	beqz	$a2,.L_bn_sqr_words_return
 	nop
@ -547,10 +544,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$at,$a3,$minus4
-	$LD	$t0,0($a1)
 	beqz	$at,.L_bn_add_words_tail

 .L_bn_add_words_loop:
+	$LD	$t0,0($a1)
 	$LD	$ta0,0($a2)
 	subu	$a3,4
 	$LD	$t1,$BNSZ($a1)
@ -589,11 +586,10 @@ $code.=<<___;
 	$ADDU	$t3,$ta3,$v0
 	sltu	$v0,$t3,$ta3
 	$ST	$t3,-$BNSZ($a0)
-	$ADDU	$v0,$t9
 	
 	.set	noreorder
-	bgtzl	$at,.L_bn_add_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$at,.L_bn_add_words_loop
+	$ADDU	$v0,$t9

 	beqz	$a3,.L_bn_add_words_return
 	nop
@ -679,10 +675,10 @@ $code.=<<___;
 	.set	reorder
 	li	$minus4,-4
 	and	$at,$a3,$minus4
-	$LD	$t0,0($a1)
 	beqz	$at,.L_bn_sub_words_tail

 .L_bn_sub_words_loop:
+	$LD	$t0,0($a1)
 	$LD	$ta0,0($a2)
 	subu	$a3,4
 	$LD	$t1,$BNSZ($a1)
@ -722,11 +718,10 @@ $code.=<<___;
 	$SUBU	$t3,$ta3,$v0
 	sgtu	$v0,$t3,$ta3
 	$ST	$t3,-$BNSZ($a0)
-	$ADDU	$v0,$t9

 	.set	noreorder
-	bgtzl	$at,.L_bn_sub_words_loop
-	$LD	$t0,0($a1)
+	bgtz	$at,.L_bn_sub_words_loop
+	$ADDU	$v0,$t9

 	beqz	$a3,.L_bn_sub_words_return
 	nop
@ -778,7 +773,7 @@ ___
 $code.=<<___;
 	jr	$ra
 	move	$a0,$v0
-.end	bn_sub_words
+.end	bn_sub_words_internal

 .align 5
 .globl	bn_div_3_words
@ -819,7 +814,7 @@ ___
 $code.=<<___;
 	.set	reorder
 	move	$ta3,$ra
-	bal	bn_div_words
+	bal	bn_div_words_internal
 	move	$ra,$ta3
 	$MULTU	$ta2,$v0
 	$LD	$t2,-2*$BNSZ($a3)
@ -840,8 +835,9 @@ $code.=<<___;
 	sltu	$ta0,$a1,$a2
 	or	$t8,$ta0
 	.set	noreorder
-	beqzl	$at,.L_bn_div_3_words_inner_loop
+	beqz	$at,.L_bn_div_3_words_inner_loop
 	$SUBU	$v0,1
+	$ADDU	$v0,1
 	.set	reorder
 .L_bn_div_3_words_inner_loop_done:
 	.set	noreorder
@ -902,7 +898,8 @@ $code.=<<___;
 	and	$t2,$a0
 	$SRL	$at,$a1,$t1
 	.set	noreorder
-	bnezl	$t2,.+8
+	beqz	$t2,.+12
+	nop
 	break	6		# signal overflow
 	.set	reorder
 	$SLL	$a0,$t9
@ -917,7 +914,8 @@ $code.=<<___;
 	$SRL	$DH,$a2,4*$BNSZ	# bits
 	sgeu	$at,$a0,$a2
 	.set	noreorder
-	bnezl	$at,.+8
+	beqz	$at,.+12
+	nop
 	$SUBU	$a0,$a2
 	.set	reorder

@ -1874,6 +1872,41 @@ ___

 ($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

+sub add_c2 () {	# emit code adding twice the pending HI:LO product into the three-word accumulator c2:c1:c0
+my ($hi,$lo,$c0,$c1,$c2,	# scratch registers for the product halves, then the three accumulator words
+    $warm,      # !$warm denotes first call with specific sequence of
+                # $c_[XYZ] when there is no Z-carry to accumulate yet;
+    $an,$bn     # these two are arguments for multiplication which
+                # result is used in *next* step [which is why it's
+                # commented as "forward multiplication" below];
+    )=@_;	# NOTE: the empty prototype above is bypassed because every call uses the &add_c2(...) form
+$code.=<<___;	# common part: fetch the product, start the next multiply, add lo twice to c0 and hi plus carry to c1
+	mflo	$lo
+	mfhi	$hi
+	$ADDU	$c0,$lo
+	sltu	$at,$c0,$lo
+	 $MULTU	$an,$bn			# forward multiplication
+	$ADDU	$c0,$lo
+	$ADDU	$at,$hi
+	sltu	$lo,$c0,$lo
+	$ADDU	$c1,$at
+	$ADDU	$hi,$lo
+___
+$code.=<<___	if (!$warm);	# cold variant: c2 is overwritten by the first carry, then the second carry is added
+	sltu	$c2,$c1,$at
+	$ADDU	$c1,$hi
+	sltu	$hi,$c1,$hi
+	$ADDU	$c2,$hi
+___
+$code.=<<___	if ($warm);	# warm variant: both carries are added to the existing c2
+	sltu	$at,$c1,$at
+	$ADDU	$c1,$hi
+	$ADDU	$c2,$at
+	sltu	$hi,$c1,$hi
+	$ADDU	$c2,$hi
+___
+}	# appends to the global $code; returns nothing the callers use
+
 $code.=<<___;

 .align	5
@ -1922,21 +1955,10 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_1
 	$ADDU	$c_3,$t_2,$at
 	$ST	$c_2,$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_3,$t_1
@ -1947,67 +1969,19 @@ $code.=<<___;
 	sltu	$at,$c_1,$t_2
 	$ADDU	$c_2,$at
 	$ST	$c_3,2*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
+$code.=<<___;
 	$ST	$c_1,3*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_2,$t_1
@ -2018,97 +1992,23 @@ $code.=<<___;
 	sltu	$at,$c_3,$t_2
 	$ADDU	$c_1,$at
 	$ST	$c_2,4*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
-	$ADDU	$c_2,$at
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
+$code.=<<___;
 	$ST	$c_3,5*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_1,$t_1
@ -2119,112 +2019,25 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_2
 	$ADDU	$c_3,$at
 	$ST	$c_1,6*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
+$code.=<<___;
 	$ST	$c_2,7*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_3,$t_1
@ -2235,82 +2048,21 @@ $code.=<<___;
 	sltu	$at,$c_1,$t_2
 	$ADDU	$c_2,$at
 	$ST	$c_3,8*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
+$code.=<<___;
 	$ST	$c_1,9*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_1,$at
-	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
+		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_2,$t_1
@ -2321,52 +2073,17 @@ $code.=<<___;
 	sltu	$at,$c_3,$t_2
 	$ADDU	$c_1,$at
 	$ST	$c_2,10*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_2,$at
-	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
+		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
+$code.=<<___;
 	$ST	$c_3,11*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_1,$t_1
@ -2377,21 +2094,10 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_2
 	$ADDU	$c_3,$at
 	$ST	$c_1,12*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
+$code.=<<___;
 	$ST	$c_2,13*$BNSZ($a0)

 	mflo	$t_1
@ -2459,21 +2165,10 @@ $code.=<<___;
 	sltu	$at,$c_2,$t_1
 	$ADDU	$c_3,$t_2,$at
 	$ST	$c_2,$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_3,$t_1
@ -2484,52 +2179,17 @@ $code.=<<___;
 	sltu	$at,$c_1,$t_2
 	$ADDU	$c_2,$at
 	$ST	$c_3,2*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_3,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$at,$t_2,$zero
-	$ADDU	$c_3,$at
-	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
-	$SLL	$t_2,1
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_1,$t_1
-	sltu	$at,$c_1,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_2,$t_2
-	sltu	$at,$c_2,$t_2
-	$ADDU	$c_3,$at
+___
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
+		$a_1,$a_2);		# mul_add_c2(a2[1],b[2],c1,c2,c3);
+	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
+		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
+$code.=<<___;
 	$ST	$c_1,3*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_1,$t_2,$zero
-	$SLL	$t_2,1
-	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_2,$t_1
-	sltu	$at,$c_2,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_3,$t_2
-	sltu	$at,$c_3,$t_2
-	$ADDU	$c_1,$at
+___
+	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
+		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
+$code.=<<___;
 	mflo	$t_1
 	mfhi	$t_2
 	$ADDU	$c_2,$t_1
@ -2540,21 +2200,10 @@ $code.=<<___;
 	sltu	$at,$c_3,$t_2
 	$ADDU	$c_1,$at
 	$ST	$c_2,4*$BNSZ($a0)
-
-	mflo	$t_1
-	mfhi	$t_2
-	slt	$c_2,$t_2,$zero
-	$SLL	$t_2,1
-	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
-	slt	$a2,$t_1,$zero
-	$ADDU	$t_2,$a2
-	$SLL	$t_1,1
-	$ADDU	$c_3,$t_1
-	sltu	$at,$c_3,$t_1
-	$ADDU	$t_2,$at
-	$ADDU	$c_1,$t_2
-	sltu	$at,$c_1,$t_2
-	$ADDU	$c_2,$at
+___
+	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
+		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
+$code.=<<___;
 	$ST	$c_3,5*$BNSZ($a0)

 	mflo	$t_1
--- a/crypto/bn/asm/ppc-mont.pl
+++ b/crypto/bn/asm/ppc-mont.pl
@ -191,7 +191,7 @@ L1st:

 	addi	$j,$j,$BNSZ	; j++
 	addi	$tp,$tp,$BNSZ	; tp++
-	bdnz-	L1st
+	bdnz	L1st
 ;L1st
 	addc	$lo0,$alo,$hi0
 	addze	$hi0,$ahi
@ -253,7 +253,7 @@ Linner:
 	addze	$hi1,$hi1
 	$ST	$lo1,0($tp)	; tp[j-1]
 	addi	$tp,$tp,$BNSZ	; tp++
-	bdnz-	Linner
+	bdnz	Linner
 ;Linner
 	$LD	$tj,$BNSZ($tp)	; tp[j]
 	addc	$lo0,$alo,$hi0
@ -276,7 +276,7 @@ Linner:
 	slwi	$tj,$num,`log($BNSZ)/log(2)`
 	$UCMP	$i,$tj
 	addi	$i,$i,$BNSZ
-	ble-	Louter
+	ble	Louter

 	addi	$num,$num,2	; restore $num
 	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
@ -289,7 +289,7 @@ Lsub:	$LDX	$tj,$tp,$j
 	subfe	$aj,$nj,$tj	; tp[j]-np[j]
 	$STX	$aj,$rp,$j
 	addi	$j,$j,$BNSZ
-	bdnz-	Lsub
+	bdnz	Lsub

 	li	$j,0
 	mtctr	$num
@ -304,7 +304,7 @@ Lcopy:				; copy or in-place refresh
 	$STX	$tj,$rp,$j
 	$STX	$j,$tp,$j	; zap at once
 	addi	$j,$j,$BNSZ
-	bdnz-	Lcopy
+	bdnz	Lcopy

 	$POP	$tj,0($sp)
 	li	r3,1
--- a/crypto/bn/asm/ppc.pl
+++ b/crypto/bn/asm/ppc.pl
@ -952,7 +952,7 @@ $data=<<EOF;
 	addze	r11,r0
 					#mul_add_c(a[3],b[2],c3,c1,c2);
 	$LD	r6,`3*$BNSZ`(r4)
-	$LD	r7,`2*$BNSZ`(r4)
+	$LD	r7,`2*$BNSZ`(r5)
 	$UMULL	r8,r6,r7
 	$UMULH	r9,r6,r7
 	addc	r12,r8,r12
@ -1552,7 +1552,7 @@ Lppcasm_sub_mainloop:
 				# if carry = 1 this is r7-r8. Else it
 				# is r7-r8 -1 as we need.
 	$STU	r6,$BNSZ(r3)
-	bdnz-	Lppcasm_sub_mainloop
+	bdnz	Lppcasm_sub_mainloop
 Lppcasm_sub_adios:	
 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
 	andi.	r3,r3,1         # keep only last bit.
@ -1598,7 +1598,7 @@ Lppcasm_add_mainloop:
 	$LDU	r8,$BNSZ(r5)
 	adde	r8,r7,r8
 	$STU	r8,$BNSZ(r3)
-	bdnz-	Lppcasm_add_mainloop
+	bdnz	Lppcasm_add_mainloop
 Lppcasm_add_adios:	
 	addze	r3,r0			#return carry bit.
 	blr
@ -1755,7 +1755,7 @@ Lppcasm_sqr_mainloop:
 	$UMULH  r8,r6,r6
 	$STU	r7,$BNSZ(r3)
 	$STU	r8,$BNSZ(r3)
-	bdnz-	Lppcasm_sqr_mainloop
+	bdnz	Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:	
 	blr
 	.long	0
@ -1819,7 +1819,7 @@ Lppcasm_mw_LOOP:
 	
 	addi	r3,r3,`4*$BNSZ`
 	addi	r4,r4,`4*$BNSZ`
-	bdnz-	Lppcasm_mw_LOOP
+	bdnz	Lppcasm_mw_LOOP

 Lppcasm_mw_REM:
 	andi.	r5,r5,0x3
--- a/crypto/bn/asm/ppc64-mont.pl
+++ b/crypto/bn/asm/ppc64-mont.pl
@ -561,7 +561,7 @@ $code.=<<___;
 	stfd	$T3b,`$FRAME+56`($sp)
 	 std	$t0,8($tp)		; tp[j-1]
 	 stdu	$t4,16($tp)		; tp[j]
-	bdnz-	L1st
+	bdnz	L1st

 	fctid	$dota,$dota
 	fctid	$dotb,$dotb
@ -856,7 +856,7 @@ $code.=<<___;
 	 addze	$carry,$carry
 	 std	$t3,-16($tp)		; tp[j-1]
 	 std	$t5,-8($tp)		; tp[j]
-	bdnz-	Linner
+	bdnz	Linner

 	fctid	$dota,$dota
 	fctid	$dotb,$dotb
@ -954,7 +954,7 @@ Lsub:	ldx	$t0,$tp,$i
 	stdx	$t0,$rp,$i
 	stdx	$t2,$t6,$i
 	addi	$i,$i,16
-	bdnz-	Lsub
+	bdnz	Lsub

 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
@ -981,7 +981,7 @@ Lcopy:				; copy or in-place refresh
 	stdx	$i,$tp,$i	; zap tp at once
 	stdx	$i,$t4,$i
 	addi	$i,$i,16
-	bdnz-	Lcopy
+	bdnz	Lcopy
 ___
 $code.=<<___ if ($SIZE_T==4);
 	subf	$np,$num,$np	; rewind np
@ -1014,7 +1014,7 @@ Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
 	stw	$t5,8($rp)
 	stw	$t6,12($rp)
 	stwu	$t7,16($rp)
-	bdnz-	Lsub
+	bdnz	Lsub

 	li	$i,0
 	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
@ -1046,7 +1046,7 @@ Lcopy:				; copy or in-place refresh
 	stwu	$t3,16($rp)
 	std	$i,8($tp)	; zap tp at once
 	stdu	$i,16($tp)
-	bdnz-	Lcopy
+	bdnz	Lcopy
 ___

 $code.=<<___;
--- a/crypto/bn/bn_nist.c
+++ b/crypto/bn/bn_nist.c
@ -366,6 +366,10 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
 # endif
 #endif /* BN_BITS2 != 64 */

+#if defined(_TMS320C6X) && defined(NIST_INT64)
+# undef NIST_INT64     /* compiler bug */
+# pragma diag_suppress 177
+#endif

 #define nist_set_192(to, from, a1, a2, a3) \
 	{ \
@ -1047,6 +1051,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	return 1;
 	}

+#ifdef _WIN32_WCE
+/* Workaround for compiler bug under CE */
+#pragma optimize( "", off )
+#endif
+
 #define BN_NIST_521_RSHIFT	(521%BN_BITS2)
 #define BN_NIST_521_LSHIFT	(BN_BITS2-BN_NIST_521_RSHIFT)
 #define BN_NIST_521_TOP_MASK	((BN_ULONG)BN_MASK2>>BN_NIST_521_LSHIFT)
@ -1113,6 +1122,10 @@ int BN_nist_mod_521(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	return 1;
 	}

+#ifdef _WIN32_WCE
+#pragma optimize( "", on )
+#endif
+
 int (*BN_nist_mod_func(const BIGNUM *p))(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, BN_CTX *ctx)
 	{
 	if (BN_ucmp(&_bignum_nist_p_192, p) == 0)
--- a/crypto/c64xcpuid.pl
+++ b/crypto/c64xcpuid.pl
@ -0,0 +1,326 @@
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# consume arguments until one looks like a file name (name.ext) or none are left
+open STDOUT,">$output";	# redirect STDOUT to that file; the result of open is not checked
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000	; older assemblers: force __TI_EABI__ to 0
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__	; EABI build: substitute unprefixed names for the underscore-prefixed labels used below
+	.asg	OPENSSL_rdtsc,_OPENSSL_rdtsc
+	.asg	OPENSSL_cleanse,_OPENSSL_cleanse
+	.asg	CRYPTO_memcmp,_CRYPTO_memcmp
+	.asg	OPENSSL_atomic_add,_OPENSSL_atomic_add
+	.asg	OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+	.asg	OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+	.asg	OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+	.endif
+
+	.asg	B3,RA
+	.asg	0x01AC0000,TIMER_BASE	; Timer 2
+
+	.global	_OPENSSL_rdtsc
+_OPENSSL_rdtsc:	; read the Timer 2 counter word; start the timer if its control word reads zero
+	.asmfunc
+	MVKL	TIMER_BASE,A5	; A5 = TIMER_BASE, low half first
+	MVKH	TIMER_BASE,A5	; then the high half
+	LDW	*A5[0],A2	; load CTL
+	LDW	*A5[2],A4	; load CTN
+	NOP	2
+	.if	.BIG_ENDIAN	; A7:A6 is stored as one doubleword, so the register roles swap with endianness
+	MVK	0x2c0,A7	; internal clock source, don't hold, go
+||	MVK	-1,A6		; maximum period
+	.else
+	MVK	0x2c0,A6	; internal clock source, don't hold, go
+||	MVK	-1,A7		; maximum period
+	.endif
+  [!A2]	STDW	A7:A6,*A5[0]	; fire it up
+||	BNOP	RA,5	; return; A4 holds the CTN word loaded above
+	.endasmfunc
+
+	.global	_OPENSSL_cleanse
+_OPENSSL_cleanse:	; store zeros over B4 bytes starting at address A4
+	.asmfunc
+	ZERO	A3:A2
+||	ZERO	B2
+||	SHRU	B4,3,B0		; is length >= 8
+||	ADD	1,A4,B6	; B6 = A4+1, second pointer so two byte stores can be issued per packet
+  [!B0]	BNOP	RA	; fewer than 8 bytes: return; the predicated byte stores below follow the branch
+|| [B0]	SUB	B0,1,B2
+||	ZERO	A1
+||	ZERO	B1
+   [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	0,B4,A1
+||[!B0]	CMPLT	1,B4,B1
+||	ZERO	B5
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	2,B4,A1
+||[!B0]	CMPLT	3,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	4,B4,A1
+||[!B0]	CMPLT	5,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+||[!B0]	CMPLT	6,B4,A1
+   [A1]	STB	A2,*A4++[2]
+|| [B2]	BDEC	cleanse_loop?,B2
+
+cleanse_loop?:	; bulk loop: one 8-byte store of zeros per pass, B4 reduced by 8
+	STNDW	A3:A2,*A4++
+||	SUB	B4,8,B4
+|| [B2]	BDEC	cleanse_loop?,B2
+
+	MV	B4,B0		; remaining bytes
+||	ADD	1,A4,B6
+||	BNOP	RA	; return; the tail byte stores below follow the branch
+   [B0]	CMPLT	0,B0,A1
+|| [B0]	CMPLT	1,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B0]	CMPLT	2,B0,A1
+|| [B0]	CMPLT	3,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B0]	CMPLT	4,B0,A1
+|| [B0]	CMPLT	5,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B5,*B6++[2]
+|| [B0]	CMPLT	6,B0,A1
+   [A1]	STB	A2,*A4++[2]
+	.endasmfunc
+
+	.if	0
+	.global	_CRYPTO_memcmp
+_CRYPTO_memcmp:	; not assembled (enclosed in .if 0); compares A6 bytes at addresses A4 and B4
+	.asmfunc
+	MV	A6,B0
+  [!B0]	BNOP	RA
+||[!B0]	ZERO	A4	; zero length: result is 0
+|| [B0]	ZERO	A1:A0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+   [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+	XOR	A5,B5,A1
+|| [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+
+memcmp_loop?:
+	OR	A1,A0,A0	; accumulate the XOR of each byte pair into A0
+||	XOR	A5,B5,A1
+|| [B0]	LDBU	*A4++,A5
+|| [B0]	LDBU	*B4++,B5
+|| [B0]	BDEC	memcmp_loop?,B0
+
+	BNOP	RA,3
+	ZERO	A4
+  [A0]	MVK	1,A4	; result is 1 if any byte pair differed, otherwise 0
+	.endasmfunc
+	.endif
+
+	.global	_OPENSSL_atomic_add
+_OPENSSL_atomic_add:	; add B4 to the word at address A4; the new value is also left in A4
+	.asmfunc
+	BNOP	atomic_store?	; pre-C64x+ systems are uni-processor, it's
+||	LDW	*A4,B5		; enough to hold interrupts off through
+				; the load-update-store cycle to achieve
+				; atomicity
+	NOP
+	BNOP	RA,3		; and this branch stretches even over store
+	ADD	B4,B5,B5
+atomic_store?:
+	STW	B5,*A4
+||	MV	B5,A4
+	.endasmfunc
+
+	.global	_OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu:	; zero A0-A9, A16-A31, B0-B2, B4-B9 and B16-B31, then return
+	.asmfunc
+	ZERO	A0
+||	ZERO	B0
+||	ZERO	A1
+||	ZERO	B1
+	ZERO	A3:A2
+||	MVD	B0,B2	; B2 takes the zero written to B0 in the previous packet
+||	ZERO	A4
+||	ZERO	B4
+||	ZERO	A5
+||	ZERO	B5
+||	BNOP	RA	; return; the five packets below follow the branch
+	ZERO	A7:A6
+||	ZERO	B7:B6
+||	ZERO	A8
+||	ZERO	B8
+||	ZERO	A9
+||	ZERO	B9
+	ZERO	A17:A16
+||	ZERO	B17:B16
+||	ZERO	A18
+||	ZERO	B18
+||	ZERO	A19
+||	ZERO	B19
+	ZERO	A21:A20
+||	ZERO	B21:B20
+||	ZERO	A22
+||	ZERO	B22
+||	ZERO	A23
+||	ZERO	B23
+	ZERO	A25:A24
+||	ZERO	B25:B24
+||	ZERO	A26
+||	ZERO	B26
+||	ZERO	A27
+||	ZERO	B27
+	ZERO	A29:A28
+||	ZERO	B29:B28
+||	ZERO	A30
+||	ZERO	B30
+||	ZERO	A31
+||	ZERO	B31
+	.endasmfunc
+
+CLFLUSH	.macro	CONTROL,ADDR,LEN
+	B	passthrough?	; macro purpose: store ADDR and LEN to CONTROL[0] and CONTROL[1], then poll CONTROL[1] until it reads zero
+||	STW	ADDR,*CONTROL[0]
+	STW	LEN,*CONTROL[1]
+spinlock?:
+	LDW	*CONTROL[1],A0	; poll the second control word; A0 is clobbered
+	NOP	3
+passthrough?:
+	NOP
+  [A0]	BNOP	spinlock?,5	; keep polling while the value read is non-zero
+	.endm
+
+	.global	_OPENSSL_instrument_bus
+_OPENSSL_instrument_bus:	; add timer-tick deltas, measured around cache-line flushes, into consecutive output words
+	.asmfunc
+	MV	B4,B0			; reassign sizeof(output)
+||	MV	A4,B4			; reassign output
+||	MVK	0x00004030,A3
+||	MVKL	TIMER_BASE,B16
+	MV	B0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+||	MVKH	TIMER_BASE,B16
+	LDW	*B16[2],B8		; collect 1st tick
+||	MVK	0x00004010,A5
+	NOP	4
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4
+bus_loop1?:	; one pass per output word; B0 counts the passes down
+	LDW	*B16[2],B8
+|| [B0]	SUB	B0,1,B0
+	NOP	4
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||	ADDK	4,B4
+|| [B0]	BNOP	bus_loop1?,5
+
+	BNOP	RA,5	; return; A4 still holds the count copied from B0 at entry
+	.endasmfunc
+
+	.global	_OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2:	; like OPENSSL_instrument_bus, but moves to the next output word only when the tick delta changes
+	.asmfunc
+	MV	A6,B0			; reassign max
+||	MV	B4,A6			; reassign sizeof(output)
+||	MVK	0x00004030,A3
+||	MVKL	TIMER_BASE,B16
+	MV	A4,B4			; reassign output
+||	MVK	0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+||	MVKH	TIMER_BASE,B16
+
+	LDW	*B16[2],B8		; collect 1st tick
+||	MVK	0x00004010,A5
+	NOP	4
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4
+
+	LDW	*B16[2],B8		; collect 1st diff
+	NOP	4
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+||	SUB	B0,1,B0
+bus_loop2?:
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LDW	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||[!B0]	BNOP	bus_loop2_done?,2	; stop once the max counter in B0 has run out
+||	SUB	B0,1,B0
+	LDW	*B16[2],B8
+	NOP	4
+	SUB	B8,B9,B8
+||	MV	B8,B9
+	CMPEQ	B8,B7,B2	; B2 = 1 when the new delta equals the previous one
+||	MV	B8,B7
+  [!B2]	ADDAW	B4,1,B4	; delta changed: advance to the next output word
+||[!B2]	ADDK	1,A4	; and count it in the return value
+	CMPEQ	A4,A6,A2	; stop when the count reaches the value passed as sizeof(output)
+  [!A2]	BNOP	bus_loop2?,5
+
+bus_loop2_done?:
+	BNOP	RA,5
+	.endasmfunc
+
+	.if	__TI_EABI__	; choose the section that receives the entry below, depending on the ABI
+	.sect	".init_array"
+	.else
+	.sect	".pinit"
+	.endif
+	.align	4
+	.long	_OPENSSL_rdtsc		; auto-start timer
+___
+
+print $code;	# write the accumulated assembly text to STDOUT
+close STDOUT;	# flush the output; the result of close is not checked
--- a/crypto/c64xpluscpuid.pl
+++ b/crypto/c64xpluscpuid.pl
@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+#
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# consume arguments until one ends in a name.ext-like file name (or none are left)
+open STDOUT,">$output";	# redirect STDOUT to that file; the result of open is not checked
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+
+	.global	_OPENSSL_rdtsc
+_OPENSSL_rdtsc:
+	.asmfunc
+	B	RA
+	MVC	TSCL,B0
+	MVC	TSCH,B1
+  [!B0]	MVC	B0,TSCL		; start TSC
+	MV	B0,A4
+	MV	B1,A5
+	.endasmfunc
+
+	.global	_OPENSSL_cleanse
+_OPENSSL_cleanse:
+	.asmfunc
+	ZERO	A3:A2
+||	ZERO	B2
+||	SHRU	B4,3,B0		; is length >= 8
+||	ADD	1,A4,B6
+  [!B0]	BNOP	RA
+||	ZERO	A1
+||	ZERO	B1
+   [B0]	MVC	B0,ILC
+||[!B0]	CMPLT	0,B4,A1
+||[!B0]	CMPLT	1,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+||[!B0]	CMPLT	2,B4,A1
+||[!B0]	CMPLT	3,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+||[!B0]	CMPLT	4,B4,A1
+||[!B0]	CMPLT	5,B4,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+||[!B0]	CMPLT	6,B4,A1
+   [A1]	STB	A2,*A4++[2]
+
+	SPLOOP	1
+	STNDW	A3:A2,*A4++
+||	SUB	B4,8,B4
+	SPKERNEL
+
+	MV	B4,B0		; remaining bytes
+||	ADD	1,A4,B6
+||	BNOP	RA
+   [B0]	CMPLT	0,B0,A1
+|| [B0]	CMPLT	1,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+|| [B0]	CMPLT	2,B0,A1
+|| [B0]	CMPLT	3,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+|| [B0]	CMPLT	4,B0,A1
+|| [B0]	CMPLT	5,B0,B1
+   [A1]	STB	A2,*A4++[2]
+|| [B1] STB	B2,*B6++[2]
+|| [B0]	CMPLT	6,B0,A1
+   [A1]	STB	A2,*A4++[2]
+	.endasmfunc
+
+	.global	_OPENSSL_atomic_add
+_OPENSSL_atomic_add:
+	.asmfunc
+	MV	A4,B0
+atomic_add?:
+	LL	*B0,B5
+	NOP	4
+	ADD	B4,B5,B5
+	SL	B5,*B0
+	CMTL	*B0,B1
+	NOP	4
+  [!B1]	B	atomic_add?
+   [B1]	BNOP	RA,4
+	MV	B5,A4
+	.endasmfunc
+
+	.global	_OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu:
+	.asmfunc
+	ZERO	A0
+||	ZERO	B0
+||	ZERO	A1
+||	ZERO	B1
+	ZERO	A3:A2
+||	MVD	B0,B2
+||	ZERO	A4
+||	ZERO	B4
+||	ZERO	A5
+||	ZERO	B5
+||	BNOP	RA
+	ZERO	A7:A6
+||	ZERO	B7:B6
+||	ZERO	A8
+||	ZERO	B8
+||	ZERO	A9
+||	ZERO	B9
+	ZERO	A17:A16
+||	ZERO	B17:B16
+||	ZERO	A18
+||	ZERO	B18
+||	ZERO	A19
+||	ZERO	B19
+	ZERO	A21:A20
+||	ZERO	B21:B20
+||	ZERO	A22
+||	ZERO	B22
+||	ZERO	A23
+||	ZERO	B23
+	ZERO	A25:A24
+||	ZERO	B25:B24
+||	ZERO	A26
+||	ZERO	B26
+||	ZERO	A27
+||	ZERO	B27
+	ZERO	A29:A28
+||	ZERO	B29:B28
+||	ZERO	A30
+||	ZERO	B30
+||	ZERO	A31
+||	ZERO	B31
+	.endasmfunc
+
+CLFLUSH	.macro	CONTROL,ADDR,LEN
+	B	passthrough?
+||	STW	ADDR,*CONTROL[0]
+	STW	LEN,*CONTROL[1]
+spinlock?:
+	LDW	*CONTROL[1],A0
+	NOP	3
+passthrough?:
+	NOP
+  [A0]	BNOP	spinlock?,5
+	.endm
+
+	.global	_OPENSSL_instrument_bus
+_OPENSSL_instrument_bus:
+	.asmfunc
+	MV	B4,B0			; reassign sizeof(output)
+||	MV	A4,B4			; reassign output
+||	MVK	0x00004030,A3
+	MV	B0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+	MVC	TSCL,B8			; collect 1st tick
+||	MVK	0x00004010,A5
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	NOP	4
+	STW	B5,*B4
+bus_loop1?:
+	MVC	TSCL,B8
+|| [B0]	SUB	B0,1,B0
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||	ADDK	4,B4
+|| [B0]	BNOP	bus_loop1?,5
+
+	BNOP	RA,5
+	.endasmfunc
+
+	.global	_OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2:
+	.asmfunc
+	MV	A6,B0			; reassign max
+||	MV	B4,A6			; reassing sizeof(output)
+||	MVK	0x00004030,A3
+	MV	A4,B4			; reassign output
+||	MVK	0,A4			; return value
+||	MVK	1,A1
+||	MVKH	0x01840000,A3		; L1DWIBAR
+
+	MVC	TSCL,B8			; collect 1st tick
+||	MVK	0x00004010,A5
+	MV	B8,B9			; lasttick = tick
+||	MVK	0,B7			; lastdiff = 0
+||	MVKH	0x01840000,A5		; L2WIBAR
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	NOP	4
+	STW	B5,*B4
+
+	MVC	TSCL,B8			; collect 1st diff
+	SUB	B8,B9,B7		; lastdiff = tick - lasttick
+||	MV	B8,B9			; lasttick = tick
+||	SUB	B0,1,B0
+bus_loop2?:
+	CLFLUSH	A3,B4,A1		; write-back and invalidate L1D line
+	CLFLUSH	A5,B4,A1		; write-back and invalidate L2 line
+	LL	*B4,B5
+	NOP	4
+	ADD	B7,B5,B5
+	SL	B5,*B4
+	CMTL	*B4,B1
+	STW	B5,*B4			; [!B1] is removed to flatten samples
+||[!B0]	BNOP	bus_loop2_done?,2
+||	SUB	B0,1,B0
+	MVC	TSCL,B8
+	SUB	B8,B9,B8
+||	MV	B8,B9
+	CMPEQ	B8,B7,B2
+||	MV	B8,B7
+  [!B2]	ADDAW	B4,1,B4
+||[!B2]	ADDK	1,A4
+	CMPEQ	A4,A6,A2
+  [!A2]	BNOP	bus_loop2?,5
+
+bus_loop2_done?:
+	BNOP	RA,5
+	.endasmfunc
+___
+
+print $code;
+close STDOUT;
--- a/crypto/cmac/cmac.c
+++ b/crypto/cmac/cmac.c
@ -143,7 +143,8 @@ int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
 int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
 			const EVP_CIPHER *cipher, ENGINE *impl)
 	{
-	static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+	__fips_constseg
+	static const unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH] = {0};
 	/* All zeros means restart */
 	if (!key && !cipher && !impl && keylen == 0)
 		{
--- a/crypto/cryptlib.c
+++ b/crypto/cryptlib.c
@ -359,7 +359,15 @@ void OPENSSL_showfatal (const char *fmta,...)
 { va_list ap;

    va_start (ap,fmta);
+#if defined(OPENSSL_SYS_VXWORKS)
+    {
+	char buf[256];
+	vsnprintf(buf,sizeof(buf),fmta,ap);
+	printf("%s",buf);
+    }
+#else 
    vfprintf (stderr,fmta,ap);
+#endif
    va_end (ap);
 }
 int OPENSSL_isservice (void) { return 0; }
@ -374,7 +382,9 @@ void OpenSSLDie(const char *file,int line,const char *assertion)
 	abort();
 #else
 	/* Win32 abort() customarily shows a dialog, but we just did that... */
+#ifdef SIGABRT
 	raise(SIGABRT);
+#endif
 	_exit(3);
 #endif
 	}
--- a/crypto/des/spr.h
+++ b/crypto/des/spr.h
@ -56,6 +56,9 @@
 * [including the GNU Public Licence.]
 */

+#ifdef _TMS320C6X
+#  pragma DATA_SECTION(DES_SPtrans,".const:des_sptrans")
+#endif
 __fips_constseg
 OPENSSL_GLOBAL const DES_LONG DES_SPtrans[8][64]={
 {
--- a/crypto/dsa/dsa.h
+++ b/crypto/dsa/dsa.h
@ -215,6 +215,11 @@ DSA_SIG * FIPS_dsa_sign_ctx(DSA *dsa, EVP_MD_CTX *ctx);
 int FIPS_dsa_verify_digest(DSA *dsa,
 				const unsigned char *dig, int dlen, DSA_SIG *s);
 int FIPS_dsa_verify_ctx(DSA *dsa, EVP_MD_CTX *ctx, DSA_SIG *s);
+int FIPS_dsa_verify(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, DSA_SIG *s);
+DSA_SIG * FIPS_dsa_sign(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash);
+
 #endif

 DSA *	DSA_new(void);
--- a/crypto/dsa/dsa_gen.c
+++ b/crypto/dsa/dsa_gen.c
@ -666,7 +666,13 @@ int dsa_builtin_paramgen2(DSA *ret, size_t L, size_t N,
 			/* "offset = offset + n + 1" */

 			/* step 14 */
-			if (counter >= 4096) break;
+			if (counter >= (int)(4 * L)) break;
+			}
+		if (seed_in)
+			{
+			ok = 0;
+			DSAerr(DSA_F_DSA_BUILTIN_PARAMGEN2, DSA_R_INVALID_PARAMETERS);
+			goto err;
 			}
 		}
 end:
--- a/crypto/ec/ec2_smpl.c
+++ b/crypto/ec/ec2_smpl.c
@ -556,7 +556,7 @@ int ec_GF2m_simple_is_on_curve(const EC_GROUP *group, const EC_POINT *point, BN_
 	field_sqr = group->meth->field_sqr;	

 	/* only support affine coordinates */
-	if (!point->Z_is_one) goto err;
+	if (!point->Z_is_one) return -1;

 	if (ctx == NULL)
 		{
--- a/crypto/ec/ec_key.c
+++ b/crypto/ec/ec_key.c
@ -511,10 +511,12 @@ int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
 								tx, ty, ctx))
 			goto err;
 		}
-	/* Check if retrieved coordinates match originals: if not values
-	 * are out of range.
+	/* Check if retrieved coordinates match originals and are less than
+	 * field order: if not values are out of range.
 	 */
-	if (BN_cmp(x, tx) || BN_cmp(y, ty))
+	if (BN_cmp(x, tx) || BN_cmp(y, ty)
+		|| (BN_cmp(x, &key->group->field) >= 0)
+		|| (BN_cmp(y, &key->group->field) >= 0))
 		{
 		ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
 			EC_R_COORDINATES_OUT_OF_RANGE);
--- a/crypto/ecdh/ecdh.h
+++ b/crypto/ecdh/ecdh.h
@ -85,6 +85,8 @@
 extern "C" {
 #endif

+#define EC_FLAG_COFACTOR_ECDH	0x1000
+
 const ECDH_METHOD *ECDH_OpenSSL(void);

 void	  ECDH_set_default_method(const ECDH_METHOD *);
--- a/crypto/ecdh/ech_ossl.c
+++ b/crypto/ecdh/ech_ossl.c
@ -146,6 +146,18 @@ static int ecdh_compute_key(void *out, size_t outlen, const EC_POINT *pub_key,
 		}

 	group = EC_KEY_get0_group(ecdh);
+
+	if (EC_KEY_get_flags(ecdh) & EC_FLAG_COFACTOR_ECDH)
+		{
+		if (!EC_GROUP_get_cofactor(group, x, ctx) ||
+			!BN_mul(x, x, priv_key, ctx))
+			{
+			ECDHerr(ECDH_F_ECDH_COMPUTE_KEY, ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+		priv_key = x;
+		}
+
 	if ((tmp=EC_POINT_new(group)) == NULL)
 		{
 		ECDHerr(ECDH_F_ECDH_COMPUTE_KEY,ERR_R_MALLOC_FAILURE);
--- a/crypto/ecdsa/ecdsa.h
+++ b/crypto/ecdsa/ecdsa.h
@ -236,6 +236,11 @@ ECDSA_SIG * FIPS_ecdsa_sign_ctx(EC_KEY *key, EVP_MD_CTX *ctx);
 int FIPS_ecdsa_verify_digest(EC_KEY *key,
 			const unsigned char *dig, int dlen, ECDSA_SIG *s);
 int FIPS_ecdsa_verify_ctx(EC_KEY *key, EVP_MD_CTX *ctx, ECDSA_SIG *s);
+int FIPS_ecdsa_verify(EC_KEY *key, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, ECDSA_SIG *s);
+ECDSA_SIG * FIPS_ecdsa_sign(EC_KEY *key,
+			const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash);
 #endif


--- a/crypto/evp/e_aes.c
+++ b/crypto/evp/e_aes.c
@ -89,6 +89,10 @@ typedef struct
 	{
 	AES_KEY ks1, ks2;	/* AES key schedules to use */
 	XTS128_CONTEXT xts;
+	void     (*stream)(const unsigned char *in,
+			unsigned char *out, size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
 	} EVP_AES_XTS_CTX;

 typedef struct
@ -123,6 +127,9 @@ void vpaes_cbc_encrypt(const unsigned char *in,
 			unsigned char *ivec, int enc);
 #endif
 #ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+			size_t length, const AES_KEY *key,
+			unsigned char ivec[16], int enc);
 void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
 			size_t len, const AES_KEY *key,
 			const unsigned char ivec[16]);
@ -133,6 +140,19 @@ void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
 			const unsigned char ivec[AES_BLOCK_SIZE]);
 #endif

+#if defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+extern int OPENSSL_ppccap_P;
+# define HWAES_CAPABLE  (OPENSSL_ppccap_P & (1<<2))
+# define HWAES_set_encrypt_key aes_p8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_p8_set_decrypt_key
+# define HWAES_encrypt aes_p8_encrypt
+# define HWAES_decrypt aes_p8_decrypt
+# define HWAES_cbc_encrypt aes_p8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
+# define HWAES_xts_encrypt aes_p8_xts_encrypt
+# define HWAES_xts_decrypt aes_p8_xts_decrypt
+#endif
+
 #if	defined(AES_ASM) && !defined(I386_ONLY) &&	(  \
 	((defined(__i386)	|| defined(__i386__)	|| \
 	  defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
@ -337,11 +357,13 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 			{
 			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 			xctx->xts.block1 = (block128_f)aesni_encrypt;
+			xctx->stream = aesni_xts_encrypt;
 			}
 		else
 			{
 			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
 			xctx->xts.block1 = (block128_f)aesni_decrypt;
+			xctx->stream = aesni_xts_decrypt;
 			}

 		aesni_set_encrypt_key(key + ctx->key_len/2,
@ -360,32 +382,9 @@ static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	return 1;
 	}

+#define aesni_xts_cipher aes_xts_cipher
 static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
-		const unsigned char *in, size_t len)
-	{
-	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
-	if (!xctx->xts.key1 || !xctx->xts.key2)
-		return -1;
-	if (!out || !in)
-		return -1;
-#ifdef OPENSSL_FIPS
-	/* Requirement of SP800-38E */
-	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
-			(len > (1L<<20)*16))
-		{
-		EVPerr(EVP_F_AESNI_XTS_CIPHER, EVP_R_TOO_LARGE);
-		return -1;
-		}
-#endif
-	if (ctx->encrypt)
-		aesni_xts_encrypt(in, out, len,
-			xctx->xts.key1, xctx->xts.key2, ctx->iv);
-	else
-		aesni_xts_decrypt(in, out, len,
-			xctx->xts.key1, xctx->xts.key2, ctx->iv);
-
-	return len;
-	}
+		const unsigned char *in, size_t len);

 static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                        const unsigned char *iv, int enc)
@ -485,6 +484,42 @@ const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
 { return &aes_##keylen##_##mode; }
 #endif

+#if defined(OPENSSL_CPUID_OBJ) && defined(__aarch64__)
+#include "arm_arch.h"
+#if __ARM_ARCH__>=7
+# define HWAES_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
+# define HWAES_set_encrypt_key aes_v8_set_encrypt_key
+# define HWAES_set_decrypt_key aes_v8_set_decrypt_key
+# define HWAES_encrypt aes_v8_encrypt
+# define HWAES_decrypt aes_v8_decrypt
+# define HWAES_cbc_encrypt aes_v8_cbc_encrypt
+# define HWAES_ctr32_encrypt_blocks aes_v8_ctr32_encrypt_blocks
+#endif
+#endif
+
+#if defined(HWAES_CAPABLE)
+int HWAES_set_encrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+int HWAES_set_decrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+void HWAES_encrypt(const unsigned char *in, unsigned char *out,
+	const AES_KEY *key);
+void HWAES_decrypt(const unsigned char *in, unsigned char *out,
+	const AES_KEY *key);
+void HWAES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+	size_t length, const AES_KEY *key,
+	unsigned char *ivec, const int enc);
+void HWAES_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+	size_t len, const AES_KEY *key, const unsigned char ivec[16]);
+void HWAES_xts_encrypt(const unsigned char *inp, unsigned char *out,
+	size_t len, const AES_KEY *key1,
+	const AES_KEY *key2, const unsigned char iv[16]);
+void HWAES_xts_decrypt(const unsigned char *inp, unsigned char *out,
+	size_t len, const AES_KEY *key1,
+	const AES_KEY *key2, const unsigned char iv[16]);
+
+#endif
+
 #define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\
 	BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
 	BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
@ -503,6 +538,28 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	mode = ctx->cipher->flags & EVP_CIPH_MODE;
 	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
 	    && !enc)
+#ifdef HWAES_CAPABLE
+	    if (HWAES_CAPABLE)
+		{
+		ret = HWAES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block      = (block128_f)HWAES_decrypt;
+		dat->stream.cbc = NULL;
+#ifdef HWAES_cbc_encrypt
+		if (mode==EVP_CIPH_CBC_MODE)
+		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+#endif
+		}
+	    else
+#endif
+#ifdef BSAES_CAPABLE
+	    if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+		{
+		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_decrypt;
+		dat->stream.cbc	= (cbc128_f)bsaes_cbc_encrypt;
+		}
+	    else
+#endif
 #ifdef VPAES_CAPABLE
 	    if (VPAES_CAPABLE)
 		{
@ -522,6 +579,26 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 					NULL;
 		}
 	else
+#ifdef HWAES_CAPABLE
+	    if (HWAES_CAPABLE)
+		{
+		ret = HWAES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block      = (block128_f)HWAES_encrypt;
+		dat->stream.cbc = NULL;
+#ifdef HWAES_cbc_encrypt
+		if (mode==EVP_CIPH_CBC_MODE)
+		    dat->stream.cbc = (cbc128_f)HWAES_cbc_encrypt;
+		else
+#endif
+#ifdef HWAES_ctr32_encrypt_blocks
+		if (mode==EVP_CIPH_CTR_MODE)
+		    dat->stream.ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
+		else
+#endif
+		(void)0;	/* terminate potentially open 'else' */
+		}
+	    else
+#endif
 #ifdef BSAES_CAPABLE
 	    if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
 		{
@ -814,6 +891,21 @@ static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		return 1;
 	if (key)
 		{ do {
+#ifdef HWAES_CAPABLE
+		if (HWAES_CAPABLE)
+			{
+			HWAES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+					(block128_f)HWAES_encrypt);
+#ifdef HWAES_ctr32_encrypt_blocks
+			gctx->ctr = (ctr128_f)HWAES_ctr32_encrypt_blocks;
+#else
+			gctx->ctr = NULL;
+#endif
+			break;
+			}
+		else
+#endif
 #ifdef BSAES_CAPABLE
 		if (BSAES_CAPABLE)
 			{
@ -961,8 +1053,6 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,

 	if (!gctx->iv_set)
 		return -1;
-	if (!ctx->encrypt && gctx->taglen < 0)
-		return -1;
 	if (in)
 		{
 		if (out == NULL)
@ -1004,6 +1094,8 @@ static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 		{
 		if (!ctx->encrypt)
 			{
+			if (gctx->taglen < 0)
+				return -1;
 			if (CRYPTO_gcm128_finish(&gctx->gcm,
 					ctx->buf, gctx->taglen) != 0)
 				return -1;
@ -1050,7 +1142,37 @@ static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,

 	if (key) do
 		{
+		xctx->stream = NULL;
 		/* key_len is two AES keys */
+#ifdef HWAES_CAPABLE
+		if (HWAES_CAPABLE)
+			{
+			if (enc)
+			    {
+			    HWAES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			    xctx->xts.block1 = (block128_f)HWAES_encrypt;
+#ifdef HWAES_xts_encrypt
+			    xctx->stream = HWAES_xts_encrypt;
+#endif
+			    }
+			else
+			    {
+			    HWAES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			    xctx->xts.block1 = (block128_f)HWAES_decrypt;
+#ifdef HWAES_xts_decrypt
+			    xctx->stream = HWAES_xts_decrypt;
+#endif
+			    }
+
+			HWAES_set_encrypt_key(key + ctx->key_len/2,
+						    ctx->key_len * 4, &xctx->ks2);
+			xctx->xts.block2 = (block128_f)HWAES_encrypt;
+
+			xctx->xts.key1 = &xctx->ks1;
+			break;
+			}
+		else
+#endif
 #ifdef VPAES_CAPABLE
 		if (VPAES_CAPABLE)
 		    {
@ -1105,22 +1227,25 @@ static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 	{
 	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
 	if (!xctx->xts.key1 || !xctx->xts.key2)
-		return -1;
+		return 0;
 	if (!out || !in)
-		return -1;
+		return 0;
 #ifdef OPENSSL_FIPS
 	/* Requirement of SP800-38E */
 	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
-			(len > (1L<<20)*16))
+			(len > (1UL<<20)*16))
 		{
 		EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
-		return -1;
+		return 0;
 		}
 #endif
-	if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+	if (xctx->stream)
+		(*xctx->stream)(in, out, len,
+				xctx->xts.key1, xctx->xts.key2, ctx->iv);
+	else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
 								ctx->encrypt))
-		return -1;
-	return len;
+		return 0;
+	return 1;
 	}

 #define aes_xts_cleanup NULL
@ -1190,6 +1315,19 @@ static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		return 1;
 	if (key) do
 		{
+#ifdef HWAES_CAPABLE
+		if (HWAES_CAPABLE)
+			{
+			HWAES_set_encrypt_key(key,ctx->key_len*8,&cctx->ks);
+
+			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)HWAES_encrypt);
+			cctx->str = NULL;
+			cctx->key_set = 1;
+			break;
+			}
+		else
+#endif
 #ifdef VPAES_CAPABLE
 		if (VPAES_CAPABLE)
 			{
--- a/crypto/evp/evp_locl.h
+++ b/crypto/evp/evp_locl.h
@ -75,7 +75,7 @@ static int cname##_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const uns
 	return 1;\
 }

-#define EVP_MAXCHUNK ((size_t)1<<(sizeof(long)*8-2))
+#define EVP_MAXCHUNK ((size_t)1<<(sizeof(int)*8-2))

 #define BLOCK_CIPHER_func_ofb(cname, cprefix, cbits, kstruct, ksched) \
 static int cname##_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t inl) \
--- a/crypto/modes/Makefile
+++ b/crypto/modes/Makefile
@ -56,11 +56,16 @@ ghash-alpha.s:	asm/ghash-alpha.pl
 	$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
 ghash-parisc.s:	asm/ghash-parisc.pl
 	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+ghashv8-armx.S:	asm/ghashv8-armx.pl
+	$(PERL) asm/ghashv8-armx.pl $(PERLASM_SCHEME) $@
+ghashp8-ppc.s:	asm/ghashp8-ppc.pl
+	$(PERL) asm/ghashp8-ppc.pl $(PERLASM_SCHEME) $@

 # GNU make "catch all"
 ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@

 ghash-armv4.o:	ghash-armv4.S
+ghashv8-armx.o:	ghashv8-armx.S

 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
--- a/crypto/modes/asm/ghash-armv4.pl
+++ b/crypto/modes/asm/ghash-armv4.pl
@ -57,8 +57,20 @@
 # *native* byte order on current platform. See gcm128.c for working
 # example...

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}

 $Xi="r0";	# argument block
 $Htbl="r1";
@ -112,6 +124,11 @@ $code=<<___;
 .text
 .code	32

+#ifdef  __APPLE__
+#define ldrplb	ldrbpl
+#define ldrneb	ldrbne
+#endif
+
 .type	rem_4bit,%object
 .align	5
 rem_4bit:
@ -326,9 +343,9 @@ $code.=<<___;
 .align	4
 gcm_gmult_neon:
 	sub		$Htbl,#16		@ point at H in GCM128_CTX
-	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+	vld1.64		`&Dhi("$IN")`,[$Xi]!	@ load Xi
 	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
+	vld1.64		`&Dlo("$IN")`,[$Xi]!
 	vshr.u64	$mod,#32
 	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
 	veor		$zero,$zero
@ -349,9 +366,9 @@ gcm_gmult_neon:
 .type	gcm_ghash_neon,%function
 .align	4
 gcm_ghash_neon:
-	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
+	vld1.64		`&Dhi("$Z")`,[$Xi]!	@ load Xi
 	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
+	vld1.64		`&Dlo("$Z")`,[$Xi]!
 	vshr.u64	$mod,#32
 	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
 	veor		$zero,$zero
@ -410,8 +427,8 @@ gcm_ghash_neon:
 	vrev64.8	$Z,$Z
 #endif
 	sub		$Xi,#16	
-	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
-	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+	vst1.64		`&Dhi("$Z")`,[$Xi]!	@ write out Xi
+	vst1.64		`&Dlo("$Z")`,[$Xi]

 	bx	lr
 .size	gcm_ghash_neon,.-gcm_ghash_neon
--- a/crypto/modes/asm/ghash-c64xplus.pl
+++ b/crypto/modes/asm/ghash-c64xplus.pl
@ -0,0 +1,231 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements GCM GHASH function and underlying single
+# multiplication operation in GF(2^128). Even though subroutines
+# have _4bit suffix, they are not using any tables, but rely on
+# hardware Galois Field Multiply support. Streamed GHASH processes
+# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
+# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
+# comparing apples vs. oranges, but compiler surely could have done
+# better, because theoretical [though not necessarily achievable]
+# estimate for "4-bit" table-driven implementation is ~12 cycles.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# consume arguments until one ends in a name.ext-like file name (or none are left)
+open STDOUT,">$output";	# redirect STDOUT to that file; the result of open is not checked
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");	# arguments
+
+($Z0,$Z1,$Z2,$Z3,	$H0, $H1, $H2, $H3,
+			$H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u,	$H0y,$H1y,$H2y,$H3y,
+			$H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9));	# $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5");		# $rem zaps $Htable
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+
+	.if	0
+	.global	_gcm_gmult_1bit
+_gcm_gmult_1bit:
+	ADDAD	$Htable,2,$Htable
+	.endif
+	.global	_gcm_gmult_4bit
+_gcm_gmult_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+||	LDBU	*++${xip}[15],$x1	; Xi[15]
+	MVK	0xFF,$FF000000
+||	LDBU	*--${xip},$x0		; Xi[14]
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	BNOP	ghash_loop?
+||	MVK	1,B0			; take a single spin
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+||	ZERO	$Z1:$Z0
+	SHRU2	$xia,8,$H01u
+||	ZERO	$Z3:$Z2
+	.endasmfunc
+
+	.global	_gcm_ghash_4bit
+_gcm_ghash_4bit:
+	.asmfunc
+	LDDW	*${Htable}[-1],$H1:$H0	; H.lo
+||	SHRU	$len,4,B0		; reassign len
+	LDDW	*${Htable}[-2],$H3:$H2	; H.hi
+||	MV	$Xip,${xip}		; reassign Xi
+||	MVK	15,B1			; SPLOOPD constant
+
+	MVK	0xE1,$E10000
+|| [B0]	LDNDW	*${inp}[1],$H1x:$H0x
+	MVK	0xFF,$FF000000
+|| [B0]	LDNDW	*${inp}++[2],$H3x:$H2x
+	SHL	$E10000,16,$E10000	; [pre-shifted] reduction polynomial
+||	LDDW	*${xip}[1],$Z1:$Z0
+	SHL	$FF000000,24,$FF000000	; upper byte mask
+||	LDDW	*${xip}[0],$Z3:$Z2
+
+	PACKH2	$H0,$H1,$xia		; pack H0' and H1's upper bytes
+	AND	$H2,$FF000000,$H2u	; H2's upper byte
+	AND	$H3,$FF000000,$H3u	; H3's upper byte
+||	SHRU	$H2u,8,$H2u
+	SHRU	$H3u,8,$H3u
+	SHRU2	$xia,8,$H01u
+
+|| [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+	.if	.LITTLE_ENDIAN
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+
+ghash_loop?:
+	SPLOOPD	6			; 6*16+7
+||	MVC	B1,ILC
+|| [B0]	SUB	B0,1,B0
+||	ZERO	A0
+||	ADD	$x1,$x1,$xib		; SHL	$x1,1,$xib
+||	SHL	$x1,1,$xia
+___
+
+########____________________________
+#  0    D2.     M1          M2      |
+#  1            M1                  |
+#  2            M1          M2      |
+#  3        D1. M1          M2      |
+#  4        S1. L1                  |
+#  5    S2  S1x L1          D2  L2  |____________________________
+#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
+#  7/1          L1  S1  D1x S2  M2  |        M1                  |
+#  8/2              S1  L1x S2      |        M1          M2      |
+#  9/3              S1  L1x         |    D1. M1          M2      |
+# 10/4                  D1x         |    S1. L1                  |
+# 11/5                              |S2  S1x L1          D2  L2  |____________
+# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
+#    7/1                                     L1  S1  D1x S2  M2  |        ....
+#    8/2                                         S1  L1x S2      |        ....
+#####...                                         ................|............
+$code.=<<___;
+	XORMPY	$H0,$xia,$H0x		; 0	; H·Xi[i]
+||	XORMPY	$H01u,$xib,$H01y
+|| [A0]	LDBU	*--${xip},$x0
+	XORMPY	$H1,$xia,$H1x		; 1
+	XORMPY	$H2,$xia,$H2x		; 2
+||	XORMPY	$H2u,$xib,$H2y
+	XORMPY	$H3,$xia,$H3x		; 3
+||	XORMPY	$H3u,$xib,$H3y
+||[!A0]	MVK.D	15,A0				; *--${xip} counter
+	XOR.L	$H0x,$Z0,$Z0		; 4	; Z^=H·Xi[i]
+|| [A0]	SUB.S	A0,1,A0
+	XOR.L	$H1x,$Z1,$Z1		; 5
+||	AND.D	$H01y,$FF000000,$H0z
+||	SWAP2.L	$H01y,$H1y		;	; SHL	$H01y,16,$H1y
+||	SHL	$x0,1,$xib
+||	SHL	$x0,1,$xia
+
+	XOR.L	$H2x,$Z2,$Z2		; 6/0	; [0,0] in epilogue
+||	SHL	$Z0,1,$rem		;	; rem=Z<<1
+||	SHRMB.S	$Z1,$Z0,$Z0		;	; Z>>=8
+||	AND.L	$H1y,$FF000000,$H1z
+	XOR.L	$H3x,$Z3,$Z3		; 7/1
+||	SHRMB.S	$Z2,$Z1,$Z1
+||	XOR.D	$H0z,$Z0,$Z0			; merge upper byte products
+||	AND.S	$H2y,$FF000000,$H2z
+||	XORMPY	$E10000,$rem,$res	;	; implicit rem&0x1FE
+	XOR.L	$H1z,$Z1,$Z1		; 8/2
+||	SHRMB.S	$Z3,$Z2,$Z2
+||	AND.S	$H3y,$FF000000,$H3z
+	XOR.L	$H2z,$Z2,$Z2		; 9/3
+||	SHRU	$Z3,8,$Z3
+	XOR.D	$H3z,$Z3,$Z3		; 10/4
+	NOP				; 11/5
+
+	SPKERNEL 0,2
+||	XOR.D	$res,$Z3,$Z3		; 12/6/0; Z^=res
+
+	; input pre-fetch is possible where D1 slot is available...
+   [B0]	LDNDW	*${inp}[1],$H1x:$H0x	; 8/-
+   [B0]	LDNDW	*${inp}++[2],$H3x:$H2x	; 9/-
+	NOP				; 10/-
+	.if	.LITTLE_ENDIAN
+	SWAP2	$Z0,$Z1			; 11/-
+||	SWAP4	$Z1,$Z0
+	SWAP4	$Z1,$Z1			; 12/-
+||	SWAP2	$Z0,$Z0
+	SWAP2	$Z2,$Z3
+||	SWAP4	$Z3,$Z2
+||[!B0]	BNOP	RA
+	SWAP4	$Z3,$Z3
+||	SWAP2	$Z2,$Z2
+|| [B0]	BNOP	ghash_loop?
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	SHRU	$Z1,24,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0]	SHRU	$Z1,16,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.else
+  [!B0]	BNOP	RA			; 11/-
+   [B0]	BNOP	ghash_loop?		; 12/-
+   [B0]	XOR	$H0x,$Z0,$Z0		; Xi^=inp
+|| [B0]	XOR	$H1x,$Z1,$Z1
+   [B0]	XOR	$H2x,$Z2,$Z2
+|| [B0]	XOR	$H3x,$Z3,$Z3
+|| [B0]	MV	$Z0,$xia		; Xi[15], avoid cross-path stall
+	STDW	$Z1:$Z0,*${xip}[1]
+|| [B0] SHRU	$Z0,8,$x0		; Xi[14]
+|| [B0]	ZERO	$Z1:$Z0
+	.endif
+	STDW	$Z3:$Z2,*${xip}[0]
+|| [B0]	ZERO	$Z3:$Z2
+|| [B0]	MV	$xia,$x1
+   [B0]	ADDK	14,${xip}
+	.endasmfunc
+
+	.sect	.const
+	.cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
--- a/crypto/modes/asm/ghashp8-ppc.pl
+++ b/crypto/modes/asm/ghashp8-ppc.pl
@ -0,0 +1,663 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for PowerISA v2.07.
+#
+# July 2014
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This initial
+# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
+# faster than "4-bit" integer-only compiler-generated 64-bit code.
+# "Initial version" means that there is room for further improvement.
+
+# May 2016
+#
+# 2x aggregated reduction improves performance by 50% (resulting
+# performance on POWER8 is 1 cycle per processed byte), and 4x
+# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).
+
+$flavour=shift;
+$output =shift;
+
+# Pick the word size and the size-dependent mnemonics from the flavour
+# string.  A "64" match takes precedence over "32"; anything else is
+# rejected.
+my ($bits) = grep { $flavour =~ /$_/ } (64, 32);
+die "nonsense $flavour" unless defined($bits);
+
+($SIZE_T,$LRSAVE,$STU,$POP,$PUSH,$UCMP,$SHRI) =
+	($bits == 64)	? (8, 2*8, "stdu", "ld",  "std", "cmpld", "srdi")
+			: (4, 4,   "stwu", "lwz", "stw", "cmplw", "srwi");
+
+$sp="r1";
+$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload
+
+# Locate ppc-xlate.pl either next to this script or in ../../perlasm
+# relative to it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+# Everything printed below is piped through the translator.  The
+# low-precedence "or" matters: with "||" the die() attaches to the
+# command string (always true), so a failed open would go unnoticed.
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
+
+my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
+
+my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
+my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
+my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
+my $vrsave="r12";
+
+$code=<<___;
+.machine	"any"
+
+.text
+
+.globl	.gcm_init_p8
+.align	5
+.gcm_init_p8:
+	li		r0,-4096
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$H,0,r4			# load H
+
+	vspltisb	$xC2,-16		# 0xf0
+	vspltisb	$t0,1			# one
+	vaddubm		$xC2,$xC2,$xC2		# 0xe0
+	vxor		$zero,$zero,$zero
+	vor		$xC2,$xC2,$t0		# 0xe1
+	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
+	vsldoi		$t1,$zero,$t0,1		# ...1
+	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
+	vspltisb	$t2,7
+	vor		$xC2,$xC2,$t1		# 0xc2....01
+	vspltb		$t1,$H,0		# most significant byte
+	vsl		$H,$H,$t0		# H<<=1
+	vsrab		$t1,$t1,$t2		# broadcast carry bit
+	vand		$t1,$t1,$xC2
+	vxor		$IN,$H,$t1		# twisted H
+
+	vsldoi		$H,$IN,$IN,8		# twist even more ...
+	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
+	vsldoi		$Hl,$zero,$H,8		# ... and split
+	vsldoi		$Hh,$H,$zero,8
+
+	stvx_u		$xC2,0,r3		# save pre-computed table
+	stvx_u		$Hl,r8,r3
+	li		r8,0x40
+	stvx_u		$H, r9,r3
+	li		r9,0x50
+	stvx_u		$Hh,r10,r3
+	li		r10,0x60
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$IN1,$Xl,$t1
+
+	vsldoi		$H2,$IN1,$IN1,8
+	vsldoi		$H2l,$zero,$H2,8
+	vsldoi		$H2h,$H2,$zero,8
+
+	stvx_u		$H2l,r8,r3		# save H^2
+	li		r8,0x70
+	stvx_u		$H2,r9,r3
+	li		r9,0x80
+	stvx_u		$H2h,r10,r3
+	li		r10,0x90
+___
+{
+my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
+$code.=<<___;
+	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
+	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
+	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
+	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
+	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
+	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vsldoi		$t4,$Xm1,$zero,8
+	 vsldoi		$t5,$zero,$Xm1,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+	 vxor		$Xl1,$Xl1,$t4
+	 vxor		$Xh1,$Xh1,$t5
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	 vsldoi		$Xl1,$Xl1,$Xl1,8
+	vxor		$Xl,$Xl,$t2
+	 vxor		$Xl1,$Xl1,$t6
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 vpmsumd	$Xl1,$Xl1,$xC2
+	vxor		$t1,$t1,$Xh
+	 vxor		$t5,$t5,$Xh1
+	vxor		$Xl,$Xl,$t1
+	 vxor		$Xl1,$Xl1,$t5
+
+	vsldoi		$H,$Xl,$Xl,8
+	 vsldoi		$H2,$Xl1,$Xl1,8
+	vsldoi		$Hl,$zero,$H,8
+	vsldoi		$Hh,$H,$zero,8
+	 vsldoi		$H2l,$zero,$H2,8
+	 vsldoi		$H2h,$H2,$zero,8
+
+	stvx_u		$Hl,r8,r3		# save H^3
+	li		r8,0xa0
+	stvx_u		$H,r9,r3
+	li		r9,0xb0
+	stvx_u		$Hh,r10,r3
+	li		r10,0xc0
+	 stvx_u		$H2l,r8,r3		# save H^4
+	 stvx_u		$H2,r9,r3
+	 stvx_u		$H2h,r10,r3
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_init_p8,.-.gcm_init_p8
+___
+}
+$code.=<<___;
+.globl	.gcm_gmult_p8
+.align	5
+.gcm_gmult_p8:
+	lis		r0,0xfff8
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$IN,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$zero,$zero,$zero
+
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,2,0
+	.long		0
+.size	.gcm_gmult_p8,.-.gcm_gmult_p8
+
+.globl	.gcm_ghash_p8
+.align	5
+.gcm_ghash_p8:
+	li		r0,-4096
+	li		r8,0x10
+	mfspr		$vrsave,256
+	li		r9,0x20
+	mtspr		256,r0
+	li		r10,0x30
+	lvx_u		$Xl,0,$Xip		# load Xi
+
+	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
+	li		r8,0x40
+	 le?lvsl	$lemask,r0,r0
+	lvx_u		$H, r9,$Htbl
+	li		r9,0x50
+	 le?vspltisb	$t0,0x07
+	lvx_u		$Hh,r10,$Htbl
+	li		r10,0x60
+	 le?vxor	$lemask,$lemask,$t0
+	lvx_u		$xC2,0,$Htbl
+	 le?vperm	$Xl,$Xl,$Xl,$lemask
+	vxor		$zero,$zero,$zero
+
+	${UCMP}i	$len,64
+	bge		Lgcm_ghash_p8_4x
+
+	lvx_u		$IN,0,$inp
+	addi		$inp,$inp,16
+	subic.		$len,$len,16
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$IN,$IN,$Xl
+	beq		Lshort
+
+	lvx_u		$H2l,r8,$Htbl		# load H^2
+	li		r8,16
+	lvx_u		$H2, r9,$Htbl
+	add		r9,$inp,$len		# end of input
+	lvx_u		$H2h,r10,$Htbl
+	be?b		Loop_2x
+
+.align	5
+Loop_2x:
+	lvx_u		$IN1,0,$inp
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+
+	 subic		$len,$len,32
+	vpmsumd		$Xl,$IN,$H2l		# H^2.lo·Xi.lo
+	 vpmsumd	$Xl1,$IN1,$Hl		# H.lo·Xi+1.lo
+	 subfe		r0,r0,r0		# borrow?-1:0
+	vpmsumd		$Xm,$IN,$H2		# H^2.hi·Xi.lo+H^2.lo·Xi.hi
+	 vpmsumd	$Xm1,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+1.hi
+	 and		r0,r0,$len
+	vpmsumd		$Xh,$IN,$H2h		# H^2.hi·Xi.hi
+	 vpmsumd	$Xh1,$IN1,$Hh		# H.hi·Xi+1.hi
+	 add		$inp,$inp,r0
+
+	vxor		$Xl,$Xl,$Xl1
+	vxor		$Xm,$Xm,$Xm1
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vxor		$Xh,$Xh,$Xh1
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+	 lvx_u		$IN,r8,$inp
+	 addi		$inp,$inp,32
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	 le?vperm	$IN,$IN,$IN,$lemask
+	vxor		$t1,$t1,$Xh
+	vxor		$IN,$IN,$t1
+	vxor		$IN,$IN,$Xl
+	$UCMP		r9,$inp
+	bgt		Loop_2x			# done yet?
+
+	cmplwi		$len,0
+	bne		Leven
+
+Lshort:
+	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
+	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
+	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+
+Leven:
+	vxor		$Xl,$Xl,$t1
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	mtspr		256,$vrsave
+	blr
+	.long		0
+	.byte		0,12,0x14,0,0,0,4,0
+	.long		0
+___
+{
+my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
+    $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
+my $IN0=$IN;
+my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);
+
+$code.=<<___;
+.align	5
+.gcm_ghash_p8_4x:
+Lgcm_ghash_p8_4x:
+	$STU		$sp,-$FRAME($sp)
+	li		r10,`15+6*$SIZE_T`
+	li		r11,`31+6*$SIZE_T`
+	stvx		v20,r10,$sp
+	addi		r10,r10,32
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	li		r10,0x60
+	stvx		v31,r11,$sp
+	li		r0,-1
+	stw		$vrsave,`$FRAME-4`($sp)	# save vrsave
+	mtspr		256,r0			# preserve all AltiVec registers
+
+	lvsl		$t0,0,r8		# 0x0001..0e0f
+	#lvx_u		$H2l,r8,$Htbl		# load H^2
+	li		r8,0x70
+	lvx_u		$H2, r9,$Htbl
+	li		r9,0x80
+	vspltisb	$t1,8			# 0x0808..0808
+	#lvx_u		$H2h,r10,$Htbl
+	li		r10,0x90
+	lvx_u		$H3l,r8,$Htbl		# load H^3
+	li		r8,0xa0
+	lvx_u		$H3, r9,$Htbl
+	li		r9,0xb0
+	lvx_u		$H3h,r10,$Htbl
+	li		r10,0xc0
+	lvx_u		$H4l,r8,$Htbl		# load H^4
+	li		r8,0x10
+	lvx_u		$H4, r9,$Htbl
+	li		r9,0x20
+	lvx_u		$H4h,r10,$Htbl
+	li		r10,0x30
+
+	vsldoi		$t2,$zero,$t1,8		# 0x0000..0808
+	vaddubm		$hiperm,$t0,$t2		# 0x0001..1617
+	vaddubm		$loperm,$t1,$hiperm	# 0x0809..1e1f
+
+	$SHRI		$len,$len,4		# this allows to use sign bit
+						# as carry
+	lvx_u		$IN0,0,$inp		# load input
+	lvx_u		$IN1,r8,$inp
+	subic.		$len,$len,8
+	lvx_u		$IN2,r9,$inp
+	lvx_u		$IN3,r10,$inp
+	addi		$inp,$inp,0x40
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+	le?vperm	$IN3,$IN3,$IN3,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+
+	 vpmsumd	$Xl1,$IN1,$H3l
+	 vpmsumd	$Xm1,$IN1,$H3
+	 vpmsumd	$Xh1,$IN1,$H3h
+
+	 vperm		$H21l,$H2,$H,$hiperm
+	 vperm		$t0,$IN2,$IN3,$loperm
+	 vperm		$H21h,$H2,$H,$loperm
+	 vperm		$t1,$IN2,$IN3,$hiperm
+	 vpmsumd	$Xm2,$IN2,$H2		# H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+	 vpmsumd	$Xl3,$t0,$H21l		# H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	 vpmsumd	$Xh3,$t1,$H21h		# H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+	 vxor		$Xm2,$Xm2,$Xm1
+	 vxor		$Xl3,$Xl3,$Xl1
+	 vxor		$Xm3,$Xm3,$Xm2
+	 vxor		$Xh3,$Xh3,$Xh1
+
+	blt		Ltail_4x
+
+Loop_4x:
+	lvx_u		$IN0,0,$inp
+	lvx_u		$IN1,r8,$inp
+	subic.		$len,$len,4
+	lvx_u		$IN2,r9,$inp
+	lvx_u		$IN3,r10,$inp
+	addi		$inp,$inp,0x40
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+	le?vperm	$IN3,$IN3,$IN3,$lemask
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+
+	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+	 vpmsumd	$Xl1,$IN1,$H3l
+	 vpmsumd	$Xm1,$IN1,$H3
+	 vpmsumd	$Xh1,$IN1,$H3h
+
+	vxor		$Xl,$Xl,$Xl3
+	vxor		$Xm,$Xm,$Xm3
+	vxor		$Xh,$Xh,$Xh3
+	 vperm		$t0,$IN2,$IN3,$loperm
+	 vperm		$t1,$IN2,$IN3,$hiperm
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+	 vpmsumd	$Xl3,$t0,$H21l		# H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
+	 vpmsumd	$Xh3,$t1,$H21h		# H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	 vpmsumd	$Xm2,$IN2,$H2		# H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+	 vpmsumd	$Xm3,$IN3,$H		# H.hi·Xi+3.lo  +H.lo·Xi+3.hi
+	vpmsumd		$Xl,$Xl,$xC2
+
+	 vxor		$Xl3,$Xl3,$Xl1
+	 vxor		$Xh3,$Xh3,$Xh1
+	vxor		$Xh,$Xh,$IN0
+	 vxor		$Xm2,$Xm2,$Xm1
+	vxor		$Xh,$Xh,$t1
+	 vxor		$Xm3,$Xm3,$Xm2
+	vxor		$Xh,$Xh,$Xl
+	bge		Loop_4x
+
+Ltail_4x:
+	vpmsumd		$Xl,$Xh,$H4l		# H^4.lo·Xi.lo
+	vpmsumd		$Xm,$Xh,$H4		# H^4.hi·Xi.lo+H^4.lo·Xi.hi
+	vpmsumd		$Xh,$Xh,$H4h		# H^4.hi·Xi.hi
+
+	vxor		$Xl,$Xl,$Xl3
+	vxor		$Xm,$Xm,$Xm3
+
+	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
+
+	vsldoi		$t0,$Xm,$zero,8
+	vsldoi		$t1,$zero,$Xm,8
+	 vxor		$Xh,$Xh,$Xh3
+	vxor		$Xl,$Xl,$t0
+	vxor		$Xh,$Xh,$t1
+
+	vsldoi		$Xl,$Xl,$Xl,8
+	vxor		$Xl,$Xl,$t2
+
+	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
+	vpmsumd		$Xl,$Xl,$xC2
+	vxor		$t1,$t1,$Xh
+	vxor		$Xl,$Xl,$t1
+
+	addic.		$len,$len,4
+	beq		Ldone_4x
+
+	lvx_u		$IN0,0,$inp
+	${UCMP}i	$len,2
+	li		$len,-4
+	blt		Lone
+	lvx_u		$IN1,r8,$inp
+	beq		Ltwo
+
+Lthree:
+	lvx_u		$IN2,r9,$inp
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+	le?vperm	$IN2,$IN2,$IN2,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+	vmr		$H4l,$H3l
+	vmr		$H4, $H3
+	vmr		$H4h,$H3h
+
+	vperm		$t0,$IN1,$IN2,$loperm
+	vperm		$t1,$IN1,$IN2,$hiperm
+	vpmsumd		$Xm2,$IN1,$H2		# H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+	vpmsumd		$Xm3,$IN2,$H		# H.hi·Xi+2.lo  +H.lo·Xi+2.hi
+	vpmsumd		$Xl3,$t0,$H21l		# H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+	vpmsumd		$Xh3,$t1,$H21h		# H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+	vxor		$Xm3,$Xm3,$Xm2
+	b		Ltail_4x
+
+.align	4
+Ltwo:
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+	le?vperm	$IN1,$IN1,$IN1,$lemask
+
+	vxor		$Xh,$IN0,$Xl
+	vperm		$t0,$zero,$IN1,$loperm
+	vperm		$t1,$zero,$IN1,$hiperm
+
+	vsldoi		$H4l,$zero,$H2,8
+	vmr		$H4, $H2
+	vsldoi		$H4h,$H2,$zero,8
+
+	vpmsumd		$Xl3,$t0, $H21l		# H.lo·Xi+1.lo
+	vpmsumd		$Xm3,$IN1,$H		# H.hi·Xi+1.lo+H.lo·Xi+2.hi
+	vpmsumd		$Xh3,$t1, $H21h		# H.hi·Xi+1.hi
+
+	b		Ltail_4x
+
+.align	4
+Lone:
+	le?vperm	$IN0,$IN0,$IN0,$lemask
+
+	vsldoi		$H4l,$zero,$H,8
+	vmr		$H4, $H
+	vsldoi		$H4h,$H,$zero,8
+
+	vxor		$Xh,$IN0,$Xl
+	vxor		$Xl3,$Xl3,$Xl3
+	vxor		$Xm3,$Xm3,$Xm3
+	vxor		$Xh3,$Xh3,$Xh3
+
+	b		Ltail_4x
+
+Ldone_4x:
+	le?vperm	$Xl,$Xl,$Xl,$lemask
+	stvx_u		$Xl,0,$Xip		# write out Xi
+
+	li		r10,`15+6*$SIZE_T`
+	li		r11,`31+6*$SIZE_T`
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	addi		$sp,$sp,$FRAME
+	blr
+	.long		0
+	.byte		0,12,0x04,0,0x80,0,4,0
+	.long		0
+___
+}
+$code.=<<___;
+.size	.gcm_ghash_p8,.-.gcm_ghash_p8
+
+.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+# Post-process the generated text line by line and hand it to the
+# translator through the STDOUT pipe.
+foreach (split("\n",$code)) {
+	# evaluate `...` expressions embedded in the assembly text
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	# "le?"/"be?" prefixes mark endian-specific lines: the prefix is
+	# stripped for the matching endianness and replaced by #le#/#be#
+	# for the other one.
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/le\?//o		or
+	    s/be\?/#be#/o;
+	} else {
+	    s/le\?/#le#/o	or
+	    s/be\?//o;
+	}
+	print $_,"\n";
+}
+
+# Closing the pipe flushes it and waits for the translator; a failure
+# here means the generated file is incomplete, so do not ignore it.
+close STDOUT or die "error closing STDOUT: $!";
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@ -0,0 +1,376 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
+#
+# June 2014
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
+# other assembly modules. Just like aesv8-armx.pl this module
+# supports both AArch32 and AArch64 execution modes.
+#
+# July 2014
+#
+# Implement 2x aggregated reduction [see ghash-x86.pl for background
+# information].
+#
+# Current performance in cycles per processed byte:
+#
+#		PMULL[2]	32-bit NEON(*)
+# Apple A7	0.92		5.62
+# Cortex-A53	1.01		8.39
+# Cortex-A57	1.17		7.61
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour = shift;
+$output  = shift;
+
+# Locate arm-xlate.pl either next to this script or in ../../perlasm
+# relative to it.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Route all output through the translator.  Fail loudly if the pipe
+# cannot be set up instead of printing into an unopened handle.
+open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$Xi="x0";	# argument block
+$Htbl="x1";
+$inp="x2";
+$len="x3";
+
+$inc="x12";
+
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+___
+$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
+$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
+
+$code.=<<___;
+.global	gcm_init_v8
+.type	gcm_init_v8,%function
+.align	4
+gcm_init_v8:
+	vld1.64		{$t1},[x1]		@ load H
+	vmov.i8		$xC2,#0xe1
+	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
+	vext.8		$IN,$t1,$t1,#8
+	vshr.u64	$t2,$xC2,#63
+	vdup.32		$t1,${t1}[1]
+	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
+	vshr.u64	$t2,$IN,#63
+	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
+	vand		$t2,$t2,$t0
+	vshl.i64	$IN,$IN,#1
+	vext.8		$t2,$t2,$t2,#8
+	vand		$t0,$t0,$t1
+	vorr		$IN,$IN,$t2		@ H<<<=1
+	veor		$H,$IN,$t0		@ twisted H
+	vst1.64		{$H},[x0],#16
+
+	@ calculate H^2
+	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
+	vpmull.p64	$Xl,$H,$H
+	veor		$t0,$t0,$H
+	vpmull2.p64	$Xh,$H,$H
+	vpmull.p64	$Xm,$t0,$t0
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$H2,$Xl,$t2
+
+	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
+	veor		$t1,$t1,$H2
+	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
+	vst1.64		{$Hhl-$H2},[x0]
+
+	ret
+.size	gcm_init_v8,.-gcm_init_v8
+
+.global	gcm_gmult_v8
+.type	gcm_gmult_v8,%function
+.align	4
+gcm_gmult_v8:
+	vld1.64		{$t1},[$Xi]		@ load Xi
+	vmov.i8		$xC2,#0xe1
+	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
+	vshl.u64	$xC2,$xC2,#57
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$IN,$t1,$t1,#8
+
+	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+	ret
+.size	gcm_gmult_v8,.-gcm_gmult_v8
+
+.global	gcm_ghash_v8
+.type	gcm_ghash_v8,%function
+.align	4
+gcm_ghash_v8:
+___
+$code.=<<___		if ($flavour !~ /64/);
+	vstmdb		sp!,{d8-d15}
+___
+$code.=<<___;
+	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
+	subs		$len,$len,#32
+	vmov.i8		$xC2,#0xe1
+	mov		$inc,#16
+	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
+	vld1.64		{$H2},[$Htbl]
+	cclr		$inc,eq
+	vext.8		$Xl,$Xl,$Xl,#8
+	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
+	vshl.u64	$xC2,$xC2,#57		@ 0xc2.0
+#ifndef __ARMEB__
+	vrev64.8	$t0,$t0
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$IN,$t0,$t0,#8
+	b.lo		.Lodd_tail_v8
+___
+{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+$code.=<<___;
+	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
+#ifndef __ARMEB__
+	vrev64.8	$t1,$t1
+#endif
+	vext.8		$In,$t1,$t1,#8
+	veor		$IN,$IN,$Xl		@ I[i]^=Xi
+	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
+	veor		$t1,$t1,$In		@ Karatsuba pre-processing
+	vpmull2.p64	$Xhn,$H,$In
+	b		.Loop_mod2x_v8
+
+.align	4
+.Loop_mod2x_v8:
+	vext.8		$t2,$IN,$IN,#8
+	subs		$len,$len,#32
+	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
+	cclr		$inc,lo
+
+	 vpmull.p64	$Xmn,$Hhl,$t1
+	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
+	veor		$Xl,$Xl,$Xln		@ accumulate
+	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	 vld1.64	{$t0},[$inp],$inc	@ load [rotated] I[i]
+
+	veor		$Xh,$Xh,$Xhn
+	 cclr		$inc,eq
+	veor		$Xm,$Xm,$Xmn
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	 vld1.64	{$t1},[$inp],$inc	@ load [rotated] I[i+1]
+#ifndef __ARMEB__
+	 vrev64.8	$t0,$t0
+#endif
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+#ifndef __ARMEB__
+	 vrev64.8	$t1,$t1
+#endif
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	 vext.8		$In,$t1,$t1,#8
+	 vext.8		$IN,$t0,$t0,#8
+	veor		$Xl,$Xm,$t2
+	 vpmull.p64	$Xln,$H,$In		@ H·Ii+1
+	veor		$IN,$IN,$Xh		@ accumulate $IN early
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$IN,$IN,$t2
+	 veor		$t1,$t1,$In		@ Karatsuba pre-processing
+	veor		$IN,$IN,$Xl
+	 vpmull2.p64	$Xhn,$H,$In
+	b.hs		.Loop_mod2x_v8
+
+	veor		$Xh,$Xh,$t2
+	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
+	adds		$len,$len,#32
+	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
+	b.eq		.Ldone_v8
+___
+}
+$code.=<<___;
+.Lodd_tail_v8:
+	vext.8		$t2,$Xl,$Xl,#8
+	veor		$IN,$IN,$Xl		@ inp^=Xi
+	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi
+
+	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
+	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
+	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
+	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase
+
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+
+.Ldone_v8:
+#ifndef __ARMEB__
+	vrev64.8	$Xl,$Xl
+#endif
+	vext.8		$Xl,$Xl,$Xl,#8
+	vst1.64		{$Xl},[$Xi]		@ write out Xi
+
+___
+$code.=<<___		if ($flavour !~ /64/);
+	vldmia		sp!,{d8-d15}
+___
+$code.=<<___;
+	ret
+.size	gcm_ghash_v8,.-gcm_ghash_v8
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+if ($flavour =~ /64/) {			######## 64-bit code
+    # Rewrite the operands of a 64-bit half move "qN#lo|hi,qM#lo|hi" as an
+    # "ins" between .d lanes (lo is lane 0, hi is lane 1).  Returns an empty
+    # string when the operands do not have that form.
+    sub unvmov {
+	my ($operands) = @_;
+	my %lane = (lo => 0, hi => 1);
+
+	return "" unless $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o;
+	return sprintf("ins	v%d.d[%d],v%d.d[%d]",$1,$lane{$2},$3,$lane{$4});
+    }
+    foreach(split("\n",$code)) {
+	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
+	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
+	s/vmov\s+(.*)/unvmov($1)/geo	or
+	s/vext\.8/ext/o			or
+	s/vshr\.s/sshr\.s/o		or
+	s/vshr/ushr/o			or
+	s/^(\s+)v/$1/o			or	# strip off v prefix
+	s/\bbx\s+lr\b/ret/o;
+
+	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
+	s/@\s/\/\//o;				# old->new style commentary
+
+	# fix up remaining legacy suffixes
+	s/\.[ui]?8(\s)/$1/o;
+	s/\.[uis]?32//o and s/\.16b/\.4s/go;
+	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
+	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
+	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
+	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+	print $_,"\n";
+    }
+} else {				######## 32-bit code
+    # Rewrite "qD,qS[i]" operands of vdup.32 so that the source is named
+    # as a d register: lane i of qS is lane i&1 of d(2*S + i>>1).
+    # Returns an empty string when the operands do not have that form.
+    sub unvdup32 {
+	my ($operands) = @_;
+
+	return "" unless $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
+
+	my ($dst,$src,$idx) = ($1,$2,$3);
+	return sprintf("vdup.32	q%d,d%d[%d]",$dst,2*$src+($idx>>1),$idx&1);
+    }
+    # Hand-encode pmull/pmull2.p64 "qD,qN,qM" as four .byte values, with
+    # the original mnemonic and operands kept as a trailing comment.
+    # Returns an empty string when the operands are not three q registers.
+    sub unvpmullp64 {
+	my ($mnemonic,$arg)=@_;
+
+	return "" unless $arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o;
+	my ($qd,$qn,$qm) = ($1,$2,$3);
+
+	# base opcode with the three register numbers folded in
+	my $word = 0xf2a00e00;
+	$word |= (($qd&7)<<13)|(($qd&8)<<19);
+	$word |= (($qn&7)<<17)|(($qn&8)<<4);
+	$word |= (($qm&7)<<1) |(($qm&8)<<2);
+	# the "2" variant sets two extra bits
+	$word |= 0x00010001	 if ($mnemonic =~ /2/);
+
+	# since ARMv7 instructions are always encoded little-endian.
+	# correct solution is to use .inst directive, but older
+	# assemblers don't implement it:-(
+	my @bytes = map { ($word>>$_)&0xff } (0,8,16,24);
+	return sprintf(".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+			@bytes,$mnemonic,$arg);
+    }
+
+    foreach(split("\n",$code)) {
+	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
+	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
+	s/\/\/\s?/@ /o;				# new->old style commentary
+
+	# fix up remaining new-style suffixes
+	s/\],#[0-9]+/]!/o;
+
+	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o			or
+	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
+	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
+	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
+	s/^(\s+)b\./$1b/o						or
+	s/^(\s+)ret/$1bx\tlr/o;
+
+	print $_,"\n";
+    }
+}
+
+# enforce flush; STDOUT is the pipe to the translator, so a failed close
+# means the generated output cannot be trusted
+close STDOUT or die "error closing STDOUT: $!";
--- a/crypto/modes/gcm128.c
+++ b/crypto/modes/gcm128.c
@ -645,7 +645,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])

 #endif

-#if	TABLE_BITS==4 && defined(GHASH_ASM)
+#if	TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
 # if	!defined(I386_ONLY) && \
 	(defined(__i386)	|| defined(__i386__)	|| \
 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
@ -666,14 +666,33 @@ void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #  endif
-# elif defined(__arm__) || defined(__arm)
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
 #  include "arm_arch.h"
 #  if __ARM_ARCH__>=7
 #   define GHASH_ASM_ARM
 #   define GCM_FUNCREF_4BIT
+#   if defined(__aarch64__)
+#    define PMULL_CAPABLE	(OPENSSL_armcap_P & ARMV8_PMULL)
+#   endif
+#   if defined(__arm__) || defined(__arm)
+#    define NEON_CAPABLE	(OPENSSL_armcap_P & ARMV7_NEON)
+#   endif
 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #  endif
+# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+#  define GHASH_ASM_PPC
+#  define GCM_FUNCREF_4BIT
+extern int OPENSSL_ppccap_P;
+void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
+                  size_t len);
+# elif defined(_TMS320C6400_PLUS)
+#   define GHASH_ASM_C64Xplus
 # endif
 #endif

@ -738,14 +757,38 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 	ctx->ghash = gcm_ghash_4bit;
 #  endif
 # elif	defined(GHASH_ASM_ARM)
-	if (OPENSSL_armcap_P & ARMV7_NEON) {
+#  ifdef PMULL_CAPABLE
+	if (PMULL_CAPABLE) {
+		gcm_init_v8(ctx->Htable,ctx->H.u);
+		ctx->gmult = gcm_gmult_v8;
+		ctx->ghash = gcm_ghash_v8;
+	} else
+#  endif
+#  ifdef NEON_CAPABLE
+	if (NEON_CAPABLE) {
 		ctx->gmult = gcm_gmult_neon;
 		ctx->ghash = gcm_ghash_neon;
-	} else {
+	} else
+#  endif
+	{
 		gcm_init_4bit(ctx->Htable,ctx->H.u);
 		ctx->gmult = gcm_gmult_4bit;
 		ctx->ghash = gcm_ghash_4bit;
 	}
+# elif defined(GHASH_ASM_PPC)
+	if (OPENSSL_ppccap_P & (1<<2)) {
+		gcm_init_p8(ctx->Htable, ctx->H.u);
+		ctx->gmult = gcm_gmult_p8;
+		ctx->ghash = gcm_ghash_p8;
+	} else {
+		gcm_init_4bit(ctx->Htable, ctx->H.u);
+		ctx->gmult = gcm_gmult_4bit;
+		ctx->ghash = gcm_ghash_4bit;
+	}
+# elif defined(GHASH_ASM_C64Xplus)
+	/* C64x+ assembler doesn't use tables, skip gcm_init_4bit.
+	 * This is likely to trigger "function never referenced"
+	 * warning and code being eliminated. */
 # else
 	gcm_init_4bit(ctx->Htable,ctx->H.u);
 # endif
--- a/crypto/modes/modes_lcl.h
+++ b/crypto/modes/modes_lcl.h
@ -26,13 +26,16 @@ typedef unsigned int u32;
 typedef unsigned char u8;

 #define STRICT_ALIGNMENT 1
-#if defined(__i386)	|| defined(__i386__)	|| \
-    defined(__x86_64)	|| defined(__x86_64__)	|| \
-    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
-    defined(__s390__)	|| defined(__s390x__)	|| \
-    ( (defined(__arm__)	|| defined(__arm)) && \
-      (defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__) || \
-       defined(__ARM_ARCH_7R__)	|| defined(__ARM_ARCH_7M__)) )
+#if defined(__i386)     || defined(__i386__)	|| \
+    defined(__x86_64)   || defined(__x86_64__)	|| \
+    defined(_M_IX86)    || defined(_M_AMD64)	|| defined(_M_X64) || \
+    defined(__s390__)   || defined(__s390x__)	|| \
+    ( \
+    ( (defined(__arm__) || defined(__arm)) && \
+      (defined(__ARM_ARCH_7__)  || defined(__ARM_ARCH_7A__) || \
+       defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) && \
+    !( defined(__arm__) && defined(__APPLE__) ) \
+    )
 # undef STRICT_ALIGNMENT
 #endif

--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@ -0,0 +1,165 @@
+#!/usr/bin/env perl
+
+# ARM assembler distiller by <appro>.
+
+my $flavour = shift;
+my $output = shift;
+# Redirect STDOUT to the named output file.  With no output argument the
+# inherited STDOUT is used as is.  "or" (not "||") is needed so that die()
+# is attached to open() itself rather than to the file name string.
+if (defined($output) && $output ne "") {
+    open STDOUT,">$output" or die "can't open $output: $!";
+}
+
+$flavour = "linux32" if (!$flavour or $flavour eq "void");
+
+my %GLOBALS;
+my $dotinlocallabels=($flavour=~/linux/)?1:0;
+
+################################################################
+# directives which need special treatment on different platforms
+################################################################
+my $arch = sub {
+    # .arch is passed through for linux flavours and dropped otherwise
+    return ($flavour =~ /linux/) ? ".arch\t".join(',',@_) : "";
+};
+my $fpu = sub {
+    # .fpu is passed through for linux flavours and dropped otherwise
+    return ($flavour =~ /linux/) ? ".fpu\t".join(',',@_) : "";
+};
+my $hidden = sub {
+    # ios flavours get .private_extern, everything else .hidden
+    my $directive = ($flavour =~ /ios/) ? ".private_extern" : ".hidden";
+    return $directive."\t".join(',',@_);
+};
+# .comm handler.  The operand string is split on commas; the first field
+# is the symbol name and the second is interpolated after it in the ios32
+# form.  The name that later references should use is recorded in %GLOBALS
+# under the original name.
+my $comm = sub {
+    my @args = split(/,\s*/,shift);
+    my $name = $args[0];	# scalar element, not a one-element slice
+    my $global = \$GLOBALS{$name};
+    my $ret;
+
+    if ($flavour =~ /ios32/)	{
+	# ios32: declare the underscore-prefixed symbol and emit an
+	# indirect-symbol entry labelled with the plain name
+	$ret = ".comm\t_$name,$args[1]\n";
+	$ret .= ".non_lazy_symbol_pointer\n";
+	$ret .= "$name:\n";
+	$ret .= ".indirect_symbol\t_$name\n";
+	$ret .= ".long\t0";
+	$name = "_$name";
+    } else			{ $ret = ".comm\t".join(',',@args); }
+
+    $$global = $name;
+    $ret;
+};
+# .globl handler: records the symbol in %GLOBALS (keyed by the name as
+# written) and returns the directive.  ios flavours get an underscore
+# prefix on the emitted name.
+my $globl = sub {
+    my ($symbol) = @_;
+    my $emitted = ($flavour =~ /ios/) ? "_$symbol" : $symbol;
+
+    $GLOBALS{$symbol} = $emitted;
+    return ".globl	$emitted";
+};
+my $global = $globl;
+my $extern = sub {
+    # register the symbol the same way .globl does, but emit nothing
+    $globl->(@_);
+    return;
+};
+my $type = sub {
+    # .type is passed through for linux flavours and dropped otherwise
+    return ($flavour =~ /linux/) ? ".type\t".join(',',@_) : "";
+};
+my $size = sub {
+    # .size is passed through for linux flavours and dropped otherwise
+    return ($flavour =~ /linux/) ? ".size\t".join(',',@_) : "";
+};
+my $inst = sub {
+    # linux flavours take .inst; other flavours get the same operands
+    # as .long
+    my $directive = ($flavour =~ /linux/) ? ".inst" : ".long";
+    return $directive."\t".join(',',@_);
+};
+my $asciz = sub {
+    my $operand = join(",",@_);
+
+    # only a double-quoted string operand is handled; anything else
+    # produces no output
+    return "" unless $operand =~ /^"(.*)"$/;
+
+    # spell the string out as byte values, append the terminating
+    # zero and follow it with an alignment directive
+    my @bytes = (unpack("C*",$1),0);
+    return ".byte	" . join(",",@bytes) . "\n.align	2";
+};
+
+# Spell out a register range: range("q","",8,10) gives "q8,q9,q10".
+# $sfx is appended to every register name.
+sub range {
+  my ($prefix,$sfx,$first,$last) = @_;
+  my @regs;
+
+    for my $n ($first..$last) {
+	push @regs, $prefix.$n.$sfx;
+    }
+
+    return join(",",@regs);
+}
+
+# Post-process the operand part of one line: expand register ranges
+# written inside {...} and replace words that are registered in %GLOBALS
+# with the name recorded there.  Returns the modified string.
+sub expand_line {
+  my $line = shift;
+  # NOTE(review): @ret is never used in this sub.
+  my @ret = ();
+
+    pos($line)=0;
+
+    # Scan left to right; each iteration skips ahead to the next '@', '/',
+    # '{' or '"' (or to the end of the string).
+    while ($line =~ m/\G[^@\/\{\"]*/g) {
+	# stop at '@', '//' or the end of the string
+	if ($line =~ m/\G(@|\/\/|$)/gc) {
+	    last;
+	}
+	elsif ($line =~ m/\G\{/gc) {
+	    my $saved_pos = pos($line);
+	    # "<reg>N<sfx>-<reg>M<sfx>" right after the brace (same register
+	    # letter and same suffix on both ends) becomes the list
+	    # produced by range()
+	    $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e;
+	    # continue from just after the opening brace and skip to the
+	    # closing one
+	    pos($line) = $saved_pos;
+	    $line =~ m/\G[^\}]*\}/g;
+	}
+	elsif ($line =~ m/\G\"/gc) {
+	    # skip over a double-quoted string
+	    $line =~ m/\G[^\"]*\"/g;
+	}
+    }
+
+    # substitute every word that has a true entry in %GLOBALS
+    $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
+
+    return $line;
+}
+
+# Main translation loop: read the input line by line, normalise labels,
+# dispatch directives to the handler closures defined above and print the
+# result.
+while($line=<>) {
+
+    # lines that start with a comment marker (#, @ or //) are copied as is
+    if ($line =~ m/^\s*(#|@|\/\/)/)	{ print $line; next; }
+
+    $line =~ s|/\*.*\*/||;	# get rid of C-style comments...
+    $line =~ s|^\s+||;		# ... and skip white spaces in beginning...
+    $line =~ s|\s+$||;		# ... and at the end
+
+    # NOTE(review): the bare blocks below presumably confine the capture
+    # variables of these substitutions so that a stale $1 does not leak
+    # into the label test - confirm before restructuring.
+    {
+	$line =~ s|[\b\.]L(\w{2,})|L$1|g;	# common denominator for Locallabel
+	# put the dot back for flavours that use .L local labels
+	$line =~ s|\bL(\w{2,})|\.L$1|g	if ($dotinlocallabels);
+    }
+
+    {
+	# peel off a leading "label:" and print it, mapped through %GLOBALS
+	$line =~ s|(^[\.\w]+)\:\s*||;
+	my $label = $1;
+	if ($label) {
+	    printf "%s:",($GLOBALS{$label} or $label);
+	}
+    }
+
+    if ($line !~ m/^[#@]/) {
+	# split off the mnemonic; $c is the leading dot of a directive, or
+	# a tab when there is none
+	$line =~ s|^\s*(\.?)(\S+)\s*||;
+	my $c = $1; $c = "\t" if ($c eq "");
+	my $mnemonic = $2;
+	my $opcode;
+	# look up a variable named after the mnemonic ('.' inside the
+	# mnemonic becomes '_'); the directive handlers above are found
+	# this way
+	if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) {
+	    $opcode = eval("\$$1_$2");
+	} else {
+	    $opcode = eval("\$$mnemonic");
+	}
+
+	my $arg=expand_line($line);
+
+	if (ref($opcode) eq 'CODE') {
+		# a handler exists: its return value replaces the line
+		$line = &$opcode($arg);
+	} elsif ($mnemonic)         {
+		# no handler: re-emit the mnemonic with the expanded operands
+		$line = $c.$mnemonic;
+		$line.= "\t$arg" if ($arg);
+	}
+    }
+
+    # a newline is printed even when the line itself ended up empty
+    print $line if ($line);
+    print "\n";
+}
+
+# buffered write errors only surface at close time, so check it
+close STDOUT or die "error closing STDOUT: $!";
--- a/crypto/perlasm/ppc-xlate.pl
+++ b/crypto/perlasm/ppc-xlate.pl
@ -27,7 +27,8 @@ my $globl = sub {
 	/osx/		&& do { $name = "_$name";
 				last;
 			      };
-	/linux.*32/	&& do {	$ret .= ".globl	$name\n";
+	/linux.*(32|64le)/
+			&& do {	$ret .= ".globl	$name\n";
 				$ret .= ".type	$name,\@function";
 				last;
 			      };
@ -37,7 +38,6 @@ my $globl = sub {
 				$ret .= ".align	3\n";
 				$ret .= "$name:\n";
 				$ret .= ".quad	.$name,.TOC.\@tocbase,0\n";
-				$ret .= ".size	$name,24\n";
 				$ret .= ".previous\n";

 				$name = ".$name";
@ -50,7 +50,9 @@ my $globl = sub {
    $ret;
 };
 my $text = sub {
-    ($flavour =~ /aix/) ? ".csect" : ".text";
+    my $ret = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text";
+    $ret = ".abiversion	2\n".$ret	if ($flavour =~ /linux.*64le/);
+    $ret;
 };
 my $machine = sub {
    my $junk = shift;
@ -62,9 +64,12 @@ my $machine = sub {
    ".machine	$arch";
 };
 my $size = sub {
-    if ($flavour =~ /linux.*32/)
+    if ($flavour =~ /linux/)
    {	shift;
-	".size	" . join(",",@_);
+	my $name = shift; $name =~ s|^[\.\_]||;
+	my $ret  = ".size	$name,.-".($flavour=~/64$/?".":"").$name;
+	$ret .= "\n.size	.$name,.-.$name" if ($flavour=~/64$/);
+	$ret;
    }
    else
    {	"";	}
@ -77,6 +82,25 @@ my $asciz = sub {
    else
    {	"";	}
 };
+# .quad handler: emit each operand as a 64-bit datum.
+#
+# The first argument supplied by the dispatcher is discarded.  A hex or
+# decimal literal is split into two 32-bit halves and emitted as a ".long"
+# pair, low half first when $flavour ends in "le", high half first
+# otherwise.  Any other operand (e.g. a symbol) is passed to ".quad"
+# unchanged.  Results for all operands are joined with newlines.
+my $quad = sub {
+    shift;
+    my @ret;
+    my ($hi,$lo);
+    for (@_) {
+	# hex literal: the last (up to) 8 digits are the low word, any
+	# remaining leading digits the high word
+	if (/^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io)
+	{  $hi=$1?"0x$1":"0"; $lo="0x$2";  }
+	elsif (/^([0-9]+)$/o)
+	{  $hi=$1>>32; $lo=$1&0xffffffff;  } # error-prone with 32-bit perl
+	else
+	{  $hi=undef; $lo=$_; }
+
+	if (defined($hi))
+	{  push(@ret,$flavour=~/le$/o?".long\t$lo,$hi":".long\t$hi,$lo");  }
+	else
+	{  push(@ret,".quad	$lo");  }
+    }
+    join("\n",@ret);
+};

 ################################################################
 # simplified mnemonics not handled by at least one assembler
@ -122,6 +146,66 @@ my $extrdi = sub {
    $b = ($b+$n)&63; $n = 64-$n;
    "	rldicl	$ra,$rs,$b,$n";
 };
+# vmr vx,vy is emitted as "vor vx,vy,vy".
+my $vmr = sub {
+    my ($f,$dst,$src) = @_;
+    return "	vor	$dst,$src,$src";
+};
+
+# Some ABIs specify vrsave, special-purpose register #256, as reserved
+# for system use.
+my $no_vrsave = ($flavour =~ /aix|linux64le/);
+# On such ABIs a write to SPR 256 is replaced with "or ra,ra,ra".
+my $mtspr = sub {
+    my ($f,$idx,$ra) = @_;
+    return ($idx == 256 && $no_vrsave)
+	? "	or	$ra,$ra,$ra"
+	: "	mtspr	$idx,$ra";
+};
+# On such ABIs a read of SPR 256 is replaced with "li rd,-1".
+my $mfspr = sub {
+    my ($f,$rd,$idx) = @_;
+    return ($idx == 256 && $no_vrsave)
+	? "	li	$rd,-1"
+	: "	mfspr	$rd,$idx";
+};
+
+# PowerISA 2.06 stuff
+# Encode an indexed load/store under primary opcode 31 as a raw .long:
+# register numbers go into the 21/16/11 bit positions and the extended
+# opcode is stored as $op*2+1.  The first argument is unused.
+sub vsxmem_op {
+    my ($f, $vrt, $ra, $rb, $op) = @_;
+    my $word = (31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1);
+    return "	.long	".sprintf("0x%X",$word);
+}
+# made-up unaligned memory reference AltiVec/VMX instructions
+my $lvx_u	= sub {	return vsxmem_op(@_, 844); };	# lxvd2x
+my $stvx_u	= sub {	return vsxmem_op(@_, 972); };	# stxvd2x
+my $lvdx_u	= sub {	return vsxmem_op(@_, 588); };	# lxsdx
+my $stvdx_u	= sub {	return vsxmem_op(@_, 716); };	# stxsdx
+my $lvx_4w	= sub {	return vsxmem_op(@_, 780); };	# lxvw4x
+my $stvx_4w	= sub {	return vsxmem_op(@_, 908); };	# stxvw4x
+
+# PowerISA 2.07 stuff
+# Encode a three-register instruction under primary opcode 4 as a raw
+# .long: register numbers go into the 21/16/11 bit positions and $op is
+# OR-ed into the low bits.  The first argument is unused.
+sub vcrypto_op {
+    my ($f, $vrt, $vra, $vrb, $op) = @_;
+    "	.long	".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
+}
+my $vcipher	= sub { vcrypto_op(@_, 1288); };
+my $vcipherlast	= sub { vcrypto_op(@_, 1289); };
+my $vncipher	= sub { vcrypto_op(@_, 1352); };
+my $vncipherlast= sub { vcrypto_op(@_, 1353); };
+# vsbox takes two registers; the third register field is fixed at 0
+my $vsbox	= sub { vcrypto_op(@_, 0, 1480); };
+# vshasigma*: the trailing (st,six) operands are packed as st<<4|six into
+# the third register field
+my $vshasigmad	= sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
+my $vshasigmaw	= sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
+my $vpmsumb	= sub { vcrypto_op(@_, 1032); };
+my $vpmsumd	= sub { vcrypto_op(@_, 1224); };
+my $vpmsumh	= sub { vcrypto_op(@_, 1096); };
+my $vpmsubh	= $vpmsumh;	# historical misspelling of vpmsumh, kept for compatibility
+my $vpmsumw	= sub { vcrypto_op(@_, 1160); };
+my $vaddudm	= sub { vcrypto_op(@_, 192);  };
+
+# mtsle: raw encoding under primary opcode 31 with the operand in the
+# bit-21 field.
+my $mtsle	= sub {
+    my ($f, $arg) = @_;
+    "	.long	".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
+};

 while($line=<>) {

@ -138,7 +222,10 @@ while($line=<>) {
    {
 	$line =~ s|(^[\.\w]+)\:\s*||;
 	my $label = $1;
-	printf "%s:",($GLOBALS{$label} or $label) if ($label);
+	if ($label) {
+	    printf "%s:",($GLOBALS{$label} or $label);
+	    printf "\n.localentry\t$GLOBALS{$label},0"	if ($GLOBALS{$label} && $flavour =~ /linux.*64le/);
+	}
    }

    {
@ -147,7 +234,7 @@ while($line=<>) {
 	my $mnemonic = $2;
 	my $f = $3;
 	my $opcode = eval("\$$mnemonic");
-	$line =~ s|\bc?[rf]([0-9]+)\b|$1|g if ($c ne "." and $flavour !~ /osx/);
+	$line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/);
 	if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
 	elsif ($mnemonic)           { $line = $c.$mnemonic.$f."\t".$line; }
    }
--- a/crypto/perlasm/x86gas.pl
+++ b/crypto/perlasm/x86gas.pl
@ -45,10 +45,8 @@ sub ::generic
    undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);

    if ($#_==0)				{ &::emit($opcode);		}
-    elsif ($opcode =~ m/^j/o && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode eq "call" && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode eq "clflush" && $#_==1){ &::emit($opcode,@arg);	}
-    elsif ($opcode =~ m/^set/&& $#_==1)	{ &::emit($opcode,@arg);	}
+    elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+					{ &::emit($opcode,@arg);	}
    else				{ &::emit($opcode.$suffix,@arg);}

  1;
--- a/crypto/ppccap.c
+++ b/crypto/ppccap.c
@ -3,13 +3,24 @@
 #include <string.h>
 #include <setjmp.h>
 #include <signal.h>
+#include <unistd.h>
+#if defined(__linux) || defined(_AIX)
+# include <sys/utsname.h>
+#endif
+#if defined(_AIX53)     /* defined even on post-5.3 */
+# include <sys/systemcfg.h>
+# if !defined(__power_set)
+#  define __power_set(a) (_system_configuration.implementation & (a))
+# endif
+#endif
 #include <crypto.h>
 #include <openssl/bn.h>

 #define PPC_FPU64	(1<<0)
 #define PPC_ALTIVEC	(1<<1)
+#define PPC_CRYPTO207	(1<<2)

-static int OPENSSL_ppccap_P = 0;
+int OPENSSL_ppccap_P = 0;

 static sigset_t all_masked;

@ -49,10 +60,28 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
 	}
 #endif

+void sha256_block_p8(void *ctx, const void *inp, size_t len);
+void sha256_block_ppc(void *ctx, const void *inp, size_t len);
+/*
+ * Run-time dispatcher: forwards to sha256_block_p8 when the PPC_CRYPTO207
+ * bit is set in OPENSSL_ppccap_P, and to sha256_block_ppc otherwise.
+ */
+void sha256_block_data_order(void *ctx, const void *inp, size_t len)
+{
+    if (OPENSSL_ppccap_P & PPC_CRYPTO207)
+        sha256_block_p8(ctx, inp, len);
+    else
+        sha256_block_ppc(ctx, inp, len);
+}
+
+void sha512_block_p8(void *ctx, const void *inp, size_t len);
+void sha512_block_ppc(void *ctx, const void *inp, size_t len);
+/*
+ * Run-time dispatcher: forwards to sha512_block_p8 when the PPC_CRYPTO207
+ * bit is set in OPENSSL_ppccap_P, and to sha512_block_ppc otherwise.
+ */
+void sha512_block_data_order(void *ctx, const void *inp, size_t len)
+{
+    if (OPENSSL_ppccap_P & PPC_CRYPTO207)
+        sha512_block_p8(ctx, inp, len);
+    else
+        sha512_block_ppc(ctx, inp, len);
+}
+
 static sigjmp_buf ill_jmp;
 static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }

 void OPENSSL_ppc64_probe(void);
+void OPENSSL_altivec_probe(void);
+void OPENSSL_crypto207_probe(void);

 void OPENSSL_cpuid_setup(void)
 	{
@ -82,6 +111,45 @@ void OPENSSL_cpuid_setup(void)

 	OPENSSL_ppccap_P = 0;

+#if defined(_AIX)
+	if (sizeof(size_t) == 4) {
+		struct utsname uts;
+# if defined(_SC_AIX_KERNEL_BITMODE)
+		if (sysconf(_SC_AIX_KERNEL_BITMODE) != 64)
+			return;
+# endif
+		if (uname(&uts) != 0 || atoi(uts.version) < 6)
+			return;
+	}
+
+# if defined(__power_set)
+	/*
+	 * Value used in __power_set is a single-bit 1<<n one denoting
+	 * specific processor class. Incidentally 0xffffffff<<n can be
+	 * used to denote specific processor and its successors.
+	 */
+	if (sizeof(size_t) == 4) {
+		/* In 32-bit case PPC_FPU64 is always fastest [if option] */
+		if (__power_set(0xffffffffU<<13))       /* POWER5 and later */
+			OPENSSL_ppccap_P |= PPC_FPU64;
+	} else {
+		/* In 64-bit case PPC_FPU64 is fastest only on POWER6 */
+#  if 0		/* to keep compatibility with previous validations */
+		if (__power_set(0x1U<<14))              /* POWER6 */
+			OPENSSL_ppccap_P |= PPC_FPU64;
+#  endif
+	}
+
+	if (__power_set(0xffffffffU<<14))           /* POWER6 and later */
+		OPENSSL_ppccap_P |= PPC_ALTIVEC;
+
+	if (__power_set(0xffffffffU<<16))           /* POWER8 and later */
+		OPENSSL_ppccap_P |= PPC_CRYPTO207;
+
+	return;
+# endif
+#endif
+
 	memset(&ill_act,0,sizeof(ill_act));
 	ill_act.sa_handler = ill_handler;
 	ill_act.sa_mask    = all_masked;
@ -108,6 +176,11 @@ void OPENSSL_cpuid_setup(void)
 		{
 		OPENSSL_altivec_probe();
 		OPENSSL_ppccap_P |= PPC_ALTIVEC;
+		if (sigsetjmp(ill_jmp, 1) == 0)
+			{
+			OPENSSL_crypto207_probe();
+			OPENSSL_ppccap_P |= PPC_CRYPTO207;
+			}
 		}

 	sigaction (SIGILL,&ill_oact,NULL);
--- a/crypto/ppccpuid.pl
+++ b/crypto/ppccpuid.pl
@ -40,6 +40,16 @@ $code=<<___;
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0

+.globl	.OPENSSL_crypto207_probe
+.align	4
+.OPENSSL_crypto207_probe:
+	.long	0x7C000E99	# lvx_u		v0,0,r1
+	.long	0x10000508	# vcipher	v0,v0,v0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+.size	.OPENSSL_crypto207_probe,.-.OPENSSL_crypto207_probe
+
 .globl	.OPENSSL_wipe_cpu
 .align	4
 .OPENSSL_wipe_cpu:
--- a/crypto/sha/Makefile
+++ b/crypto/sha/Makefile
@ -73,6 +73,8 @@ sha512-sparcv9.s:asm/sha512-sparcv9.pl;	$(PERL) asm/sha512-sparcv9.pl $@ $(CFLAG
 sha1-ppc.s:	asm/sha1-ppc.pl;	$(PERL) asm/sha1-ppc.pl $(PERLASM_SCHEME) $@
 sha256-ppc.s:	asm/sha512-ppc.pl;	$(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
 sha512-ppc.s:	asm/sha512-ppc.pl;	$(PERL) asm/sha512-ppc.pl $(PERLASM_SCHEME) $@
+sha256p8-ppc.s:	asm/sha512p8-ppc.pl;	$(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@
+sha512p8-ppc.s:	asm/sha512p8-ppc.pl;	$(PERL) asm/sha512p8-ppc.pl $(PERLASM_SCHEME) $@

 sha1-parisc.s:	asm/sha1-parisc.pl;	$(PERL) asm/sha1-parisc.pl $(PERLASM_SCHEME) $@
 sha256-parisc.s:asm/sha512-parisc.pl;	$(PERL) asm/sha512-parisc.pl $(PERLASM_SCHEME) $@
@ -90,6 +92,9 @@ sha512-%.S:	asm/sha512-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
 sha1-armv4-large.o:	sha1-armv4-large.S
 sha256-armv4.o:		sha256-armv4.S
 sha512-armv4.o:		sha512-armv4.S
+sha1-armv8.o:		sha1-armv8.S
+sha256-armv8.o:		sha256-armv8.S
+sha512-armv8.o:		sha512-armv8.S

 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
--- a/crypto/sha/asm/sha1-armv4-large.pl
+++ b/crypto/sha/asm/sha1-armv4-large.pl
@ -52,8 +52,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 10%
 # improvement on Cortex A8 core and 12.2 cycles per byte.

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}

 $ctx="r0";
 $inp="r1";
@ -177,6 +189,7 @@ for($i=0;$i<5;$i++) {
 $code.=<<___;
 	teq	$Xi,sp
 	bne	.L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
 ___
 	&BODY_00_15(@V);	unshift(@V,pop(@V));
 	&BODY_16_19(@V);	unshift(@V,pop(@V));
@ -186,7 +199,6 @@ ___
 $code.=<<___;

 	ldr	$K,.LK_20_39		@ [+15+16*4]
-	sub	sp,sp,#25*4
 	cmn	sp,#0			@ [+3], clear carry to denote 20_39
 .L_20_39_or_60_79:
 ___
--- a/crypto/sha/asm/sha1-armv8.pl
+++ b/crypto/sha/asm/sha1-armv8.pl
@ -0,0 +1,343 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		hardware-assisted	software(*)
+# Apple A7	2.31			4.13 (+14%)
+# Cortex-A53	2.19			8.73 (+108%)
+# Cortex-A57	2.35			7.88 (+74%)
+#
+# (*)	Software results are presented mostly for reference purposes.
+
+$flavour = shift;
+$output  = shift;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+($ctx,$inp,$num)=("x0","x1","x2");
+@Xw=map("w$_",(3..17,19));
+@Xx=map("x$_",(3..17,19));
+@V=($A,$B,$C,$D,$E)=map("w$_",(20..24));
+($t0,$t1,$t2,$K)=map("w$_",(25..28));
+
+
+# Append the code for round $i of the 0..19 group to $code.
+# ($a,$b,$c,$d,$e) are the working registers in their current rotation and
+# $j=($i+2)&15 is the X[] slot updated by the interleaved message schedule
+# in rounds 14 and up.  Each round computes F as (c&b)|(d&~b), adds
+# rot(a,5) and F to e, and pre-adds K and the next X word to d, which
+# becomes e in the following round.
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+# even rounds below 15: extract the upper 32 bits of the 64-bit input
+# register as the next word
+$code.=<<___ if ($i<15 && !($i&1));
+	lsr	@Xx[$i+1],@Xx[$i],#32
+___
+# even rounds below 14: fetch the next 64 bits of input
+$code.=<<___ if ($i<14 && !($i&1));
+	ldr	@Xx[$i+2],[$inp,#`($i+2)*4-64`]
+___
+# odd rounds below 14: fix up the freshly loaded pair, with a different
+# instruction depending on __ARMEB__
+$code.=<<___ if ($i<14 && ($i&1));
+#ifdef	__ARMEB__
+	ror	@Xx[$i+1],@Xx[$i+1],#32
+#else
+	rev32	@Xx[$i+1],@Xx[$i+1]
+#endif
+___
+# rounds 0..13: plain round, no schedule update
+$code.=<<___ if ($i<14);
+	bic	$t0,$d,$b
+	and	$t1,$c,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	orr	$t0,$t0,$t1
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+# last round of the group: load the constant for the next group
+$code.=<<___ if ($i==19);
+	movz	$K,#0xeba1
+	movk	$K,#0x6ed9,lsl#16
+___
+# rounds 14..19: same round with the X[j] schedule update interleaved
+$code.=<<___ if ($i>=14);
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	bic	$t0,$d,$b
+	and	$t1,$c,$b
+	ror	$t2,$a,#27
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	add	$d,$d,$K		// future e+=K
+	orr	$t0,$t0,$t1
+	add	$e,$e,$t2		// e+=rot(a,5)
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+}
+
+# Append the code for round $i of the 40..59 group to $code.
+# F is computed as ((b|c)&d)|(b&c); the X[j] schedule update for
+# $j=($i+2)&15 is interleaved with the round.
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+# last round of the group: load the constant for the next group
+$code.=<<___ if ($i==59);
+	movz	$K,#0xc1d6
+	movk	$K,#0xca62,lsl#16
+___
+$code.=<<___;
+	orr	$t0,$b,$c
+	and	$t1,$b,$c
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	ror	$t2,$a,#27
+	and	$t0,$t0,$d
+	add	$d,$d,$K		// future e+=K
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	add	$e,$e,$t2		// e+=rot(a,5)
+	orr	$t0,$t0,$t1
+	ror	$b,$b,#2
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+}
+
+# Append the code for one parity round (F = b^c^d) to $code.  The driver
+# loops call this for rounds 20..39 and again for 60..79.  Rounds below 78
+# interleave the X[j] schedule update; rounds 78 and 79 instead start
+# loading the five context words from $ctx into X registers 1..5 for the
+# final accumulation.
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=($i+2)&15;
+
+# last round of the 20..39 group: load the constant for the next group
+$code.=<<___ if ($i==39);
+	movz	$K,#0xbcdc
+	movk	$K,#0x8f1b,lsl#16
+___
+$code.=<<___ if ($i<78);
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+2)&15]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+8)&15]
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	 eor	@Xw[$j],@Xw[$j],@Xw[($j+13)&15]
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+	 ror	@Xw[$j],@Xw[$j],#31
+___
+# round 78: no schedule update, begin reloading the context
+$code.=<<___ if ($i==78);
+	ldp	@Xw[1],@Xw[2],[$ctx]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	add	$d,$d,$K		// future e+=K
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	add	$d,$d,@Xw[($i+1)&15]	// future e+=X[i]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+# round 79: final round, no "future e" pre-additions
+$code.=<<___ if ($i==79);
+	ldp	@Xw[3],@Xw[4],[$ctx,#8]
+	eor	$t0,$d,$b
+	ror	$t2,$a,#27
+	eor	$t0,$t0,$c
+	add	$e,$e,$t2		// e+=rot(a,5)
+	ror	$b,$b,#2
+	ldr	@Xw[5],[$ctx,#16]
+	add	$e,$e,$t0		// e+=F(b,c,d)
+___
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,%function
+.align	6
+sha1_block_data_order:
+	ldr	x16,.LOPENSSL_armcap_P
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA1
+	b.ne	.Lv8_entry
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	$A,$B,[$ctx]
+	ldp	$C,$D,[$ctx,#8]
+	ldr	$E,[$ctx,#16]
+
+.Loop:
+	ldr	@Xx[0],[$inp],#64
+	movz	$K,#0x7999
+	sub	$num,$num,#1
+	movk	$K,#0x5a82,lsl#16
+#ifdef	__ARMEB__
+	ror	$Xx[0],@Xx[0],#32
+#else
+	rev32	@Xx[0],@Xx[0]
+#endif
+	add	$E,$E,$K		// warm it up
+	add	$E,$E,@Xw[0]
+___
+for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	add	$B,$B,@Xw[2]
+	add	$C,$C,@Xw[3]
+	add	$A,$A,@Xw[1]
+	add	$D,$D,@Xw[4]
+	add	$E,$E,@Xw[5]
+	stp	$A,$B,[$ctx]
+	stp	$C,$D,[$ctx,#8]
+	str	$E,[$ctx,#16]
+	cbnz	$num,.Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
+___
+{{{
+my ($ABCD,$E,$E0,$E1)=map("v$_.16b",(0..3));
+my @MSG=map("v$_.16b",(4..7));
+my @Kxx=map("v$_.4s",(16..19));
+my ($W0,$W1)=("v20.4s","v21.4s");
+my $ABCD_SAVE="v22.16b";
+
+# Entry point and start of the hardware-assisted loop.  The constant index
+# is spelled out as 0 for every addition emitted here: the $j counter used
+# by the round loop that follows has not been assigned yet at this point.
+$code.=<<___;
+.type	sha1_block_armv8,%function
+.align	6
+sha1_block_armv8:
+.Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adr	x4,.Lconst
+	eor	$E,$E,$E
+	ld1.32	{$ABCD},[$ctx],#16
+	ld1.32	{$E}[0],[$ctx]
+	sub	$ctx,$ctx,#16
+	ld1.32	{@Kxx[0]-@Kxx[3]},[x4]
+
+.Loop_hw:
+	ld1	{@MSG[0]-@MSG[3]},[$inp],#64
+	sub	$num,$num,#1
+	rev32	@MSG[0],@MSG[0]
+	rev32	@MSG[1],@MSG[1]
+
+	add.i32	$W0,@Kxx[0],@MSG[0]
+	rev32	@MSG[2],@MSG[2]
+	orr	$ABCD_SAVE,$ABCD,$ABCD	// offload
+
+	add.i32	$W1,@Kxx[0],@MSG[1]
+	rev32	@MSG[3],@MSG[3]
+	sha1h	$E1,$ABCD
+	sha1c	$ABCD,$E,$W0		// 0
+	add.i32	$W0,@Kxx[0],@MSG[2]
+	sha1su0	@MSG[0],@MSG[1],@MSG[2]
+___
+for ($j=0,$i=1;$i<20-3;$i++) {
+my $f=("c","p","m","p")[$i/5];
+$code.=<<___;
+	sha1h	$E0,$ABCD		// $i
+	sha1$f	$ABCD,$E1,$W1
+	add.i32	$W1,@Kxx[$j],@MSG[3]
+	sha1su1	@MSG[0],@MSG[3]
+___
+$code.=<<___ if ($i<20-4);
+	sha1su0	@MSG[1],@MSG[2],@MSG[3]
+___
+	($E0,$E1)=($E1,$E0);		($W0,$W1)=($W1,$W0);
+	push(@MSG,shift(@MSG));		$j++ if ((($i+3)%5)==0);
+}
+$code.=<<___;
+	sha1h	$E0,$ABCD		// $i
+	sha1p	$ABCD,$E1,$W1
+	add.i32	$W1,@Kxx[$j],@MSG[3]
+
+	sha1h	$E1,$ABCD		// 18
+	sha1p	$ABCD,$E0,$W0
+
+	sha1h	$E0,$ABCD		// 19
+	sha1p	$ABCD,$E1,$W1
+
+	add.i32	$E,$E,$E0
+	add.i32	$ABCD,$ABCD,$ABCD_SAVE
+
+	cbnz	$num,.Loop_hw
+
+	st1.32	{$ABCD},[$ctx],#16
+	st1.32	{$E}[0],[$ctx]
+
+	ldr	x29,[sp],#16
+	ret
+.size	sha1_block_armv8,.-sha1_block_armv8
+.align	6
+.Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.LOPENSSL_armcap_P:
+.quad	OPENSSL_armcap_P-.
+.asciz	"SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+.comm	OPENSSL_armcap_P,4,4
+___
+}}}
+
+{   # base encodings of the SHA1 instructions; register numbers are OR-ed in
+    my %opcode = (
+	"sha1c"		=> 0x5e000000,
+	"sha1p"		=> 0x5e001000,
+	"sha1m"		=> 0x5e002000,
+	"sha1su0"	=> 0x5e003000,
+	"sha1h"		=> 0x5e280800,
+	"sha1su1"	=> 0x5e281800,
+    );
+
+    # Turn "<mnemonic> <operands>" into a raw ".inst" word, with the
+    # original instruction text kept as a trailing comment.  Operands are
+    # two or three q/v registers; an empty string is returned when they do
+    # not parse.
+    sub unsha1 {
+	my ($mnemonic,$arg)=@_;
+
+	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o) {
+	    my $word = $opcode{$mnemonic}|$1|($2<<5)|($3<<16);
+	    return sprintf(".inst\t0x%08x\t//%s %s",$word,$mnemonic,$arg);
+	}
+	return "";
+    }
+}
+
+# Final pass over the generated text, one line at a time.
+foreach(split("\n",$code)) {
+
+	# evaluate arithmetic placed between backticks
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	# replace sha1* instructions on q/v registers with raw .inst words
+	s/\b(sha1\w+)\s+([qv].*)/unsha1($1,$2)/geo;
+
+	# drop a ".32"/".<x>32" mnemonic suffix and, when one was dropped,
+	# rewrite the ".16b" arrangement as ".4s"
+	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
+	# ld1/st1 of lane [0]: use the ".s" element form
+	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
+
+	print $_,"\n";
+}
+
+close STDOUT;
--- a/crypto/sha/asm/sha1-c64x-large.pl
+++ b/crypto/sha/asm/sha1-c64x-large.pl
@ -0,0 +1,230 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# This is a fully-unrolled SHA1 implementation. It's 25% faster than
+# the one with compact loops, doesn't use an in-memory ring buffer, as
+# everything is accommodated in registers, and has "perfect" interrupt
+# agility. The drawback is obviously the code size...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$K) = map("A$_",(16..20, 21..24));
+@V = ($A,$B,$C,$D,$E);
+@X = map("B$_",(16..31));
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+# Append the code for round $i of the 0..19 group to $code.
+# ($a,$b,$c,$d,$e) are the working registers in their current rotation and
+# $j=($i+1)&15 is the X[] slot updated by the message schedule from round
+# 15 on.  Rounds below 14 also load the input word two rounds ahead and
+# byte-swap the next one; round 14 only byte-swaps.
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___				if ($i<14);
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 LDNW	*${INP}++,@X[$i+2]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	 SWAP2	@X[$i+1],@X[$i+1]
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 SWAP4	@X[$i+1],@X[$i+1]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i==14);
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+||	 SWAP2	@X[$i+1],@X[$i+1]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 SWAP4	@X[$i+1],@X[$i+1]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i==15);
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+$code.=<<___				if ($i>15);
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	ANDN	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	OR	$F0,$F,$F		; F_00_19(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_00_19(B,C,D)
+___
+}
+
+# Append the code for one parity round (F = b^c^d) to $code.  The driver
+# loops call this for rounds 20..39 and again for 60..79.  Rounds below 79
+# interleave the X[j] schedule update for $j=($i+1)&15.  Round 79 instead
+# issues the conditional branch back to loop?, pre-fetches the first two
+# input words of the next block and adds the saved context to the working
+# registers.
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___				if ($i<79);
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	XOR	$c,$b,$F
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	XOR	$d,$F,$F		; F_20_39(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+___
+$code.=<<___				if ($i==79);
+|| [A0]	B	loop?
+|| [A0]	LDNW	*${INP}++,@X[0]		; pre-fetch input
+	ROTL	$a,5,$Arot		;; $i
+||	XOR	$c,$b,$F
+||	ADD	$K,$e,$e		; E+=K
+|| [A0]	LDNW	*${INP}++,@X[1]
+	XOR	$d,$F,$F		; F_20_39(B,C,D)
+||	ROTL	$b,30,$b
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+	ADD	$Arot,$e,$e		; E+=rot(A,5)
+	ADD	$F,$e,$e		; E+=F_20_39(B,C,D)
+||	ADD	$Bctx,$a,$a		; accumulate context
+||	ADD	$Cctx,$b,$b
+	ADD	$Dctx,$c,$c
+||	ADD	$Ectx,$d,$d
+||	ADD	$Actx,$e,$e
+;;===== branch to loop? is taken here
+___
+}
+
+# Append the code for round $i of the 40..59 group to $code.
+# ($a,$b,$c,$d,$e) are the working registers in their current rotation.
+# F is computed as (c&b)^(d&b)^(c&d); the X[j] schedule update for
+# $j=($i+1)&15 is interleaved with the round.
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e) = @_;
+my $j = ($i+1)&15;
+
+$code.=<<___;
+||	 XOR	@X[($j+2)&15],@X[$j],@X[$j]
+	ROTL	$a,5,$Arot		;; $i
+||	AND	$c,$b,$F
+||	AND	$d,$b,$F0
+||	ADD	$K,$e,$e		; E+=K
+||	 XOR	@X[($j+8)&15],@X[$j],@X[$j]
+	XOR	$F0,$F,$F
+||	AND	$c,$d,$F0
+||	ROTL	$b,30,$b
+||	 XOR	@X[($j+13)&15],@X[$j],@X[$j]
+||	ADD	@X[$i&15],$e,$e		; E+=X[i]
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	ADD	$Arot,$e,$e		; E+=rot(A,5)
+||	 ROTL	@X[$j],1,@X[$j]
+	ADD	$F,$e,$e		; E+=F_40_59(B,C,D)
+___
+}
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	sha1_block_data_order,_sha1_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc
+	MV	$NUM,A0			; reassign $NUM
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	LDW	*${CTX}[0],$A		; load A-E...
+   [A0]	LDW	*${CTX}[1],$B
+   [A0]	LDW	*${CTX}[2],$C
+   [A0]	LDW	*${CTX}[3],$D
+   [A0]	LDW	*${CTX}[4],$E
+   [A0]	LDNW	*${INP}++,@X[0]		; pre-fetch input
+   [A0]	LDNW	*${INP}++,@X[1]
+	NOP	3
+
+loop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MVD	$B,$Bctx
+||	SWAP2	@X[0],@X[0]
+||	MVKL	0x5a827999,$K
+	MVKH	0x5a827999,$K		; K_00_19
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MVD	$E,$Ectx
+||	SWAP4	@X[0],@X[0]
+___
+for ($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0x6ed9eba1,$K
+	MVKH	0x6ed9eba1,$K		; K_20_39
+___
+for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0x8f1bbcdc,$K
+	MVKH	0x8f1bbcdc,$K		; K_40_59
+___
+for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+||	MVKL	0xca62c1d6,$K
+	MVKH	0xca62c1d6,$K		; K_60_79
+___
+for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	BNOP	RA			; return
+	STW	$A,*${CTX}[0]		; emit A-E...
+	STW	$B,*${CTX}[1]
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
--- a/crypto/sha/asm/sha1-c64x.pl
+++ b/crypto/sha/asm/sha1-c64x.pl
@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x.
+#
+# November 2016
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Unlike its predecessor, the sha1-c64xplus module, this module has worse
+# interrupt agility. While the original added up to 5 cycles of delay to
+# the interrupt response, this module adds up to 100. The fully unrolled
+# implementation doesn't add any delay and is even 25% faster, but is
+# almost 5x larger...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
+($XPA,$XPB) = ("A5","B5");			# X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+$code=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.asg	sha1_block_data_order,_sha1_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0			; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	LDW	*${CTX}[0],$A		; load A-E...
+|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
+   [A0]	LDW	*${CTX}[1],$B
+|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
+   [A0]	LDW	*${CTX}[2],$C
+|| [A0]	MVK	0x00404,B0
+   [A0]	LDW	*${CTX}[3],$D
+|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
+   [A0]	LDW	*${CTX}[4],$E
+|| [A0]	MVC	B0,AMR			; setup circular addressing
+	LDNW	*${INP}++,$TX1		; pre-fetch input
+	NOP	1
+
+loop?:
+	MVKL	0x5a827999,$K
+||	ADDAW	SP,2,$XPB
+||	SUB	A0,1,A0
+	MVKH	0x5a827999,$K		; K_00_19
+||	MV	$A,$Actx
+||	MV	$B,$Bctx
+;;==================================================
+	B	body_00_13?		; BODY_00_13
+||	MVK	11,B0
+||	MV	$XPB,$XPA
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MVD	$E,$Ectx
+
+body_00_13?:
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX3		; byte swap
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+
+	ADD	$TX3,$T,$A		; A=T+Xi
+||	STW	$TX3,*${XPB}++
+||	BDEC	body_00_13?,B0
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_14
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
+||	LDW	*${XPB}[4],$X2		; 2 iterations ahead
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_15
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+;;==================================================
+||	B	body_16_19?		; BODY_16_19
+||	MVK	1,B0
+
+body_16_19?:
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	BDEC	body_16_19?,B0
+
+	MVKL	0x6ed9eba1,$K
+||	MVK	17,B0
+	MVKH	0x6ed9eba1,$K		; K_20_39
+___
+# Append a counted loop of parity rounds (F = b^c^d) to $code.  $label is
+# used both as the loop label and as the BDEC branch target, so the same
+# text can be emitted twice under different labels (see the two calls).
+# The iteration count comes from B0, which the surrounding code sets up
+# before each call.
+sub BODY_20_39 {
+my $label = shift;
+$code.=<<___;
+;;==================================================
+||	B	$label			; BODY_20_39
+
+$label:
+	ROTL	$A,5,$Arot
+||	XOR	$B,$C,$F
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$D,$F,$F		; F_20_39(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++		; last one is redundant
+||	XOR	$TX0,$TX1,$TX1
+||	BDEC	$label,B0
+___
+}	&BODY_20_39("body_20_39?");
+$code.=<<___;
+;;==================================================
+	MVKL	0x8f1bbcdc,$K
+||	MVK	17,B0
+	MVKH	0x8f1bbcdc,$K		; K_40_59
+||	B	body_40_59?		; BODY_40_59
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+
+body_40_59?:
+	ROTL	$A,5,$Arot
+||	XOR	$F0,$F,$F
+||	AND	$C,$D,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+||	BDEC	body_40_59?,B0
+
+	MVKL	0xca62c1d6,$K
+||	MVK	16,B0
+	MVKH	0xca62c1d6,$K		; K_60_79
+___
+	&BODY_20_39("body_60_78?");	# BODY_60_78
+$code.=<<___;
+;;==================================================
+   [A0]	B	loop?
+||	ROTL	$A,5,$Arot		; BODY_79
+||	XOR	$B,$C,$F
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
+||	ADD	$K,$E,$T		; T=E+K
+||	XOR	$D,$F,$F		; F_20_39(B,C,D)
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
+||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+
+	ADD	$Actx,$A,$A		; A+=Actx
+||	ADD	$Cctx,$C,$C		; C+=Cctx
+;; end of loop?
+
+	BNOP	RA			; return
+||	MV	FP,SP			; restore stack pointer
+||	LDW	*FP[0],FP		; restore frame pointer
+	STW	$A,*${CTX}[0]		; emit A-E...
+||	MVK	0,B0
+	STW	$B,*${CTX}[1]
+||	MVC	B0,AMR			; clear AMR
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
--- a/crypto/sha/asm/sha1-c64xplus.pl
+++ b/crypto/sha/asm/sha1-c64xplus.pl
@ -0,0 +1,323 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x+.
+#
+# November 2011
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Fully unrolled assembler would be ~5x larger and is likely to be
+# ~15% faster. It would be free from references to intermediate ring
+# buffer, but put more pressure on L1P [both because the code would be
+# larger and won't be using SPLOOP buffer]. There are no plans to
+# realize fully unrolled variant though...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
+($XPA,$XPB) = ("A5","B5");			# X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM
+
+$code=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha1_block_data_order
+_sha1_block_data_order:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0			; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA			; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	LDW	*${CTX}[0],$A		; load A-E...
+|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
+   [A0]	LDW	*${CTX}[1],$B
+|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
+   [A0]	LDW	*${CTX}[2],$C
+|| [A0]	MVK	0x00404,B0
+   [A0]	LDW	*${CTX}[3],$D
+|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
+   [A0]	LDW	*${CTX}[4],$E
+|| [A0]	MVC	B0,AMR			; setup circular addressing
+	LDNW	*${INP}++,$TX1		; pre-fetch input
+	NOP	1
+
+loop?:
+	MVK	0x00007999,$K
+||	ADDAW	SP,2,$XPA
+||	SUB	A0,1,A0
+||	MVK	13,B0
+	MVKH	0x5a820000,$K		; K_00_19
+||	ADDAW	SP,2,$XPB
+||	MV	$A,$Actx
+||	MV	$B,$Bctx
+;;==================================================
+	SPLOOPD	5			; BODY_00_13
+||	MV	$C,$Cctx
+||	MV	$D,$Dctx
+||	MV	$E,$Ectx
+||	MVC	B0,ILC
+
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX3		; byte swap
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+
+	ADD	$TX3,$T,$A		; A=T+Xi
+||	STW	$TX3,*${XPB}++
+	SPKERNEL
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_14
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+||	LDNW	*${INP}++,$TX1
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
+||	LDW	*${XPB}[4],$X2		; 2 iterations ahead
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+;;==================================================
+	ROTL	$A,5,$Arot		; BODY_15
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+||	SWAP2	$TX1,$TX2
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	SWAP4	$TX2,$TX2		; byte swap
+||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	MVK	3,B0
+;;==================================================
+	SPLOOPD	5			; BODY_16_19
+||	MVC	B0,ILC
+
+	ROTL	$A,5,$Arot
+||	AND	$C,$B,$F
+||	ANDN	$D,$B,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_00_19(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+	SPKERNEL
+
+	MVK	0xffffeba1,$K
+||	MVK	19,B0
+	MVKH	0x6ed90000,$K		; K_20_39
+___
+sub BODY_20_39 {
+$code.=<<___;
+;;==================================================
+	SPLOOPD	5			; BODY_20_39
+||	MVC	B0,ILC
+
+	ROTL	$A,5,$Arot
+||	XOR	$B,$C,$F
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$D,$F,$F		; F_20_39(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++		; last one is redundant
+||	XOR	$TX0,$TX1,$TX1
+	SPKERNEL
+___
+$code.=<<___ if (!shift);
+	MVK	0xffffbcdc,$K
+	MVKH	0x8f1b0000,$K		; K_40_59
+___
+}	&BODY_20_39();
+$code.=<<___;
+;;==================================================
+	SPLOOPD	5			; BODY_40_59
+||	MVC	B0,ILC
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+
+	ROTL	$A,5,$Arot
+||	XOR	$F0,$F,$F
+||	AND	$C,$D,$F0
+||	ADD	$K,$E,$T		; T=E+K
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+	XOR	$F0,$F,$F		; F_40_59(B,C,D)
+||	MV	$D,$E			; E=D
+||	MV	$C,$D			; D=C
+
+	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+||	XOR	$X0,$X2,$TX0
+||	LDW	*${XPA}++,$X0
+||	LDW	*${XPB}[4],$X2
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	MV	$A,$B			; B=A
+||	XOR	$X8,$X13,$TX1
+||	LDW	*${XPA}[7],$X8
+||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
+||	MV	$TX2,$TX3
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+||	STW	$TX2,*${XPB}++
+||	XOR	$TX0,$TX1,$TX1
+||	AND	$B,$C,$F
+||	AND	$B,$D,$F0
+	SPKERNEL
+
+	MVK	0xffffc1d6,$K
+||	MVK	18,B0
+	MVKH	0xca620000,$K		; K_60_79
+___
+	&BODY_20_39(-1);		# BODY_60_78
+$code.=<<___;
+;;==================================================
+   [A0]	B	loop?
+||	ROTL	$A,5,$Arot		; BODY_79
+||	XOR	$B,$C,$F
+||	ROTL	$TX1,1,$TX2		; Xupdate output
+
+   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
+||	ADD	$K,$E,$T		; T=E+K
+||	XOR	$D,$F,$F		; F_20_39(B,C,D)
+
+	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
+||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
+||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
+||	ROTL	$B,30,$C		; C=ROL(B,30)
+
+	ADD	$Arot,$T,$T		; T+=ROL(A,5)
+||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx
+
+	ADD	$TX2,$T,$A		; A=T+Xi
+
+	ADD	$Actx,$A,$A		; A+=Actx
+||	ADD	$Cctx,$C,$C		; C+=Cctx
+;; end of loop?
+
+	BNOP	RA			; return
+||	MV	FP,SP			; restore stack pointer
+||	LDW	*FP[0],FP		; restore frame pointer
+	STW	$A,*${CTX}[0]		; emit A-E...
+||	MVK	0,B0
+	STW	$B,*${CTX}[1]
+||	MVC	B0,AMR			; clear AMR
+	STW	$C,*${CTX}[2]
+	STW	$D,*${CTX}[3]
+	STW	$E,*${CTX}[4]
+	.endasmfunc
+
+	.sect	.const
+	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT;
--- a/crypto/sha/asm/sha1-mips.pl
+++ b/crypto/sha/asm/sha1-mips.pl
@ -42,7 +42,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@ -64,7 +64,7 @@ if ($flavour =~ /64|n32/i) {
 #
 ######################################################################

-$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;

 for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);   }
 open STDOUT,">$output";
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@ -210,7 +210,7 @@ Lunaligned:
 	srwi.	$t1,$t1,6	; t1/=64
 	beq	Lcross_page
 	$UCMP	$num,$t1
-	ble-	Laligned	; didn't cross the page boundary
+	ble	Laligned	; didn't cross the page boundary
 	mtctr	$t1
 	subfc	$num,$t1,$num
 	bl	Lsha1_block_private
@ -238,7 +238,7 @@ Lmemcpy:
 	bl	Lsha1_block_private
 	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
 	addic.	$num,$num,-1
-	bne-	Lunaligned
+	bne	Lunaligned

 Ldone:
 	$POP	r0,`$FRAME+$LRSAVE`($sp)
@ -312,7 +312,7 @@ $code.=<<___;
 	stw	r20,16($ctx)
 	mr	$E,r20
 	addi	$inp,$inp,`16*4`
-	bdnz-	Lsha1_block_private
+	bdnz	Lsha1_block_private
 	blr
 	.long	0
 	.byte	0,12,0x14,0,0,0,0,0
--- a/crypto/sha/asm/sha256-armv4.pl
+++ b/crypto/sha/asm/sha256-armv4.pl
@ -23,8 +23,20 @@
 # Profiler-assisted and platform-specific optimization resulted in 16%
 # improvement on Cortex A8 core and ~17 cycles per processed byte.

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}

 $ctx="r0";	$t0="r0";
 $inp="r1";	$t3="r1";
--- a/crypto/sha/asm/sha256-c64x.pl
+++ b/crypto/sha/asm/sha256-c64x.pl
@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x.
+#
+# November 2016
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K256="A3";
+
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+	=map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+	=map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5");			# circular/ring buffer
+ $CTXB=$t2e;
+
+($Xn,$X0,$K)=("B7","B8","B9");
+($Maj,$Ch)=($T2,"B6");
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	sha256_block_data_order,_sha256_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN			; big-endian: input byte swaps become plain moves
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha256_block_data_order
+_sha256_block_data_order:
+__sha256_block:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	ADDKPC	_sha256_block_data_order,B2
+|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
+	.if	__TI_EABI__
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
+	.else
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	(K256-__sha256_block),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	(K256-__sha256_block),$K256
+	.endif
+   [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	MV	SP,$Xia
+   [A0]	MV	SP,$Xib
+|| [A0]	ADD	B2,$K256,$K256
+|| [A0]	MV	$CTXA,$CTXB
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	LDW	*${CTXA}[0],$A			; load ctx
+||	LDW	*${CTXB}[4],$E
+	LDW	*${CTXA}[1],$B
+||	LDW	*${CTXB}[5],$F
+	LDW	*${CTXA}[2],$C
+||	LDW	*${CTXB}[6],$G
+	LDW	*${CTXA}[3],$D
+||	LDW	*${CTXB}[7],$H
+
+	LDNW	*$INP++,$Xn			; pre-fetch input
+	LDW	*$K256++,$K			; pre-fetch K256[0]
+	NOP
+	ADDAW	$Xia,9,$Xia
+outerloop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MV	$E,$Ectx
+||	MVD	$B,$Bctx
+||	MVD	$F,$Fctx
+	MV	$C,$Cctx
+||	MV	$G,$Gctx
+||	MVD	$D,$Dctx
+||	MVD	$H,$Hctx
+||	SWAP4	$Xn,$X0
+
+	MVK	14,B0				; loop counter
+||	SWAP2	$X0,$X0
+
+loop_00_14?:					; BODY_00_14
+	LDNW	*$INP++,$Xn
+||	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+|| [B0]	BDEC	loop_00_14?,B0
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X14
+||	SWAP4	$Xn,$X0
+	SWAP2	$X0,$X0
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+	MV	$B,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+;;===== branch to loop00_14? is taken here
+
+	ROTL	$A,30,$S0			; BODY_15
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	LDW	*${Xib}[2],$X1			; modulo-scheduled
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X15
+	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; modulo-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+||	ROTL	$B,0,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+
+	MVK	47,B1				; loop counter
+||	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+
+loop_16_63?:					; BODY_16_63
+	XOR	$t0e,$s0,$s0
+||	XOR	$t0a,$s1,$s1
+||	MV	$X15,$X14
+||	MV	$X1,$Xn
+	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
+||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
+||	LDW	*${Xib}[2],$X1			; modulo-scheduled
+	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$H,$K,$T1			; T1 = h + K256[i]
+||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
+|| [B1]	BDEC	loop_16_63?,B1
+	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+||	ADD	$X0,$T1,$T1			; T1 += X[i]
+||	STW	$X0,*$Xib++
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+||	MV	$X0,$X15
+||	ROTL	$G,0,$H				; h = g
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	MV	$F,$G				; g = f
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*++$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; modulo-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$B,$C				; c = b
+	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+||	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+;;===== branch to loop16_63? is taken here
+
+   [A0]	B	outerloop?
+|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
+|| [A0]	ADDK	-260,$K256			; rewind K256
+||	ADD	$Actx,$A,$A			; accumulate ctx
+||	ADD	$Ectx,$E,$E
+||	ADD	$Bctx,$B,$B
+	ADD	$Fctx,$F,$F
+||	ADD	$Cctx,$C,$C
+||	ADD	$Gctx,$G,$G
+||	ADD	$Dctx,$D,$D
+||	ADD	$Hctx,$H,$H
+|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]
+
+  [!A0]	BNOP	RA
+||[!A0]	MV	$CTXA,$CTXB
+  [!A0]	MV	FP,SP				; restore stack pointer
+||[!A0]	LDW	*FP[0],FP			; restore frame pointer
+  [!A0]	STW	$A,*${CTXA}[0]  		; save ctx
+||[!A0]	STW	$E,*${CTXB}[4]
+||[!A0]	MVK	0,B0
+  [!A0]	STW	$B,*${CTXA}[1]
+||[!A0]	STW	$F,*${CTXB}[5]
+||[!A0]	MVC	B0,AMR				; clear AMR
+	STW	$C,*${CTXA}[2]
+||	STW	$G,*${CTXB}[6]
+	STW	$D,*${CTXA}[3]
+||	STW	$H,*${CTXB}[7]
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
+	.sect	".const:sha_asm"
+	.endif
+	.align	128
+K256:
+	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+	.cstring "SHA256 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+
+___
+
+print $code;
--- a/crypto/sha/asm/sha256-c64xplus.pl
+++ b/crypto/sha/asm/sha256-c64xplus.pl
@ -0,0 +1,292 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x+.
+#
+# January 2012
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K256="A3";
+
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+	=map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+	=map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5");			# circular/ring buffer
+ $CTXB=$t2e;
+
+($Xn,$X0,$K)=("B7","B8","B9");
+($Maj,$Ch)=($T2,"B6");
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN			; big-endian: input byte swaps become plain moves
+	.asg	MV,SWAP2
+	.asg	MV,SWAP4
+	.endif
+
+	.global	_sha256_block_data_order
+_sha256_block_data_order:
+	.asmfunc stack_usage(64)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-64,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
+|| [A0]	MV	SP,FP
+   [A0]	ADDKPC	_sha256_block_data_order,B2
+|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
+   [A0]	MVK	0x00404,B1
+|| [A0]	MVKL	(K256-_sha256_block_data_order),$K256
+   [A0]	MVKH	0x50000,B1
+|| [A0]	MVKH	(K256-_sha256_block_data_order),$K256
+   [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	MV	SP,$Xia
+   [A0]	MV	SP,$Xib
+|| [A0]	ADD	B2,$K256,$K256
+|| [A0]	MV	$CTXA,$CTXB
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	LDW	*${CTXA}[0],$A			; load ctx
+||	LDW	*${CTXB}[4],$E
+	LDW	*${CTXA}[1],$B
+||	LDW	*${CTXB}[5],$F
+	LDW	*${CTXA}[2],$C
+||	LDW	*${CTXB}[6],$G
+	LDW	*${CTXA}[3],$D
+||	LDW	*${CTXB}[7],$H
+
+	LDNW	*$INP++,$Xn			; pre-fetch input
+	LDW	*$K256++,$K			; pre-fetch K256[0]
+	MVK	14,B0				; loop counters
+	MVK	47,B1
+||	ADDAW	$Xia,9,$Xia
+outerloop?:
+	SUB	A0,1,A0
+||	MV	$A,$Actx
+||	MV	$E,$Ectx
+||	MVD	$B,$Bctx
+||	MVD	$F,$Fctx
+	MV	$C,$Cctx
+||	MV	$G,$Gctx
+||	MVD	$D,$Dctx
+||	MVD	$H,$Hctx
+||	SWAP4	$Xn,$X0
+
+	SPLOOPD	8				; BODY_00_14
+||	MVC	B0,ILC
+||	SWAP2	$X0,$X0
+
+	LDNW	*$INP++,$Xn
+||	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X14
+||	SWAP4	$Xn,$X0
+	SWAP2	$X0,$X0
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+	MV	$B,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+	SPKERNEL
+
+	ROTL	$A,30,$S0			; BODY_15
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	LDW	*${Xib}[2],$X1			; modulo-scheduled
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$K,$H,$T1			; T1 = h + K256[i]
+	ADD	$X0,$T1,$T1			; T1 += X[i];
+||	STW	$X0,*$Xib++
+||	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	ROTL	$G,0,$H				; h = g
+||	MV	$F,$G				; g = f
+||	MV	$X0,$X15
+	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; modulo-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+||	ROTL	$B,0,$C				; c = b
+||	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+
+	SPLOOPD	10				; BODY_16_63
+||	MVC	B1,ILC
+||	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+
+	XOR	$t0e,$s0,$s0
+||	XOR	$t0a,$s1,$s1
+||	MV	$X15,$X14
+||	MV	$X1,$Xn
+	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
+||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
+||	LDW	*${Xib}[2],$X1			; modulo-scheduled
+	ROTL	$A,30,$S0
+||	OR	$A,$B,$Maj
+||	AND	$A,$B,$t2a
+||	ROTL	$E,26,$S1
+||	AND	$F,$E,$Ch
+||	ANDN	$G,$E,$t2e
+||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
+	ROTL	$A,19,$t0a
+||	AND	$C,$Maj,$Maj
+||	ROTL	$E,21,$t0e
+||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
+||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
+	ROTL	$A,10,$t1a
+||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ROTL	$E,7,$t1e
+||	ADD	$H,$K,$T1			; T1 = h + K256[i]
+||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
+	XOR	$t0a,$S0,$S0
+||	XOR	$t0e,$S1,$S1
+||	ADD	$X0,$T1,$T1			; T1 += X[i]
+||	STW	$X0,*$Xib++
+	XOR	$t1a,$S0,$S0			; Sigma0(a)
+||	XOR	$t1e,$S1,$S1			; Sigma1(e)
+||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
+||	MV	$X0,$X15
+||	ROTL	$G,0,$H				; h = g
+||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
+	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
+||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
+||	MV	$F,$G				; g = f
+||	MV	$Xn,$X0				; modulo-scheduled
+||	LDW	*++$Xia,$X9			; modulo-scheduled
+||	ROTL	$X1,25,$t0e			; modulo-scheduled
+||	ROTL	$X14,15,$t0a			; modulo-scheduled
+	ROTL	$X1,14,$t1e			; modulo-scheduled
+||	ROTL	$X14,13,$t1a			; modulo-scheduled
+||	MV	$E,$F				; f = e
+||	ADD	$D,$T1,$E			; e = d + T1
+||	MV	$C,$D				; d = c
+||	MV	$B,$C				; c = b
+	MV	$A,$B				; b = a
+||	ADD	$T1,$T2,$A			; a = T1 + T2
+||	SHRU	$X1,3,$s0			; modulo-scheduled
+||	SHRU	$X14,10,$s1			; modulo-scheduled
+	SPKERNEL
+
+   [A0]	B	outerloop?
+|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
+|| [A0]	ADDK	-260,$K256			; rewind K256
+||	ADD	$Actx,$A,$A			; accumulate ctx
+||	ADD	$Ectx,$E,$E
+||	ADD	$Bctx,$B,$B
+	ADD	$Fctx,$F,$F
+||	ADD	$Cctx,$C,$C
+||	ADD	$Gctx,$G,$G
+||	ADD	$Dctx,$D,$D
+||	ADD	$Hctx,$H,$H
+|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]
+
+  [!A0]	BNOP	RA
+||[!A0]	MV	$CTXA,$CTXB
+  [!A0]	MV	FP,SP				; restore stack pointer
+||[!A0]	LDW	*FP[0],FP			; restore frame pointer
+  [!A0]	STW	$A,*${CTXA}[0]  		; save ctx
+||[!A0]	STW	$E,*${CTXB}[4]
+||[!A0]	MVK	0,B0
+  [!A0]	STW	$B,*${CTXA}[1]
+||[!A0]	STW	$F,*${CTXB}[5]
+||[!A0]	MVC	B0,AMR				; clear AMR
+	STW	$C,*${CTXA}[2]
+||	STW	$G,*${CTXB}[6]
+	STW	$D,*${CTXA}[3]
+||	STW	$H,*${CTXB}[7]
+	.endasmfunc
+
+	.sect	".const:sha_asm"
+	.align	128
+K256:
+	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+
+___
+
+print $code;
--- a/crypto/sha/asm/sha512-armv4.pl
+++ b/crypto/sha/asm/sha512-armv4.pl
@ -38,8 +38,20 @@ $hi="HI";
 $lo="LO";
 # ====================================================================

-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}

 $ctx="r0";	# parameter block
 $inp="r1";
@ -221,17 +233,21 @@ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
 .LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-sha512_block_data_order
+.word	OPENSSL_armcap_P-.Lsha512_block_data_order
 .skip	32-4

 .global	sha512_block_data_order
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
+.Lsha512_block_data_order:
 	sub	r3,pc,#8		@ sha512_block_data_order
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
 #if __ARM_ARCH__>=7
 	ldr	r12,.LOPENSSL_armcap
 	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
 	tst	r12,#1
 	bne	.LNEON
 #endif
--- a/crypto/sha/asm/sha512-armv8.pl
+++ b/crypto/sha/asm/sha512-armv8.pl
@ -0,0 +1,428 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256/512 for ARMv8.
+#
+# Performance in cycles per processed byte and improvement coefficient
+# over code generated with "default" compiler:
+#
+#		SHA256-hw	SHA256(*)	SHA512
+# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+# Cortex-A53	2.38		15.6 (+110%)	10.1 (+190%(***))
+# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+# 
+# (*)	Software SHA256 results are of lesser relevance, presented
+#	mostly for informational purposes.
+# (**)	The result is a trade-off: it's possible to improve it by
+#	10% (or by 1 cycle per round), but at the cost of 20% loss
+#	on Cortex-A53 (or by 4 cycles per round).
+# (***)	Super-impressive coefficients over gcc-generated code are
+#	indication of some compiler "pathology", most notably code
+#	generated with -mgeneral-regs-only is significantly faster
+#	and lags behind assembly only by 50-90%.
+
+# Command line: <flavour> <output>.  Both are handed on to arm-xlate.pl,
+# which receives everything this script prints on its standard input.
+$flavour=shift;
+$output=shift;
+
+# Look for arm-xlate.pl next to this script, then in ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Fail loudly if the translator cannot be started; otherwise all generated
+# code would be written to an unopened handle and silently lost.
+open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Parameter sets for the two supported hash sizes.  SZ is the word size in
+# bytes, reg_t the register-name prefix used for the working registers, and
+# the four arrays hold the rotate/shift amounts used by the round code.
+my %sha_params = (
+	512 => { SZ=>8, rounds=>80, reg_t=>"x",
+		 Sigma0=>[28,34,39], Sigma1=>[14,18,41],
+		 sigma0=>[ 1, 8, 7], sigma1=>[19,61, 6] },
+	256 => { SZ=>4, rounds=>64, reg_t=>"w",
+		 Sigma0=>[ 2,13,22], Sigma1=>[ 6,11,25],
+		 sigma0=>[ 7,18, 3], sigma1=>[17,19,10] },
+);
+
+# An output file name containing "512" selects SHA-512, anything else SHA-256.
+$BITS   = ($output =~ /512/) ? 512 : 256;
+$SZ     = $sha_params{$BITS}{SZ};
+$rounds = $sha_params{$BITS}{rounds};
+$reg_t  = $sha_params{$BITS}{reg_t};
+@Sigma0 = @{$sha_params{$BITS}{Sigma0}};
+@Sigma1 = @{$sha_params{$BITS}{Sigma1}};
+@sigma0 = @{$sha_params{$BITS}{sigma0}};
+@sigma1 = @{$sha_params{$BITS}{sigma1}};
+
+$func = "sha".$BITS."_block_data_order";
+
+# Pointer/counter arguments and the constant-table pointer are always
+# 64-bit "x" registers.
+($ctx,$inp,$num,$Ktbl) = ("x0","x1","x2","x30");
+
+# Message schedule ring, working variables a..h and temporaries use the
+# size-dependent register prefix.
+@X = map($reg_t.$_, 3..15, 0..2);
+@V = ($A,$B,$C,$D,$E,$F,$G,$H) = map($reg_t.$_, 20..27);
+($t0,$t1,$t2,$t3) = map($reg_t.$_, 16,17,19,28);
+
+# Append the assembly for one round to $code.
+#   $i      - round index; the callers in this file pass 0..31
+#   $a..$h  - names of the registers holding the eight working variables
+#             in this round
+# Rounds below 16 also byte-reverse the input word (unless __ARMEB__ is
+# defined) and fetch further input; rounds 15 and up additionally compute
+# the next message schedule word X[$j].  For rounds below 15 the final
+# "h+=Sigma0(a)" addition is left out (see the commented-out instruction)
+# and is instead issued at the start of the following round.
+# The globals $t2 and $t3 are exchanged before returning.
+sub BODY_00_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my $j=($i+1)&15;
+# scratch registers are borrowed from entries of the @X ring
+my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
+   $T0=@X[$i+3] if ($i<11);
+
+# rounds 0..15: byte-reverse the input word unless __ARMEB__ is defined
+$code.=<<___	if ($i<16);
+#ifndef	__ARMEB__
+	rev	@X[$i],@X[$i]			// $i
+#endif
+___
+# odd rounds below 13: load the next pair of input words, post-incrementing
+$code.=<<___	if ($i<13 && ($i&1));
+	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
+___
+# round 13: load the last pair of input words, without post-increment
+$code.=<<___	if ($i==13);
+	ldp	@X[14],@X[15],[$inp]
+___
+# rounds 14 and up: reload a ring entry from the stack scratch area
+$code.=<<___	if ($i>=14);
+	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
+___
+# rounds 1..15: Sigma0 addition carried over from the previous round
+$code.=<<___	if ($i>0 && $i<16);
+	add	$a,$a,$t1			// h+=Sigma0(a)
+___
+# rounds 11 and up: save a ring entry to the stack scratch area
+$code.=<<___	if ($i>=11);
+	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
+___
+# While ARMv8 specifies merged rotate-n-logical operation such as
+# 'eor x,y,z,ror#n', it was found to negatively affect performance
+# on Apple A7. The reason seems to be that it requires even 'y' to
+# be available earlier. This means that such merged instruction is
+# not necessarily best choice on critical path... On the other hand
+# Cortex-A5x handles merged instructions much better than disjoint
+# rotate and logical... See (**) footnote above.
+# rounds 0..14: round function only, no message schedule update
+$code.=<<___	if ($i<15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
+	and	$t1,$f,$e
+	bic	$t2,$g,$e
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	add	$d,$d,$h			// d+=h
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	//add	$h,$h,$t1			// h+=Sigma0(a)
+___
+# rounds 15 and up: round function interleaved with the computation of the
+# next schedule word X[$j]
+$code.=<<___	if ($i>=15);
+	ror	$t0,$e,#$Sigma1[0]
+	add	$h,$h,$t2			// h+=K[i]
+	ror	$T1,@X[($j+1)&15],#$sigma0[0]
+	and	$t1,$f,$e
+	ror	$T2,@X[($j+14)&15],#$sigma1[0]
+	bic	$t2,$g,$e
+	ror	$T0,$a,#$Sigma0[0]
+	add	$h,$h,@X[$i&15]			// h+=X[i]
+	eor	$t0,$t0,$e,ror#$Sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
+	orr	$t1,$t1,$t2			// Ch(e,f,g)
+	eor	$t2,$a,$b			// a^b, b^c in next round
+	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
+	eor	$T0,$T0,$a,ror#$Sigma0[1]
+	add	$h,$h,$t1			// h+=Ch(e,f,g)
+	and	$t3,$t3,$t2			// (b^c)&=(a^b)
+	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
+	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
+	add	$h,$h,$t0			// h+=Sigma1(e)
+	eor	$t3,$t3,$b			// Maj(a,b,c)
+	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
+	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
+	add	@X[$j],@X[$j],@X[($j+9)&15]
+	add	$d,$d,$h			// d+=h
+	add	$h,$h,$t3			// h+=Maj(a,b,c)
+	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
+	add	@X[$j],@X[$j],$T1
+	add	$h,$h,$t1			// h+=Sigma0(a)
+	add	@X[$j],@X[$j],$T2
+___
+	# the register just loaded with the next K word is used as $t2 by
+	# the following round
+	($t2,$t3)=($t3,$t2);
+}
+
+$code.=<<___;
+#include "arm_arch.h"
+
+.text
+
+.extern	OPENSSL_armcap_P
+.globl	$func
+.type	$func,%function
+.align	6
+$func:
+___
+$code.=<<___	if ($SZ==4);
+	ldr	x16,.LOPENSSL_armcap_P
+	adr	x17,.LOPENSSL_armcap_P
+	add	x16,x16,x17
+	ldr	w16,[x16]
+	tst	w16,#ARMV8_SHA256
+	b.ne	.Lv8_entry
+___
+$code.=<<___;
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*$SZ
+
+	ldp	$A,$B,[$ctx]				// load context
+	ldp	$C,$D,[$ctx,#2*$SZ]
+	ldp	$E,$F,[$ctx,#4*$SZ]
+	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
+	ldp	$G,$H,[$ctx,#6*$SZ]
+	adr	$Ktbl,.LK$BITS
+	stp	$ctx,$num,[x29,#96]
+
+.Loop:
+	ldp	@X[0],@X[1],[$inp],#2*$SZ
+	ldr	$t2,[$Ktbl],#$SZ			// *K++
+	eor	$t3,$B,$C				// magic seed
+	str	$inp,[x29,#112]
+___
+# Rounds 0..15, emitted once per input block.  Rotating @V after every call
+# renames the working variables, so the register passed as h in one round
+# is passed as a in the next.
+for ($i=0;$i<16;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+# Sixteen more rounds form the body of .Loop_16_xx; the cbnz emitted after
+# this loop repeats it until the zero terminator of the K table is loaded.
+$code.=".Loop_16_xx:\n";
+for (;$i<32;$i++)	{ &BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	cbnz	$t2,.Loop_16_xx
+
+	ldp	$ctx,$num,[x29,#96]
+	ldr	$inp,[x29,#112]
+	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind
+
+	ldp	@X[0],@X[1],[$ctx]
+	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
+	add	$inp,$inp,#14*$SZ			// advance input pointer
+	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
+	add	$A,$A,@X[0]
+	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
+	add	$B,$B,@X[1]
+	add	$C,$C,@X[2]
+	add	$D,$D,@X[3]
+	stp	$A,$B,[$ctx]
+	add	$E,$E,@X[4]
+	add	$F,$F,@X[5]
+	stp	$C,$D,[$ctx,#2*$SZ]
+	add	$G,$G,@X[6]
+	add	$H,$H,@X[7]
+	cmp	$inp,$num
+	stp	$E,$F,[$ctx,#4*$SZ]
+	stp	$G,$H,[$ctx,#6*$SZ]
+	b.ne	.Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*$SZ
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	ret
+.size	$func,.-$func
+
+.align	6
+.type	.LK$BITS,%object
+.LK$BITS:
+___
+$code.=<<___ if ($SZ==8);
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+	.quad	0	// terminator
+___
+$code.=<<___ if ($SZ==4);
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+	.long	0	//terminator
+___
+$code.=<<___;
+.size	.LK$BITS,.-.LK$BITS
+.align	3
+.LOPENSSL_armcap_P:
+	.quad	OPENSSL_armcap_P-.
+.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+if ($SZ==4) {
+my $Ktbl="x3";
+
+my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
+my @MSG=map("v$_.16b",(4..7));
+my ($W0,$W1)=("v16.4s","v17.4s");
+my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");
+
+$code.=<<___;
+.type	sha256_block_armv8,%function
+.align	6
+sha256_block_armv8:
+.Lv8_entry:
+	stp		x29,x30,[sp,#-16]!
+	add		x29,sp,#0
+
+	ld1.32		{$ABCD,$EFGH},[$ctx]
+	adr		$Ktbl,.LK256
+
+.Loop_hw:
+	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
+	sub		$num,$num,#1
+	ld1.32		{$W0},[$Ktbl],#16
+	rev32		@MSG[0],@MSG[0]
+	rev32		@MSG[1],@MSG[1]
+	rev32		@MSG[2],@MSG[2]
+	rev32		@MSG[3],@MSG[3]
+	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
+	orr		$EFGH_SAVE,$EFGH,$EFGH
+___
+for($i=0;$i<12;$i++) {
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	sha256su0	@MSG[0],@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+	sha256su1	@MSG[0],@MSG[2],@MSG[3]
+___
+	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
+}
+$code.=<<___;
+	ld1.32		{$W1},[$Ktbl],#16
+	add.i32		$W0,$W0,@MSG[0]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	ld1.32		{$W0},[$Ktbl],#16
+	add.i32		$W1,$W1,@MSG[1]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	ld1.32		{$W1},[$Ktbl]
+	add.i32		$W0,$W0,@MSG[2]
+	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W0
+	sha256h2	$EFGH,$abcd,$W0
+
+	add.i32		$W1,$W1,@MSG[3]
+	orr		$abcd,$ABCD,$ABCD
+	sha256h		$ABCD,$EFGH,$W1
+	sha256h2	$EFGH,$abcd,$W1
+
+	add.i32		$ABCD,$ABCD,$ABCD_SAVE
+	add.i32		$EFGH,$EFGH,$EFGH_SAVE
+
+	cbnz		$num,.Loop_hw
+
+	st1.32		{$ABCD,$EFGH},[$ctx]
+
+	ldr		x29,[sp],#16
+	ret
+.size	sha256_block_armv8,.-sha256_block_armv8
+___
+}
+
+$code.=<<___;
+.comm	OPENSSL_armcap_P,4,4
+___
+
+{   # Base instruction words of the SHA256 instructions; unsha256() ORs
+    # the register numbers into them.
+    my  %opcode = (
+	"sha256h"	=> 0x5e004000,
+	"sha256h2"	=> 0x5e005000,
+	"sha256su0"	=> 0x5e282800,
+	"sha256su1"	=> 0x5e006000,
+    );
+
+    # Turn "<mnemonic> <arg>" into a ".inst" directive carrying the encoded
+    # instruction word, with the original text kept as a trailing comment.
+    # $arg must name two or three q/v registers; their numbers are placed
+    # at bit 0, bit 5 and (when a third register is present) bit 16.
+    # Yields an empty string when $arg does not have that shape.
+    sub unsha256 {
+	my ($mnemonic,$arg)=@_;
+
+	return ""
+	    unless $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o;
+
+	my ($rd,$rn,$rm) = ($1,$2,$3);
+	my $word = $opcode{$mnemonic} | $rd | ($rn<<5) | ($rm<<16);
+
+	return sprintf(".inst\t0x%08x\t//%s %s", $word, $mnemonic, $arg);
+    }
+}
+
+# Post-process the generated text line by line and print it.
+foreach(split("\n",$code)) {
+
+	# replace every `...` span by the result of evaluating it as Perl
+	s/\`([^\`]*)\`/eval($1)/geo;
+
+	# encode SHA256 instructions as .inst directives
+	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;
+
+	# strip a ".32"/".i32"-style suffix; on lines that had one, rewrite
+	# .16b specifiers as .4s
+	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
+	# ld1/st1 lines that address element [0] use .s instead of .4s
+	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;
+
+	print $_,"\n";
+}
+
+# STDOUT is a pipe to arm-xlate.pl here: closing it flushes buffered output
+# and waits for the translator, so a failure must not go unnoticed.
+close STDOUT or die "error closing STDOUT: $!";
--- a/crypto/sha/asm/sha512-c64x.pl
+++ b/crypto/sha/asm/sha512-c64x.pl
@ -0,0 +1,437 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x.
+#
+# November 2016
+#
+# Performance is ~19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unroll won't make it, this implementation, any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+# Skip arguments until one looks like a file name and write the generated
+# code there; a failed open would otherwise silently discard all output.
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output" or die "can't open $output: $!";
+
+# Register allocation.  Names ending in "hi" live in the A register file,
+# names ending in "lo" (and the carry registers) in the B register file.
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K512="A3";
+
+# Working variables a..h interleaved with their *ctx counterparts, which
+# receive a copy of a..h at outerloop? and are added back in at break?.
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+
+# Temporaries.  Several names share a register: $T1hi is A6 like $NUM
+# (which the code moves to A0 on entry), $MAJ* alias $T2*, $t1hi aliases
+# $Khi, and $CTXB aliases $t1lo.
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
+($T1hi,         $T2hi)=         ("A6","A7");
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
+($Khi,$Klo)=("A9","A8");
+($MAJhi,$MAJlo)=($T2hi,$T2lo);
+($t1hi,$t1lo)=($Khi,"B2");
+ $CTXB=$t1lo;
+
+($Xihi,$Xilo)=("A5","B5");			# circular/ring buffer
+
+$code.=<<___;
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	sha512_block_data_order,_sha512_block_data_order
+	.endif
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	$Khi,KHI
+	.asg	$Klo,KLO
+	.else
+	.asg	$Khi,KLO
+	.asg	$Klo,KHI
+	.endif
+
+	.global	_sha512_block_data_order
+_sha512_block_data_order:
+__sha512_block:
+	.asmfunc stack_usage(40+128)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-128,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--(40)			; save frame pointer
+|| [A0]	MV	SP,FP
+   [A0]	STDW	B13:B12,*SP[4]
+|| [A0]	MVK	0x00404,B1
+   [A0]	STDW	B11:B10,*SP[3]
+|| [A0]	STDW	A13:A12,*FP[-3]
+|| [A0]	MVKH	0x60000,B1
+   [A0]	STDW	A11:A10,*SP[1]
+|| [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	ADD	B0,SP,SP			; alloca(128)
+	.if	__TI_EABI__
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	__sha512_block,B1
+|| [A0]	MVKL	\$PCR_OFFSET(K512,__sha512_block),$K512
+   [A0]	MVKH	\$PCR_OFFSET(K512,__sha512_block),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	.else
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	__sha512_block,B1
+|| [A0]	MVKL	(K512-__sha512_block),$K512
+   [A0]	MVKH	(K512-__sha512_block),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	.endif
+	ADDAW	SP,3,$Xilo
+	ADD	SP,4*2,$Xihi			; ADDAW	SP,2,$Xihi
+
+||	MV	$CTXA,$CTXB
+	LDW	*${CTXA}[0^.LITTLE_ENDIAN],$Ahi	; load ctx
+||	LDW	*${CTXB}[1^.LITTLE_ENDIAN],$Alo
+||	ADD	B1,$K512,$K512
+	LDW	*${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+||	LDW	*${CTXB}[3^.LITTLE_ENDIAN],$Blo
+	LDW	*${CTXA}[4^.LITTLE_ENDIAN],$Chi
+||	LDW	*${CTXB}[5^.LITTLE_ENDIAN],$Clo
+	LDW	*${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+||	LDW	*${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+	LDW	*${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+||	LDW	*${CTXB}[9^.LITTLE_ENDIAN],$Elo
+	LDW	*${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+||	LDW	*${CTXB}[11^.LITTLE_ENDIAN],$Flo
+	LDW	*${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+||	LDW	*${CTXB}[13^.LITTLE_ENDIAN],$Glo
+	LDW	*${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+||	LDW	*${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+
+	LDNDW	*$INP++,B11:B10			; pre-fetch input
+	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+outerloop?:
+	MVK	15,B0				; loop counters
+||	MVK	64,B1
+||	SUB	A0,1,A0
+	MV	$Ahi,$Actxhi
+||	MV	$Alo,$Actxlo
+||	MV	$Bhi,$Bctxhi
+||	MV	$Blo,$Bctxlo
+||	MV	$Chi,$Cctxhi
+||	MV	$Clo,$Cctxlo
+||	MVD	$Dhi,$Dctxhi
+||	MVD	$Dlo,$Dctxlo
+	MV	$Ehi,$Ectxhi
+||	MV	$Elo,$Ectxlo
+||	MV	$Fhi,$Fctxhi
+||	MV	$Flo,$Fctxlo
+||	MV	$Ghi,$Gctxhi
+||	MV	$Glo,$Gctxlo
+||	MVD	$Hhi,$Hctxhi
+||	MVD	$Hlo,$Hctxlo
+loop0_15?:
+	.if	.BIG_ENDIAN
+	MV	B11,$T1hi
+||	MV	B10,$T1lo
+	.else
+	SWAP4	B10,$T1hi
+||	SWAP4	B11,$T1lo
+	SWAP2	$T1hi,$T1hi
+||	SWAP2	$T1lo,$T1lo
+	.endif
+	STW	$T1hi,*$Xihi++[2]			; original loop16_79?
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
+loop16_79?:
+	XOR	$Fhi,$Ghi,$CHhi
+||	XOR	$Flo,$Glo,$CHlo
+||	ADD	KHI,$T1hi,$T1hi
+||	ADDU	KLO,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += K512[i]
+||	SHRU	$Elo,14,$t0lo
+||	SHL	$Elo,32-14,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Ehi,$CHhi,$CHhi
+||	AND	$Elo,$CHlo,$CHlo
+||	ROTL	$Ghi,0,$Hhi
+||	ROTL	$Glo,0,$Hlo				; h = g
+||	SHRU	$Ehi,18,$t0hi
+||	SHL	$Ehi,32-18,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	XOR	$Ghi,$CHhi,$CHhi
+||	XOR	$Glo,$CHlo,$CHlo			; Ch(e,f,g) = ((f^g)&e)^g
+||	ROTL	$Fhi,0,$Ghi
+||	ROTL	$Flo,0,$Glo				; g = f
+||	SHRU	$Elo,18,$t0lo
+||	SHL	$Elo,32-18,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	OR	$Ahi,$Bhi,$MAJhi
+||	OR	$Alo,$Blo,$MAJlo
+||	ROTL	$Ehi,0,$Fhi
+||	ROTL	$Elo,0,$Flo				; f = e
+||	SHRU	$Ehi,41-32,$t0lo
+||	SHL	$Ehi,64-41,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Chi,$MAJhi,$MAJhi
+||	AND	$Clo,$MAJlo,$MAJlo
+||	ROTL	$Dhi,0,$Ehi
+||	ROTL	$Dlo,0,$Elo				; e = d
+||	SHRU	$Elo,41-32,$t0hi
+||	SHL	$Elo,64-41,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo			; Sigma1(e)
+||	AND	$Ahi,$Bhi,$t1hi
+||	AND	$Alo,$Blo,$t1lo
+||	ROTL	$Chi,0,$Dhi
+||	ROTL	$Clo,0,$Dlo				; d = c
+||	SHRU	$Ahi,28,$S0hi
+||	SHL	$Ahi,32-28,$S0lo
+	OR	$t1hi,$MAJhi,$MAJhi
+||	OR	$t1lo,$MAJlo,$MAJlo			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Ch(e,f,g)
+||	ROTL	$Bhi,0,$Chi
+||	ROTL	$Blo,0,$Clo				; c = b
+||	SHRU	$Alo,28,$t0lo
+||	SHL	$Alo,32-28,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Sigma1(e)
+||	ROTL	$Ahi,0,$Bhi
+||	ROTL	$Alo,0,$Blo				; b = a
+||	SHRU	$Ahi,34-32,$t0lo
+||	SHL	$Ahi,64-34,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$MAJhi,$T1hi,$T2hi
+||	ADDU	$MAJlo,$T1carry:$T1lo,$T2carry:$T2lo	; T2 = T1+Maj(a,b,c)
+||	SHRU	$Alo,34-32,$t0hi
+||	SHL	$Alo,64-34,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$Ehi,$T1hi,$T1hi
+||	ADDU	$Elo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += e
+||	SHRU	$Ahi,39-32,$t0lo
+||	SHL	$Ahi,64-39,$t0hi
+   [B0]	BNOP	loop0_15?
+|| [B0]	LDNDW	*$INP++,B11:B10				; pre-fetch input
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$Alo,39-32,$t0hi
+||	SHL	$Alo,64-39,$t0lo
+||[!B0]	LDW	*${Xihi}[28],$T1hi
+||[!B0]	LDW	*${Xilo}[28],$T1lo			; X[i+14]
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo			; Sigma0(a)
+||	ADD	$T1carry,$T1hi,$Ehi
+||	ROTL	$T1lo,0,$Elo				; e = T1, "ghost" value
+||[!B1]	BNOP	break?
+	ADD	$S0hi,$T2hi,$T2hi
+||	ADDU	$S0lo,$T2carry:$T2lo,$T2carry:$T2lo	; T2 += Sigma0(a)
+|| [B1]	LDDW	*$K512++,$Khi:$Klo			; pre-fetch K512[i]
+	NOP						; avoid cross-path stall
+	ADD	$T2carry,$T2hi,$Ahi
+||	MV	$T2lo,$Alo				; a = T2
+|| [B0]	SUB	B0,1,B0
+;;===== branch to loop00_15? is taken here
+   [B1]	LDW	*${Xihi}[2],$T2hi
+|| [B1]	LDW	*${Xilo}[2],$T2lo			; X[i+1]
+|| [B1]	SHRU	$T1hi,19,$S1hi
+|| [B1]	SHL	$T1hi,32-19,$S1lo
+   [B1]	SHRU	$T1lo,19,$t0lo
+|| [B1]	SHL	$T1lo,32-19,$t0hi
+;;===== branch to break? is taken here
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,61-32,$t0lo
+||	SHL	$T1hi,64-61,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,61-32,$t0hi
+||	SHL	$T1lo,64-61,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,6,$t0hi
+||	SHL	$T1hi,32-6,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,6,$t0lo
+||	LDW	*${Xihi}[18],$T1hi
+||	LDW	*${Xilo}[18],$T1lo			; X[i+9]
+	XOR	$t0lo,$S1lo,$S1lo			; sigma1(Xi[i+14])
+
+||	LDW	*${Xihi}[0],$CHhi
+||	LDW	*${Xilo}[0],$CHlo			; X[i]
+||	SHRU	$T2hi,1,$S0hi
+||	SHL	$T2hi,32-1,$S0lo
+	SHRU	$T2lo,1,$t0lo
+||	SHL	$T2lo,32-1,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2hi,8,$t0hi
+||	SHL	$T2hi,32-8,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2lo,8,$t0lo
+||	SHL	$T2lo,32-8,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1lo,$T1carry:$T1lo		; T1 = X[i+9]+sigma1()
+||	SHRU	$T2hi,7,$t0hi
+||	SHL	$T2hi,32-7,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += X[i]
+||	SHRU	$T2lo,7,$t0lo
+|| [B1]	BNOP	loop16_79?
+	XOR	$t0lo,$S0lo,$S0lo			; sigma0(Xi[i+1]
+
+	ADD	$S0hi,$T1hi,$T1hi
+||	ADDU	$S0lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += sigma0()
+|| [B1]	SUB	B1,1,B1
+	NOP						; avoid cross-path stall
+	ADD	$T1carry,$T1hi,$T1hi
+
+   	STW	$T1hi,*$Xihi++[2]			; copied "top" bundle
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
+;;===== branch to loop16_79? is taken here
+
+break?:
+	ADD	$Ahi,$Actxhi,$Ahi		; accumulate ctx
+||	ADDU	$Alo,$Actxlo,$Actxlo:$Alo
+|| [A0]	LDNDW	*$INP++,B11:B10			; pre-fetch input
+|| [A0]	ADDK	-640,$K512			; rewind pointer to K512
+	ADD	$Bhi,$Bctxhi,$Bhi
+||	ADDU	$Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0]	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+	ADD	$Chi,$Cctxhi,$Chi
+||	ADDU	$Clo,$Cctxlo,$Cctxlo:$Clo
+||	ADD	$Actxlo,$Ahi,$Ahi
+||[!A0]	MV	$CTXA,$CTXB
+	ADD	$Dhi,$Dctxhi,$Dhi
+||	ADDU	$Dlo,$Dctxlo,$Dctxlo:$Dlo
+||	ADD	$Bctxlo,$Bhi,$Bhi
+||[!A0]	STW	$Ahi,*${CTXA}[0^.LITTLE_ENDIAN]	; save ctx
+||[!A0]	STW	$Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+	ADD	$Ehi,$Ectxhi,$Ehi
+||	ADDU	$Elo,$Ectxlo,$Ectxlo:$Elo
+||	ADD	$Cctxlo,$Chi,$Chi
+|| [A0]	BNOP	outerloop?
+||[!A0]	STW	$Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0]	STW	$Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+	ADD	$Fhi,$Fctxhi,$Fhi
+||	ADDU	$Flo,$Fctxlo,$Fctxlo:$Flo
+||	ADD	$Dctxlo,$Dhi,$Dhi
+||[!A0]	STW	$Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0]	STW	$Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+	ADD	$Ghi,$Gctxhi,$Ghi
+||	ADDU	$Glo,$Gctxlo,$Gctxlo:$Glo
+||	ADD	$Ectxlo,$Ehi,$Ehi
+||[!A0]	STW	$Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0]	STW	$Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+	ADD	$Hhi,$Hctxhi,$Hhi
+||	ADDU	$Hlo,$Hctxlo,$Hctxlo:$Hlo
+||	ADD	$Fctxlo,$Fhi,$Fhi
+||[!A0]	STW	$Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0]	STW	$Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+	ADD	$Gctxlo,$Ghi,$Ghi
+||[!A0]	STW	$Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0]	STW	$Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+	ADD	$Hctxlo,$Hhi,$Hhi
+||[!A0]	STW	$Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0]	STW	$Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+	STW	$Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
+||	STW	$Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+||	MVK	-40,B0
+	ADD	FP,B0,SP			; destroy circular buffer
+||	LDDW	*FP[-4],A11:A10
+	LDDW	*SP[2],A13:A12
+||	LDDW	*FP[-2],B11:B10
+	LDDW	*SP[4],B13:B12
+||	BNOP	RA
+	LDW	*++SP(40),FP			; restore frame pointer
+	MVK	0,B0
+	MVC	B0,AMR				; clear AMR
+	NOP	2				; wait till FP is committed
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".text:sha_asm.const"
+	.else
+	.sect	".const:sha_asm"
+	.endif
+	.align	128
+K512:
+	.uword	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+	.uword	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+	.uword	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+	.uword	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+	.uword	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+	.uword	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+	.uword	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+	.uword	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+	.uword	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+	.uword	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+	.uword	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+	.uword	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+	.uword	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+	.uword	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+	.uword	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+	.uword	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+	.uword	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+	.uword	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+	.uword	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+	.uword	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+	.uword	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+	.uword	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+	.uword	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+	.uword	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+	.uword	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+	.uword	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+	.uword	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+	.uword	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+	.uword	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+	.uword	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+	.uword	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+	.uword	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+	.uword	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+	.uword	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+	.uword	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+	.uword	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+	.uword	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+	.uword	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+	.uword	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+	.uword	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+	.cstring "SHA512 block transform for C64x, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;
+close STDOUT or die "error closing STDOUT: $!";	# buffered write errors surface here
--- a/crypto/sha/asm/sha512-c64xplus.pl
+++ b/crypto/sha/asm/sha512-c64xplus.pl
@ -0,0 +1,410 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x+.
+#
+# January 2012
+#
+# Performance is 19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unrolling won't make this implementation any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}	# consume arguments up to the first one ending in name.ext; that one stays in $output
+open STDOUT,">$output";	# everything printed below goes to $output; the result of open is not checked
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K512="A3";	# pointer into the K512 table, post-incremented by the LDDW pre-fetches
+
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,	# high halves in A16-A31: working variables a-h interleaved
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));	# with the per-block copies ("ctx") they are accumulated into
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,	# low halves in B16-B31, same layout as above
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));	# Sigma1/sigma1, Ch, Sigma0/sigma0 accumulators and a shift temporary (high)
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));	# same, low halves
+($T1hi,         $T2hi)=         ("A6","A7");	# A6 is also $NUM on entry; the generated code copies $NUM to A0 first
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");	# ADDU targets the carry:lo pairs; the carry word is later added to the high half
+($Khi,$Klo)=("A9","A8");	# destination pair of the K512 loads; the KHI/KLO .asg aliases pick halves by endianness
+($MAJhi,$MAJlo)=($T2hi,$T2lo);	# Maj(a,b,c) is built in the T2 registers
+($t1hi,$t1lo)=($Khi,"B2");	# second temporary; its high half reuses $Khi's register
+ $CTXB=$t1lo;	# B-side copy of $CTXA (set with MV $CTXA,$CTXB); shares B2 with $t1lo
+
+($Xihi,$Xilo)=("A5","B5");			# circular/ring buffer
+
+$code.=<<___;
+	.text
+
+	.asg	B3,RA
+	.asg	A15,FP
+	.asg	B15,SP
+
+	.if	.BIG_ENDIAN
+	.asg	$Khi,KHI
+	.asg	$Klo,KLO
+	.else
+	.asg	$Khi,KLO
+	.asg	$Klo,KHI
+	.endif
+
+	.global	_sha512_block_data_order
+_sha512_block_data_order:
+	.asmfunc stack_usage(40+128)
+	MV	$NUM,A0				; reassign $NUM
+||	MVK	-128,B0
+  [!A0]	BNOP	RA				; if ($NUM==0) return;
+|| [A0]	STW	FP,*SP--(40)			; save frame pointer
+|| [A0]	MV	SP,FP
+   [A0]	STDW	B13:B12,*SP[4]
+|| [A0]	MVK	0x00404,B1
+   [A0]	STDW	B11:B10,*SP[3]
+|| [A0]	STDW	A13:A12,*FP[-3]
+|| [A0]	MVKH	0x60000,B1
+   [A0]	STDW	A11:A10,*SP[1]
+|| [A0]	MVC	B1,AMR				; setup circular addressing
+|| [A0]	ADD	B0,SP,SP			; alloca(128)
+   [A0]	AND	B0,SP,SP			; align stack at 128 bytes
+|| [A0]	ADDKPC	_sha512_block_data_order,B1
+|| [A0]	MVKL	(K512-_sha512_block_data_order),$K512
+   [A0]	MVKH	(K512-_sha512_block_data_order),$K512
+|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
+	ADDAW	SP,3,$Xilo
+	ADDAW	SP,2,$Xihi
+
+||	MV	$CTXA,$CTXB
+	LDW	*${CTXA}[0^.LITTLE_ENDIAN],$Ahi	; load ctx
+||	LDW	*${CTXB}[1^.LITTLE_ENDIAN],$Alo
+||	ADD	B1,$K512,$K512
+	LDW	*${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+||	LDW	*${CTXB}[3^.LITTLE_ENDIAN],$Blo
+	LDW	*${CTXA}[4^.LITTLE_ENDIAN],$Chi
+||	LDW	*${CTXB}[5^.LITTLE_ENDIAN],$Clo
+	LDW	*${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+||	LDW	*${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+	LDW	*${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+||	LDW	*${CTXB}[9^.LITTLE_ENDIAN],$Elo
+	LDW	*${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+||	LDW	*${CTXB}[11^.LITTLE_ENDIAN],$Flo
+	LDW	*${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+||	LDW	*${CTXB}[13^.LITTLE_ENDIAN],$Glo
+	LDW	*${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+||	LDW	*${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+
+	LDNDW	*$INP++,B11:B10			; pre-fetch input
+	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+outerloop?:
+	MVK	15,B0				; loop counters
+||	MVK	64,B1
+||	SUB	A0,1,A0
+	MV	$Ahi,$Actxhi
+||	MV	$Alo,$Actxlo
+||	MV	$Bhi,$Bctxhi
+||	MV	$Blo,$Bctxlo
+||	MV	$Chi,$Cctxhi
+||	MV	$Clo,$Cctxlo
+||	MVD	$Dhi,$Dctxhi
+||	MVD	$Dlo,$Dctxlo
+	MV	$Ehi,$Ectxhi
+||	MV	$Elo,$Ectxlo
+||	MV	$Fhi,$Fctxhi
+||	MV	$Flo,$Fctxlo
+||	MV	$Ghi,$Gctxhi
+||	MV	$Glo,$Gctxlo
+||	MVD	$Hhi,$Hctxhi
+||	MVD	$Hlo,$Hctxlo
+loop0_15?:
+	.if	.BIG_ENDIAN
+	MV	B11,$T1hi
+||	MV	B10,$T1lo
+	.else
+	SWAP4	B10,$T1hi
+||	SWAP4	B11,$T1lo
+	SWAP2	$T1hi,$T1hi
+||	SWAP2	$T1lo,$T1lo
+	.endif
+loop16_79?:
+	STW	$T1hi,*$Xihi++[2]
+||	STW	$T1lo,*$Xilo++[2]			; X[i] = T1
+||	ADD	$Hhi,$T1hi,$T1hi
+||	ADDU	$Hlo,$T1lo,$T1carry:$T1lo		; T1 += h
+||	SHRU	$Ehi,14,$S1hi
+||	SHL	$Ehi,32-14,$S1lo
+	XOR	$Fhi,$Ghi,$CHhi
+||	XOR	$Flo,$Glo,$CHlo
+||	ADD	KHI,$T1hi,$T1hi
+||	ADDU	KLO,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += K512[i]
+||	SHRU	$Elo,14,$t0lo
+||	SHL	$Elo,32-14,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Ehi,$CHhi,$CHhi
+||	AND	$Elo,$CHlo,$CHlo
+||	ROTL	$Ghi,0,$Hhi
+||	ROTL	$Glo,0,$Hlo				; h = g
+||	SHRU	$Ehi,18,$t0hi
+||	SHL	$Ehi,32-18,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	XOR	$Ghi,$CHhi,$CHhi
+||	XOR	$Glo,$CHlo,$CHlo			; Ch(e,f,g) = ((f^g)&e)^g
+||	ROTL	$Fhi,0,$Ghi
+||	ROTL	$Flo,0,$Glo				; g = f
+||	SHRU	$Elo,18,$t0lo
+||	SHL	$Elo,32-18,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	OR	$Ahi,$Bhi,$MAJhi
+||	OR	$Alo,$Blo,$MAJlo
+||	ROTL	$Ehi,0,$Fhi
+||	ROTL	$Elo,0,$Flo				; f = e
+||	SHRU	$Ehi,41-32,$t0lo
+||	SHL	$Ehi,64-41,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	AND	$Chi,$MAJhi,$MAJhi
+||	AND	$Clo,$MAJlo,$MAJlo
+||	ROTL	$Dhi,0,$Ehi
+||	ROTL	$Dlo,0,$Elo				; e = d
+||	SHRU	$Elo,41-32,$t0hi
+||	SHL	$Elo,64-41,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo			; Sigma1(e)
+||	AND	$Ahi,$Bhi,$t1hi
+||	AND	$Alo,$Blo,$t1lo
+||	ROTL	$Chi,0,$Dhi
+||	ROTL	$Clo,0,$Dlo				; d = c
+||	SHRU	$Ahi,28,$S0hi
+||	SHL	$Ahi,32-28,$S0lo
+	OR	$t1hi,$MAJhi,$MAJhi
+||	OR	$t1lo,$MAJlo,$MAJlo			; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Ch(e,f,g)
+||	ROTL	$Bhi,0,$Chi
+||	ROTL	$Blo,0,$Clo				; c = b
+||	SHRU	$Alo,28,$t0lo
+||	SHL	$Alo,32-28,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += Sigma1(e)
+||	ROTL	$Ahi,0,$Bhi
+||	ROTL	$Alo,0,$Blo				; b = a
+||	SHRU	$Ahi,34-32,$t0lo
+||	SHL	$Ahi,64-34,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$MAJhi,$T1hi,$T2hi
+||	ADDU	$MAJlo,$T1carry:$T1lo,$T2carry:$T2lo	; T2 = T1+Maj(a,b,c)
+||	SHRU	$Alo,34-32,$t0hi
+||	SHL	$Alo,64-34,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$Ehi,$T1hi,$T1hi
+||	ADDU	$Elo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += e
+|| [B0]	BNOP	loop0_15?
+||	SHRU	$Ahi,39-32,$t0lo
+||	SHL	$Ahi,64-39,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+|| [B0]	LDNDW	*$INP++,B11:B10				; pre-fetch input
+||[!B1]	BNOP	break?
+||	SHRU	$Alo,39-32,$t0hi
+||	SHL	$Alo,64-39,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo			; Sigma0(a)
+||	ADD	$T1carry,$T1hi,$Ehi
+||	MV	$T1lo,$Elo				; e = T1
+||[!B0]	LDW	*${Xihi}[28],$T1hi
+||[!B0]	LDW	*${Xilo}[28],$T1lo			; X[i+14]
+	ADD	$S0hi,$T2hi,$T2hi
+||	ADDU	$S0lo,$T2carry:$T2lo,$T2carry:$T2lo	; T2 += Sigma0(a)
+|| [B1]	LDDW	*$K512++,$Khi:$Klo			; pre-fetch K512[i]
+	NOP						; avoid cross-path stall
+	ADD	$T2carry,$T2hi,$Ahi
+||	MV	$T2lo,$Alo				; a = T2
+|| [B0]	SUB	B0,1,B0
+;;===== branch to loop0_15? is taken here
+	NOP
+;;===== branch to break? is taken here
+	LDW	*${Xihi}[2],$T2hi
+||	LDW	*${Xilo}[2],$T2lo			; X[i+1]
+||	SHRU	$T1hi,19,$S1hi
+||	SHL	$T1hi,32-19,$S1lo
+	SHRU	$T1lo,19,$t0lo
+||	SHL	$T1lo,32-19,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,61-32,$t0lo
+||	SHL	$T1hi,64-61,$t0hi
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,61-32,$t0hi
+||	SHL	$T1lo,64-61,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1hi,6,$t0hi
+||	SHL	$T1hi,32-6,$t0lo
+	XOR	$t0hi,$S1hi,$S1hi
+||	XOR	$t0lo,$S1lo,$S1lo
+||	SHRU	$T1lo,6,$t0lo
+||	LDW	*${Xihi}[18],$T1hi
+||	LDW	*${Xilo}[18],$T1lo			; X[i+9]
+	XOR	$t0lo,$S1lo,$S1lo			; sigma1(Xi[i+14])
+
+||	LDW	*${Xihi}[0],$CHhi
+||	LDW	*${Xilo}[0],$CHlo			; X[i]
+||	SHRU	$T2hi,1,$S0hi
+||	SHL	$T2hi,32-1,$S0lo
+	SHRU	$T2lo,1,$t0lo
+||	SHL	$T2lo,32-1,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2hi,8,$t0hi
+||	SHL	$T2hi,32-8,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	SHRU	$T2lo,8,$t0lo
+||	SHL	$T2lo,32-8,$t0hi
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$S1hi,$T1hi,$T1hi
+||	ADDU	$S1lo,$T1lo,$T1carry:$T1lo		; T1 = X[i+9]+sigma1()
+|| [B1]	BNOP	loop16_79?
+||	SHRU	$T2hi,7,$t0hi
+||	SHL	$T2hi,32-7,$t0lo
+	XOR	$t0hi,$S0hi,$S0hi
+||	XOR	$t0lo,$S0lo,$S0lo
+||	ADD	$CHhi,$T1hi,$T1hi
+||	ADDU	$CHlo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += X[i]
+||	SHRU	$T2lo,7,$t0lo
+	XOR	$t0lo,$S0lo,$S0lo			; sigma0(Xi[i+1])
+
+	ADD	$S0hi,$T1hi,$T1hi
+||	ADDU	$S0lo,$T1carry:$T1lo,$T1carry:$T1lo	; T1 += sigma0()
+|| [B1]	SUB	B1,1,B1
+	NOP						; avoid cross-path stall
+	ADD	$T1carry,$T1hi,$T1hi
+;;===== branch to loop16_79? is taken here
+
+break?:
+	ADD	$Ahi,$Actxhi,$Ahi		; accumulate ctx
+||	ADDU	$Alo,$Actxlo,$Actxlo:$Alo
+|| [A0]	LDNDW	*$INP++,B11:B10			; pre-fetch input
+|| [A0]	ADDK	-640,$K512			; rewind pointer to K512
+	ADD	$Bhi,$Bctxhi,$Bhi
+||	ADDU	$Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0]	LDDW	*$K512++,$Khi:$Klo		; pre-fetch K512[0]
+	ADD	$Chi,$Cctxhi,$Chi
+||	ADDU	$Clo,$Cctxlo,$Cctxlo:$Clo
+||	ADD	$Actxlo,$Ahi,$Ahi
+||[!A0]	MV	$CTXA,$CTXB
+	ADD	$Dhi,$Dctxhi,$Dhi
+||	ADDU	$Dlo,$Dctxlo,$Dctxlo:$Dlo
+||	ADD	$Bctxlo,$Bhi,$Bhi
+||[!A0]	STW	$Ahi,*${CTXA}[0^.LITTLE_ENDIAN]	; save ctx
+||[!A0]	STW	$Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+	ADD	$Ehi,$Ectxhi,$Ehi
+||	ADDU	$Elo,$Ectxlo,$Ectxlo:$Elo
+||	ADD	$Cctxlo,$Chi,$Chi
+|| [A0]	BNOP	outerloop?
+||[!A0]	STW	$Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0]	STW	$Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+	ADD	$Fhi,$Fctxhi,$Fhi
+||	ADDU	$Flo,$Fctxlo,$Fctxlo:$Flo
+||	ADD	$Dctxlo,$Dhi,$Dhi
+||[!A0]	STW	$Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0]	STW	$Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+	ADD	$Ghi,$Gctxhi,$Ghi
+||	ADDU	$Glo,$Gctxlo,$Gctxlo:$Glo
+||	ADD	$Ectxlo,$Ehi,$Ehi
+||[!A0]	STW	$Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0]	STW	$Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+	ADD	$Hhi,$Hctxhi,$Hhi
+||	ADDU	$Hlo,$Hctxlo,$Hctxlo:$Hlo
+||	ADD	$Fctxlo,$Fhi,$Fhi
+||[!A0]	STW	$Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0]	STW	$Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+	ADD	$Gctxlo,$Ghi,$Ghi
+||[!A0]	STW	$Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0]	STW	$Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+	ADD	$Hctxlo,$Hhi,$Hhi
+||[!A0]	STW	$Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0]	STW	$Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+	STW	$Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
+||	STW	$Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+||	MVK	-40,B0
+	ADD	FP,B0,SP			; destroy circular buffer
+||	LDDW	*FP[-4],A11:A10
+	LDDW	*SP[2],A13:A12
+||	LDDW	*FP[-2],B11:B10
+	LDDW	*SP[4],B13:B12
+||	BNOP	RA
+	LDW	*++SP(40),FP			; restore frame pointer
+	MVK	0,B0
+	MVC	B0,AMR				; clear AMR
+	NOP	2				; wait till FP is committed
+	.endasmfunc
+
+	.sect	".const:sha_asm"
+	.align	128
+K512:
+	.uword	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+	.uword	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+	.uword	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+	.uword	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+	.uword	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+	.uword	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+	.uword	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+	.uword	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+	.uword	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+	.uword	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+	.uword	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+	.uword	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+	.uword	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+	.uword	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+	.uword	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+	.uword	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+	.uword	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+	.uword	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+	.uword	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+	.uword	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+	.uword	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+	.uword	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+	.uword	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+	.uword	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+	.uword	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+	.uword	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+	.uword	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+	.uword	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+	.uword	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+	.uword	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+	.uword	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+	.uword	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+	.uword	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+	.uword	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+	.uword	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+	.uword	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+	.uword	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+	.uword	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+	.uword	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+	.uword	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+	.cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+	.align	4
+___
+
+print $code;	# write the accumulated assembly to STDOUT (redirected to $output above)
+close STDOUT or die "error closing STDOUT: $!";	# buffered write errors only surface at close; do not lose them
--- a/crypto/sha/asm/sha512-mips.pl
+++ b/crypto/sha/asm/sha512-mips.pl
@ -45,7 +45,7 @@
 # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
 # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
 #
-$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64

 if ($flavour =~ /64|n32/i) {
 	$PTR_ADD="dadd";	# incidentally works even on n32
@ -68,7 +68,7 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
 #
 ######################################################################

-$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0;

 for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
 open STDOUT,">$output";
@ -244,7 +244,7 @@ $code.=<<___;

 .text
 .set	noat
-#if !defined(__vxworks) || defined(__pic__)
+#if !defined(__mips_eabi) && (!defined(__vxworks) || defined(__pic__))
 .option	pic2
 #endif

@ -351,7 +351,7 @@ $code.=<<___;
 	$ST	$G,6*$SZ($ctx)
 	$ST	$H,7*$SZ($ctx)

-	bnel	$inp,@X[15],.Loop
+	bne	$inp,@X[15],.Loop
 	$PTR_SUB $Ktbl,`($rounds-16)*$SZ`	# rewind $Ktbl

 	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
--- a/crypto/sha/asm/sha512-ppc.pl
+++ b/crypto/sha/asm/sha512-ppc.pl
@ -64,7 +64,7 @@ die "can't locate ppc-xlate.pl";
 open STDOUT,"| $^X $xlate $flavour $output" || die "can't call $xlate: $!";

 if ($output =~ /512/) {
-	$func="sha512_block_data_order";
+	$func="sha512_block_ppc";
 	$SZ=8;
 	@Sigma0=(28,34,39);
 	@Sigma1=(14,18,41);
@ -76,7 +76,7 @@ if ($output =~ /512/) {
 	$ROR="rotrdi";
 	$SHR="srdi";
 } else {
-	$func="sha256_block_data_order";
+	$func="sha256_block_ppc";
 	$SZ=4;
 	@Sigma0=( 2,13,22);
 	@Sigma1=( 6,11,25);
@ -243,7 +243,7 @@ Lunaligned:
 	andi.	$t1,$t1,`4096-16*$SZ`	; distance to closest page boundary
 	beq	Lcross_page
 	$UCMP	$num,$t1
-	ble-	Laligned		; didn't cross the page boundary
+	ble	Laligned		; didn't cross the page boundary
 	subfc	$num,$t1,$num
 	add	$t1,$inp,$t1
 	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real remaining num
@ -279,7 +279,7 @@ Lmemcpy:
 	$POP	$inp,`$FRAME-$SIZE_T*26`($sp)	; restore real inp
 	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real num
 	addic.	$num,$num,`-16*$SZ`		; num--
-	bne-	Lunaligned
+	bne	Lunaligned

 Ldone:
 	$POP	r0,`$FRAME+$LRSAVE`($sp)
@ -339,7 +339,7 @@ for(;$i<32;$i++) {
 	unshift(@V,pop(@V));
 }
 $code.=<<___;
-	bdnz-	Lrounds
+	bdnz	Lrounds

 	$POP	$ctx,`$FRAME-$SIZE_T*22`($sp)
 	$POP	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@ -0,0 +1,431 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 for PowerISA v2.07.
+#
+# Accurate performance measurements are problematic, because it's
+# always virtualized setup with possibly throttled processor.
+# Relative comparison is therefore more informative. This module is
+# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
+# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
+# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
+# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
+# result is degree of computational resources' utilization. POWER8 is
+# "massively multi-threaded chip" and difference between single- and
+# maximum multi-process benchmark results tells that utilization is
+# a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
+# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
+# to single-process one, given that all threads end up on the same
+# physical core.
+#
+#######################################################################
+#
+#		SHA256/pre-2.07(*)	SHA512/pre-2.07(*)	SHA1(*)
+# POWER8	9.3   /14.8		5.8   /9.5		7.1
+#
+# (*)	presented for reference/comparison purposes;
+
+$flavour=shift;	# 1st argument: target flavour, matched below against /64/, /32/, /le/ and /osx/
+$output =shift;	# 2nd argument: output name, forwarded to ppc-xlate.pl; /512/ in it selects SHA512
+
+if ($flavour =~ /64/) {
+	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;	# LR save offset, added to the frame size in the prologue
+	$STU="stdu";
+	$POP="ld";
+	$PUSH="std";
+} elsif ($flavour =~ /32/) {
+	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
+	$STU="stwu";
+	$POP="lwz";
+	$PUSH="stw";
+} else { die "nonsense $flavour"; }
+
+$LENDIAN=($flavour=~/le/);	# true when the flavour string contains "le"
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's own path
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";	# 'or', not '||': '||' binds to the command string, so die could never run
+
+if ($output =~ /512/) {
+	$bits=512;
+	$SZ=8;		# bytes per element
+	$sz="d";	# suffix spliced into mnemonics such as vaddu${sz}m and vshasigma${sz}
+	$rounds=80;
+} else {
+	$bits=256;
+	$SZ=4;
+	$sz="w";
+	$rounds=64;
+}
+
+$func="sha${bits}_block_p8";	# exported symbol name
+$FRAME=8*$SIZE_T;
+
+$sp ="r1";
+$toc="r2";
+$ctx="r3";	# hash state: loaded before the main loop, stored back after it
+$inp="r4";	# input pointer, advanced by 16 after every vector load
+$num="r5";	# block counter, decremented once per pass through Loop
+$Tbl="r6";	# address of the constant table, produced by LPICmeup
+$idx="r7";	# running byte offset into the table
+$lrsave="r8";	# holds the link register between mflr and mtlr
+$offload="r11";	# stack area ($sp+$FRAME+15) where $A-$H are parked for the duration of a block
+$vrsave="r12";	# saved value of SPR 256
+($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,10,26..31));	# index registers; $x10-$x70 are loaded with 0x10-0x70 in the prologue
+ $x00=0 if ($flavour =~ /osx/);	# osx flavours get a literal 0 instead of the name r0; NOTE(review): presumably r0 reads as zero in the index slot - confirm
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));	# working variables a-h; callers rotate @V after each ROUND
+@X=map("v$_",(8..23));	# 16-entry message schedule window
+($Ki,$Func,$S0,$S1,$s0,$s1,$lemask)=map("v$_",(24..31));	# seven names for v24-v30; v31 is produced by map but left unnamed
+
+sub ROUND {	# append the assembly for round $i to $code; returns nothing useful
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;	# round index, then the eight working-variable registers in a..h order
+my $j=($i+1)%16;	# @X slot rewritten by the schedule instructions emitted when $i>=15
+
+$code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));	# last element of a 16-byte vector (rounds 0-14): fetch the next input vector
+	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
+	addi		$inp,$inp,16
+___
+$code.=<<___		if ($i<16 && ($i%(16/$SZ)));	# not the first element of its vector: derive it from the previous @X slot
+	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
+___
+$code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);	# little-endian, first element of a vector: permute through $lemask
+	vperm		@X[$i],@X[$i],@X[$i],$lemask
+___
+$code.=<<___;	# round body for every $i; the backticked lines are evaluated later and are empty unless $i>=15
+	`"vshasigma${sz}	$s0,@X[($j+1)%16],0,0"		if ($i>=15)`
+	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
+	vshasigma${sz}	$S1,$e,1,15		; Sigma1(e)
+	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
+	vshasigma${sz}	$S0,$a,1,0		; Sigma0(a)
+	`"vshasigma${sz}	$s1,@X[($j+14)%16],0,15"	if ($i>=15)`
+	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
+	vxor		$Func,$a,$b
+	`"vaddu${sz}m		@X[$j],@X[$j],@X[($j+9)%16]"	if ($i>=15)`
+	vaddu${sz}m	$h,$h,$S1		; h+=Sigma1(e)
+	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
+	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
+	vaddu${sz}m	$d,$d,$h		; d+=h
+	vaddu${sz}m	$S0,$S0,$Func		; Sigma0(a)+Maj(a,b,c)
+	`"vaddu${sz}m		@X[$j],@X[$j],$s0"		if ($i>=15)`
+	lvx		$Ki,$idx,$Tbl		; load next K[i]
+	addi		$idx,$idx,16
+	vaddu${sz}m	$h,$h,$S0		; h+=Sigma0(a)+Maj(a,b,c)
+	`"vaddu${sz}m		@X[$j],@X[$j],$s1"		if ($i>=15)`
+___
+}
+
+$code=<<___;
+.machine	"any"
+.text
+
+.globl	$func
+.align	6
+$func:
+	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+	mflr		$lrsave
+	li		r10,`$FRAME+8*16+15`
+	li		r11,`$FRAME+8*16+31`
+	stvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	mfspr		$vrsave,256
+	stvx		v21,r11,$sp
+	addi		r11,r11,32
+	stvx		v22,r10,$sp
+	addi		r10,r10,32
+	stvx		v23,r11,$sp
+	addi		r11,r11,32
+	stvx		v24,r10,$sp
+	addi		r10,r10,32
+	stvx		v25,r11,$sp
+	addi		r11,r11,32
+	stvx		v26,r10,$sp
+	addi		r10,r10,32
+	stvx		v27,r11,$sp
+	addi		r11,r11,32
+	stvx		v28,r10,$sp
+	addi		r10,r10,32
+	stvx		v29,r11,$sp
+	addi		r11,r11,32
+	stvx		v30,r10,$sp
+	stvx		v31,r11,$sp
+	li		r11,-1
+	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
+	li		$x10,0x10
+	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	li		$x20,0x20
+	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	li		$x30,0x30
+	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	li		$x40,0x40
+	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	li		$x50,0x50
+	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	li		$x60,0x60
+	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	li		$x70,0x70
+	$PUSH		$lrsave,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
+	mtspr		256,r11
+
+	bl		LPICmeup
+	addi		$offload,$sp,$FRAME+15
+___
+$code.=<<___		if ($LENDIAN);
+	li		$idx,8
+	lvsl		$lemask,0,$idx
+	vspltisb	$Ki,0x0f
+	vxor		$lemask,$lemask,$Ki
+___
+$code.=<<___		if ($SZ==4);
+	lvx_4w		$A,$x00,$ctx
+	lvx_4w		$E,$x10,$ctx
+	vsldoi		$B,$A,$A,4		# unpack
+	vsldoi		$C,$A,$A,8
+	vsldoi		$D,$A,$A,12
+	vsldoi		$F,$E,$E,4
+	vsldoi		$G,$E,$E,8
+	vsldoi		$H,$E,$E,12
+___
+$code.=<<___		if ($SZ==8);
+	lvx_u		$A,$x00,$ctx
+	lvx_u		$C,$x10,$ctx
+	lvx_u		$E,$x20,$ctx
+	vsldoi		$B,$A,$A,8		# unpack
+	lvx_u		$G,$x30,$ctx
+	vsldoi		$D,$C,$C,8
+	vsldoi		$F,$E,$E,8
+	vsldoi		$H,$G,$G,8
+___
+$code.=<<___;
+	li		r0,`($rounds-16)/16`	# inner loop counter
+	b		Loop
+.align	5
+Loop:
+	lvx		$Ki,$x00,$Tbl
+	li		$idx,16
+	lvx_u		@X[0],0,$inp
+	addi		$inp,$inp,16
+	stvx		$A,$x00,$offload	# offload $A-$H
+	stvx		$B,$x10,$offload
+	stvx		$C,$x20,$offload
+	stvx		$D,$x30,$offload
+	stvx		$E,$x40,$offload
+	stvx		$F,$x50,$offload
+	stvx		$G,$x60,$offload
+	stvx		$H,$x70,$offload
+	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
+	lvx		$Ki,$idx,$Tbl
+	addi		$idx,$idx,16
+___
+for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mtctr		r0
+	b		L16_xx
+.align	5
+L16_xx:
+___
+for (;$i<32;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bdnz		L16_xx
+
+	lvx		@X[2],$x00,$offload
+	subic.		$num,$num,1
+	lvx		@X[3],$x10,$offload
+	vaddu${sz}m	$A,$A,@X[2]
+	lvx		@X[4],$x20,$offload
+	vaddu${sz}m	$B,$B,@X[3]
+	lvx		@X[5],$x30,$offload
+	vaddu${sz}m	$C,$C,@X[4]
+	lvx		@X[6],$x40,$offload
+	vaddu${sz}m	$D,$D,@X[5]
+	lvx		@X[7],$x50,$offload
+	vaddu${sz}m	$E,$E,@X[6]
+	lvx		@X[8],$x60,$offload
+	vaddu${sz}m	$F,$F,@X[7]
+	lvx		@X[9],$x70,$offload
+	vaddu${sz}m	$G,$G,@X[8]
+	vaddu${sz}m	$H,$H,@X[9]
+	bne		Loop
+___
+$code.=<<___		if ($SZ==4);
+	lvx		@X[0],$idx,$Tbl
+	addi		$idx,$idx,16
+	vperm		$A,$A,$B,$Ki		# pack the answer
+	lvx		@X[1],$idx,$Tbl
+	vperm		$E,$E,$F,$Ki
+	vperm		$A,$A,$C,@X[0]
+	vperm		$E,$E,$G,@X[0]
+	vperm		$A,$A,$D,@X[1]
+	vperm		$E,$E,$H,@X[1]
+	stvx_4w		$A,$x00,$ctx
+	stvx_4w		$E,$x10,$ctx
+___
+$code.=<<___		if ($SZ==8);
+	vperm		$A,$A,$B,$Ki		# pack the answer
+	vperm		$C,$C,$D,$Ki
+	vperm		$E,$E,$F,$Ki
+	vperm		$G,$G,$H,$Ki
+	stvx_u		$A,$x00,$ctx
+	stvx_u		$C,$x10,$ctx
+	stvx_u		$E,$x20,$ctx
+	stvx_u		$G,$x30,$ctx
+___
+$code.=<<___;
+	li		r10,`$FRAME+8*16+15`
+	mtlr		$lrsave
+	li		r11,`$FRAME+8*16+31`
+	mtspr		256,$vrsave
+	lvx		v20,r10,$sp		# ABI says so
+	addi		r10,r10,32
+	lvx		v21,r11,$sp
+	addi		r11,r11,32
+	lvx		v22,r10,$sp
+	addi		r10,r10,32
+	lvx		v23,r11,$sp
+	addi		r11,r11,32
+	lvx		v24,r10,$sp
+	addi		r10,r10,32
+	lvx		v25,r11,$sp
+	addi		r11,r11,32
+	lvx		v26,r10,$sp
+	addi		r10,r10,32
+	lvx		v27,r11,$sp
+	addi		r11,r11,32
+	lvx		v28,r10,$sp
+	addi		r10,r10,32
+	lvx		v29,r11,$sp
+	addi		r11,r11,32
+	lvx		v30,r10,$sp
+	lvx		v31,r11,$sp
+	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+	blr
+	.long		0
+	.byte		0,12,4,1,0x80,6,3,0
+	.long		0
+.size	$func,.-$func
+___
+
+# Ugly hack here, because PPC assembler syntax seems to vary too
+# much from platform to platform...
+$code.=<<___;
+.align	6
+LPICmeup:
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
+	addi	$Tbl,$Tbl,`64-8`
+	mtlr	r0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`64-9*4`
+___
+
+if ($SZ==8) {
+    local *table = sub {
+	foreach(@_) { $code.=".quad	$_,$_\n"; }
+    };
+    table(
+	"0x428a2f98d728ae22","0x7137449123ef65cd",
+	"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
+	"0x3956c25bf348b538","0x59f111f1b605d019",
+	"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
+	"0xd807aa98a3030242","0x12835b0145706fbe",
+	"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
+	"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
+	"0x9bdc06a725c71235","0xc19bf174cf692694",
+	"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
+	"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
+	"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
+	"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
+	"0x983e5152ee66dfab","0xa831c66d2db43210",
+	"0xb00327c898fb213f","0xbf597fc7beef0ee4",
+	"0xc6e00bf33da88fc2","0xd5a79147930aa725",
+	"0x06ca6351e003826f","0x142929670a0e6e70",
+	"0x27b70a8546d22ffc","0x2e1b21385c26c926",
+	"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
+	"0x650a73548baf63de","0x766a0abb3c77b2a8",
+	"0x81c2c92e47edaee6","0x92722c851482353b",
+	"0xa2bfe8a14cf10364","0xa81a664bbc423001",
+	"0xc24b8b70d0f89791","0xc76c51a30654be30",
+	"0xd192e819d6ef5218","0xd69906245565a910",
+	"0xf40e35855771202a","0x106aa07032bbd1b8",
+	"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
+	"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
+	"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
+	"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
+	"0x748f82ee5defb2fc","0x78a5636f43172f60",
+	"0x84c87814a1f0ab72","0x8cc702081a6439ec",
+	"0x90befffa23631e28","0xa4506cebde82bde9",
+	"0xbef9a3f7b2c67915","0xc67178f2e372532b",
+	"0xca273eceea26619c","0xd186b8c721c0c207",
+	"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
+	"0x06f067aa72176fba","0x0a637dc5a2c898a6",
+	"0x113f9804bef90dae","0x1b710b35131c471b",
+	"0x28db77f523047d84","0x32caab7b40c72493",
+	"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
+	"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
+	"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
+$code.=<<___	if (!$LENDIAN);
+.quad	0x0001020304050607,0x1011121314151617
+___
+$code.=<<___	if ($LENDIAN);	# quad-swapped
+.quad	0x1011121314151617,0x0001020304050607
+___
+} else {
+    local *table = sub {
+	foreach(@_) { $code.=".long	$_,$_,$_,$_\n"; }
+    };
+    table(
+	"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
+	"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
+	"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
+	"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
+	"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
+	"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
+	"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
+	"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
+	"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
+	"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
+	"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
+	"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
+	"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
+	"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
+	"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
+	"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
+$code.=<<___	if (!$LENDIAN);
+.long	0x00010203,0x10111213,0x10111213,0x10111213
+.long	0x00010203,0x04050607,0x10111213,0x10111213
+.long	0x00010203,0x04050607,0x08090a0b,0x10111213
+___
+$code.=<<___	if ($LENDIAN);	# word-swapped
+.long	0x10111213,0x10111213,0x10111213,0x00010203
+.long	0x10111213,0x10111213,0x04050607,0x00010203
+.long	0x10111213,0x08090a0b,0x04050607,0x00010203
+___
+}
+$code.=<<___;
+.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;	# evaluate every backtick-quoted span as Perl and splice in its value
+print $code;	# STDOUT is the pipe to ppc-xlate.pl opened above
+close STDOUT or die "error closing STDOUT: $!";	# close on a pipe waits for the child and is false if it failed; do not ignore that
--- a/crypto/uid.c
+++ b/crypto/uid.c
@ -65,7 +65,7 @@ int OPENSSL_issetugid(void)
 	return issetugid();
 	}

-#elif defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYS_NETWARE)
+#elif defined(OPENSSL_SYS_WIN32) || defined(OPENSSL_SYS_VXWORKS) || defined(OPENSSL_SYS_NETWARE) || defined(_TMS320C6X)

 int OPENSSL_issetugid(void)
 	{
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@ -119,10 +119,8 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&mov	("esi","edx");
 	&or	("ebp","ecx");		# merge AMD XOP flag

-	&bt	("ecx",26);		# check XSAVE bit
-	&jnc	(&label("done"));
 	&bt	("ecx",27);		# check OSXSAVE bit
-	&jnc	(&label("clear_xmm"));
+	&jnc	(&label("clear_avx"));
 	&xor	("ecx","ecx");
 	&data_byte(0x0f,0x01,0xd0);	# xgetbv
 	&and	("eax",6);
--- a/doc/crypto/SSLeay_version.pod
+++ b/doc/crypto/SSLeay_version.pod
@ -1,74 +0,0 @@
-=pod
-
-=head1 NAME
-
-SSLeay_version - retrieve version/build information about OpenSSL library
-
-=head1 SYNOPSIS
-
- #include <openssl/crypto.h>
-
- const char *SSLeay_version(int type);
-
-=head1 DESCRIPTION
-
-SSLeay_version() returns a pointer to a constant string describing the
-version of the OpenSSL library or giving information about the library
-build.
-
-The following B<type> values are supported:
-
-=over 4
-
-=item SSLEAY_VERSION
-
-The version of the OpenSSL library including the release date.
-
-=item SSLEAY_CFLAGS
-
-The compiler flags set for the compilation process in the form
-"compiler: ..."  if available or "compiler: information not available"
-otherwise.
-
-=item SSLEAY_BUILT_ON
-
-The date of the build process in the form "built on: ..." if available
-or "built on: date not available" otherwise.
-
-=item SSLEAY_PLATFORM
-
-The "Configure" target of the library build in the form "platform: ..."
-if available or "platform: information not available" otherwise.
-
-=item SSLEAY_DIR
-
-The "OPENSSLDIR" setting of the library build in the form "OPENSSLDIR: "...""
-if available or "OPENSSLDIR: N/A" otherwise.
-
-=back
-
-=head1 RETURN VALUES
-
-The following return values can occur:
-
-=over 4
-
-=item "not available"
-
-An invalid value for B<type> was given.
-
-=item Pointer to constant string
-
-Textual description.
-
-=back
-
-=head1 SEE ALSO
-
-L<crypto(3)|crypto(3)>
-
-=head1 HISTORY
-
-B<SSLEAY_DIR> was added in OpenSSL 0.9.7.
-
-=cut
--- a/e_os.h
+++ b/e_os.h
@ -668,7 +668,7 @@ extern char *sys_errlist[]; extern int sys_nerr;
 #if defined(OPENSSL_SYS_WINDOWS)
 #  define strcasecmp _stricmp
 #  define strncasecmp _strnicmp
-#elif defined(OPENSSL_SYS_VMS)
+#elif defined(OPENSSL_SYS_VMS) || defined(OPENSSL_SYS_DSPBIOS)
 /* VMS below version 7.0 doesn't have strcasecmp() */
 #  include "o_str.h"
 #  define strcasecmp OPENSSL_strcasecmp
--- a/fips/aes/fips_aesavs.c
+++ b/fips/aes/fips_aesavs.c
@ -99,7 +99,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
    {
    const EVP_CIPHER *cipher = NULL;

-    if (strcasecmp(amode, "CBC") == 0)
+    if (fips_strcasecmp(amode, "CBC") == 0)
 	{
 	switch (akeysz)
 		{
@ -117,7 +117,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		}

 	}
-    else if (strcasecmp(amode, "ECB") == 0)
+    else if (fips_strcasecmp(amode, "ECB") == 0)
 	{
 	switch (akeysz)
 		{
@ -134,7 +134,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		break;
 		}
 	}
-    else if (strcasecmp(amode, "CFB128") == 0)
+    else if (fips_strcasecmp(amode, "CFB128") == 0)
 	{
 	switch (akeysz)
 		{
@ -169,7 +169,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		break;
 		}
 	}
-    else if(!strcasecmp(amode,"CFB1"))
+    else if(!fips_strcasecmp(amode,"CFB1"))
 	{
 	switch (akeysz)
 		{
@ -186,7 +186,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 		break;
 		}
 	}
-    else if(!strcasecmp(amode,"CFB8"))
+    else if(!fips_strcasecmp(amode,"CFB8"))
 	{
 	switch (akeysz)
 		{
@ -215,7 +215,7 @@ static int AESTest(EVP_CIPHER_CTX *ctx,
 	}
    if (FIPS_cipherinit(ctx, cipher, aKey, iVec, dir) <= 0)
 	return 0;
-    if(!strcasecmp(amode,"CFB1"))
+    if(!fips_strcasecmp(amode,"CFB1"))
 	M_EVP_CIPHER_CTX_set_flags(ctx, EVP_CIPH_FLAG_LENGTH_BITS);
    if (dir)
 		FIPS_cipher(ctx, ciphertext, plaintext, len);
@ -535,7 +535,7 @@ static int do_mct(char *amode,
 		}
 	    }
 	}
-    
+    FIPS_cipher_ctx_cleanup(&ctx);
    return ret;
    }

@ -554,7 +554,7 @@ static int proc_file(char *rqfile, char *rspfile)
    FILE *afp = NULL, *rfp = NULL;
    char ibuf[2048];
    char tbuf[2048];
-    int ilen, len, ret = 0;
+    int len;
    char algo[8] = "";
    char amode[8] = "";
    char atest[8] = "";
@ -605,7 +605,6 @@ static int proc_file(char *rqfile, char *rspfile)
    while (!err && (fgets(ibuf, sizeof(ibuf), afp)) != NULL)
 	{
 	tidy_line(tbuf, ibuf);
-	ilen = strlen(ibuf);
 	/*      printf("step=%d ibuf=%s",step,ibuf); */
 	switch (step)
 	    {
@ -636,10 +635,8 @@ static int proc_file(char *rqfile, char *rspfile)
 		char *xp, *pp = ibuf+2;
 		int n;
 		if (akeysz)
-		    { /* insert current time & date */
-		    time_t rtim = time(0);
-		    fputs("# ", rfp);
-		    copy_line(ctime(&rtim), rfp);
+		    {
+		    copy_line(ibuf, rfp);
 		    }
 		else
 		    {
@ -780,11 +777,11 @@ static int proc_file(char *rqfile, char *rspfile)
 		    if(do_mct(amode, akeysz, aKey, iVec, 
 			      dir, (unsigned char*)plaintext, len, 
 			      rfp) < 0)
-			EXIT(1);
+			err = 1;
 		    }
 		else
 		    {
-		    ret = AESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    AESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  plaintext, ciphertext, len);
 		    OutputValue("CIPHERTEXT",ciphertext,len,rfp,
@ -822,7 +819,7 @@ static int proc_file(char *rqfile, char *rspfile)
 		    }
 		else
 		    {
-		    ret = AESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    AESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  plaintext, ciphertext, len);
 		    OutputValue("PLAINTEXT",(unsigned char *)plaintext,len,rfp,
@ -850,6 +847,7 @@ static int proc_file(char *rqfile, char *rspfile)
 	fclose(rfp);
    if (afp)
 	fclose(afp);
+    FIPS_cipher_ctx_cleanup(&ctx);
    return err;
    }

@ -862,23 +860,26 @@ static int proc_file(char *rqfile, char *rspfile)
    aes_test -d xxxxx.xxx
  The default is: -d req.txt
 --------------------------------------------------*/
+#ifdef FIPS_ALGVS
+int fips_aesavs_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
    {
    char *rqlist = "req.txt", *rspfile = NULL;
    FILE *fp = NULL;
    char fn[250] = "", rfn[256] = "";
-    int f_opt = 0, d_opt = 1;
+    int d_opt = 1;
    fips_algtest_init();

    if (argc > 1)
 	{
-	if (strcasecmp(argv[1], "-d") == 0)
+	if (fips_strcasecmp(argv[1], "-d") == 0)
 	    {
 	    d_opt = 1;
 	    }
-	else if (strcasecmp(argv[1], "-f") == 0)
+	else if (fips_strcasecmp(argv[1], "-f") == 0)
 	    {
-	    f_opt = 1;
 	    d_opt = 0;
 	    }
 	else
@ -915,7 +916,7 @@ int main(int argc, char **argv)
 	    if (proc_file(rfn, rspfile))
 		{
 		printf(">>> Processing failed for: %s <<<\n", rfn);
-		EXIT(1);
+		return 1;
 		}
 	    }
 	fclose(fp);
@ -929,7 +930,6 @@ int main(int argc, char **argv)
 	    printf(">>> Processing failed for: %s <<<\n", fn);
 	    }
 	}
-    EXIT(0);
    return 0;
    }

--- a/fips/aes/fips_gcmtest.c
+++ b/fips/aes/fips_gcmtest.c
@ -75,10 +75,11 @@ int main(int argc, char **argv)

 #include "fips_utl.h"

+static char buf[204800];
+static char lbuf[204800];
+
 static void gcmtest(FILE *in, FILE *out, int encrypt)
 	{
-	char buf[2048];
-	char lbuf[2048];
 	char *keyword, *value;
 	int keylen = -1, ivlen = -1, aadlen = -1, taglen = -1, ptlen = -1;
 	int rv;
@ -261,16 +262,14 @@ static void gcmtest(FILE *in, FILE *out, int encrypt)
 			iv = aad = ct = pt = key = tag = NULL;
 			}
 		}
+	FIPS_cipher_ctx_cleanup(&ctx);	
 	}

 static void xtstest(FILE *in, FILE *out)
 	{
-	char buf[204800];
-	char lbuf[204800];
 	char *keyword, *value;
 	int inlen = 0;
 	int encrypt = 0;
-	int rv;
 	long l;
 	unsigned char *key = NULL, *iv = NULL;
 	unsigned char *inbuf = NULL, *outbuf = NULL;
@ -326,7 +325,7 @@ static void xtstest(FILE *in, FILE *out)
 			{
 			FIPS_cipherinit(&ctx, xts, key, iv, encrypt);
 			outbuf = OPENSSL_malloc(inlen);
-			rv = FIPS_cipher(&ctx, outbuf, inbuf, inlen);
+			FIPS_cipher(&ctx, outbuf, inbuf, inlen);
 			OutputValue(encrypt ? "CT":"PT", outbuf, inlen, out, 0);
 			OPENSSL_free(inbuf);
 			OPENSSL_free(outbuf);
@ -335,12 +334,11 @@ static void xtstest(FILE *in, FILE *out)
 			iv = key = inbuf = outbuf = NULL;
 			}	
 		}
+	FIPS_cipher_ctx_cleanup(&ctx);	
 	}

 static void ccmtest(FILE *in, FILE *out)
 	{
-	char buf[200048];
-	char lbuf[200048];
 	char *keyword, *value;
 	long l;
 	unsigned char *Key = NULL, *Nonce = NULL;
@ -428,6 +426,8 @@ static void ccmtest(FILE *in, FILE *out)
 			}
 		else if (!strcmp(keyword,"Adata"))
 			{
+			if (Adata)
+				OPENSSL_free(Adata);
 			Adata = hex2bin_m(value, &l);
 			if (Alen && l != Alen)
 				{
@ -493,10 +493,16 @@ static void ccmtest(FILE *in, FILE *out)
 		OPENSSL_free(Key);
 	if (Nonce)
 		OPENSSL_free(Nonce);
+	if (Adata)
+		OPENSSL_free(Adata);
 	FIPS_cipher_ctx_cleanup(&ctx);
 	}

-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_gcmtest_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
 	{
 	int encrypt;
 	int xts = 0, ccm = 0;
--- a/fips/cmac/fips_cmactest.c
+++ b/fips/cmac/fips_cmactest.c
@ -92,7 +92,11 @@ static int print_cmac_ver(const EVP_CIPHER *cipher, FILE *out,
 		unsigned char *Mac, int Maclen,
 		int Tlen);

+#ifdef FIPS_ALGVS
+int fips_cmactest_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 	int mode = 0;		/* 0 => Generate, 1 => Verify */
--- a/fips/des/fips_desmovs.c
+++ b/fips/des/fips_desmovs.c
@ -102,7 +102,7 @@ static int DESTest(EVP_CIPHER_CTX *ctx,
    if (akeysz != 192)
 	{
 	printf("Invalid key size: %d\n", akeysz);
-	EXIT(1);
+	return 0;
 	}

    if (fips_strcasecmp(amode, "CBC") == 0)
@ -120,7 +120,7 @@ static int DESTest(EVP_CIPHER_CTX *ctx,
    else
 	{
 	printf("Unknown mode: %s\n", amode);
-	EXIT(1);
+	return 0;
 	}

    if (FIPS_cipherinit(ctx, cipher, aKey, iVec, dir) <= 0)
@ -155,12 +155,12 @@ static void shiftin(unsigned char *dst,unsigned char *src,int nbits)
    }	

 /*-----------------------------------------------*/
-char *t_tag[2] = {"PLAINTEXT", "CIPHERTEXT"};
-char *t_mode[6] = {"CBC","ECB","OFB","CFB1","CFB8","CFB64"};
-enum Mode {CBC, ECB, OFB, CFB1, CFB8, CFB64};
+char *tdes_t_tag[2] = {"PLAINTEXT", "CIPHERTEXT"};
+char *tdes_t_mode[6] = {"CBC","ECB","OFB","CFB1","CFB8","CFB64"};
+enum tdes_Mode {TCBC, TECB, TOFB, TCFB1, TCFB8, TCFB64};
 int Sizes[6]={64,64,64,1,8,64};

-static void do_mct(char *amode, 
+static int do_tmct(char *amode, 
 	    int akeysz, int numkeys, unsigned char *akey,unsigned char *ivec,
 	    int dir, unsigned char *text, int len,
 	    FILE *rfp)
@ -170,12 +170,12 @@ static void do_mct(char *amode,
    unsigned char text0[8];

    for (imode=0 ; imode < 6 ; ++imode)
-	if(!strcmp(amode,t_mode[imode]))
+	if(!strcmp(amode,tdes_t_mode[imode]))
 	    break;
    if (imode == 6)
 	{ 
 	printf("Unrecognized mode: %s\n", amode);
-	EXIT(1);
+	return 0;
 	}
    for(i=0 ; i < 400 ; ++i)
 	{
@ -196,12 +196,12 @@ static void do_mct(char *amode,
 		OutputValue("",akey+n*8,8,rfp,0);
 		}

-	if(imode != ECB)
+	if(imode != TECB)
 	    OutputValue("IV",ivec,8,rfp,0);
-	OutputValue(t_tag[dir^1],text,len,rfp,imode == CFB1);
+	OutputValue(tdes_t_tag[dir^1],text,len,rfp,imode == TCFB1);
 #if 0
 	/* compensate for endianness */
-	if(imode == CFB1)
+	if(imode == TCFB1)
 	    text[0]<<=7;
 #endif
 	memcpy(text0,text,8);
@ -223,18 +223,18 @@ static void do_mct(char *amode,
 		}
 	    if(j == 9999)
 		{
-		OutputValue(t_tag[dir],text,len,rfp,imode == CFB1);
+		OutputValue(tdes_t_tag[dir],text,len,rfp,imode == TCFB1);
 		/*		memcpy(ivec,text,8); */
 		}
 	    /*	    DebugValue("iv",ctx.iv,8); */
 	    /* accumulate material for the next key */
 	    shiftin(nk,text,Sizes[imode]);
 	    /*	    DebugValue("nk",nk,24);*/
-	    if((dir && (imode == CFB1 || imode == CFB8 || imode == CFB64
-			|| imode == CBC)) || imode == OFB)
+	    if((dir && (imode == TCFB1 || imode == TCFB8
+			|| imode == TCFB64 || imode == TCBC)) || imode == TOFB)
 		memcpy(text,old_iv,8);

-	    if(!dir && (imode == CFB1 || imode == CFB8 || imode == CFB64))
+	    if(!dir && (imode == TCFB1 || imode == TCFB8 || imode == TCFB64))
 		{
 		/* the test specifies using the output of the raw DES operation
 		   which we don't have, so reconstruct it... */
@ -260,18 +260,20 @@ static void do_mct(char *amode,
 	/* pointless exercise - the final text doesn't depend on the
 	   initial text in OFB mode, so who cares what it is? (Who
 	   designed these tests?) */
-	if(imode == OFB)
+	if(imode == TOFB)
 	    for(n=0 ; n < 8 ; ++n)
 		text[n]=text0[n]^old_iv[n];
+	FIPS_cipher_ctx_cleanup(&ctx);
 	}
+    return 1;
    }
    
-static int proc_file(char *rqfile, char *rspfile)
+static int tproc_file(char *rqfile, char *rspfile)
    {
    char afn[256], rfn[256];
    FILE *afp = NULL, *rfp = NULL;
    char ibuf[2048], tbuf[2048];
-    int ilen, len, ret = 0;
+    int len;
    char amode[8] = "";
    char atest[100] = "";
    int akeysz=0;
@ -322,7 +324,6 @@ static int proc_file(char *rqfile, char *rspfile)
    while (!err && (fgets(ibuf, sizeof(ibuf), afp)) != NULL)
 	{
 	tidy_line(tbuf, ibuf);
-	ilen = strlen(ibuf);
 	/*	printf("step=%d ibuf=%s",step,ibuf);*/
 	if(step == 3 && !strcmp(amode,"ECB"))
 	    {
@ -355,10 +356,8 @@ static int proc_file(char *rqfile, char *rspfile)
 		char *xp, *pp = ibuf+2;
 		int n;
 		if(*amode)
-		    { /* insert current time & date */
-		    time_t rtim = time(0);
-		    fputs("# ", rfp);
-		    copy_line(ctime(&rtim), rfp);
+		    {
+		    copy_line(ibuf, rfp);
 		    }
 		else
 		    {
@ -546,12 +545,14 @@ static int proc_file(char *rqfile, char *rspfile)
 		PrintValue("PLAINTEXT", (unsigned char*)plaintext, len);
 		if (strcmp(atest, "Monte") == 0)  /* Monte Carlo Test */
 		    {
-		    do_mct(amode,akeysz,numkeys,aKey,iVec,dir,plaintext,len,rfp);
+		    if (!do_tmct(amode,akeysz,numkeys,aKey,iVec,
+					dir,plaintext,len,rfp))
+			return -1;
 		    }
 		else
 		    {
 		    assert(dir == 1);
-		    ret = DESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    DESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  ciphertext, plaintext, len);
 		    OutputValue("CIPHERTEXT",ciphertext,len,rfp,
@ -585,13 +586,13 @@ static int proc_file(char *rqfile, char *rspfile)
 		PrintValue("CIPHERTEXT", ciphertext, len);
 		if (strcmp(atest, "Monte") == 0)  /* Monte Carlo Test */
 		    {
-		    do_mct(amode, akeysz, numkeys, aKey, iVec, 
+		    do_tmct(amode, akeysz, numkeys, aKey, iVec, 
 			   dir, ciphertext, len, rfp);
 		    }
 		else
 		    {
 		    assert(dir == 0);
-		    ret = DESTest(&ctx, amode, akeysz, aKey, iVec, 
+		    DESTest(&ctx, amode, akeysz, aKey, iVec, 
 				  dir,  /* 0 = decrypt, 1 = encrypt */
 				  plaintext, ciphertext, len);
 		    OutputValue("PLAINTEXT",(unsigned char *)plaintext,len,rfp,
@ -619,6 +620,7 @@ static int proc_file(char *rqfile, char *rspfile)
 	fclose(rfp);
    if (afp)
 	fclose(afp);
+    FIPS_cipher_ctx_cleanup(&ctx);
    return err;
    }

@ -631,12 +633,16 @@ static int proc_file(char *rqfile, char *rspfile)
    aes_test -d xxxxx.xxx
  The default is: -d req.txt
 --------------------------------------------------*/
+#ifdef FIPS_ALGVS
+int fips_desmovs_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
    {
    char *rqlist = "req.txt", *rspfile = NULL;
    FILE *fp = NULL;
    char fn[250] = "", rfn[256] = "";
-    int f_opt = 0, d_opt = 1;
+    int d_opt = 1;

    fips_algtest_init();
    if (argc > 1)
@ -647,7 +653,6 @@ int main(int argc, char **argv)
 	    }
 	else if (fips_strcasecmp(argv[1], "-f") == 0)
 	    {
-	    f_opt = 1;
 	    d_opt = 0;
 	    }
 	else
@ -680,10 +685,10 @@ int main(int argc, char **argv)
 	    strtok(fn, "\r\n");
 	    strcpy(rfn, fn);
 	    printf("Processing: %s\n", rfn);
-	    if (proc_file(rfn, rspfile))
+	    if (tproc_file(rfn, rspfile))
 		{
 		printf(">>> Processing failed for: %s <<<\n", rfn);
-		EXIT(1);
+		return -1;
 		}
 	    }
 	fclose(fp);
@ -692,12 +697,11 @@ int main(int argc, char **argv)
 	{
 	if (VERBOSE)
 		printf("Processing: %s\n", fn);
-	if (proc_file(fn, rspfile))
+	if (tproc_file(fn, rspfile))
 	    {
 	    printf(">>> Processing failed for: %s <<<\n", fn);
 	    }
 	}
-    EXIT(0);
    return 0;
    }

--- a/fips/dh/fips_dhvs.c
+++ b/fips/dh/fips_dhvs.c
@ -145,8 +145,12 @@ static void output_Zhash(FILE *out, int exout,
 	OPENSSL_cleanse(Z, Zlen);
 	OPENSSL_free(Z);
 	}
-		
-int main(int argc,char **argv)
+
+#ifdef FIPS_ALGVS
+int fips_dhvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
 	{
 	char **args = argv + 1;
 	int argn = argc - 1;
@ -275,10 +279,14 @@ int main(int argc,char **argv)
 							rhash, rhashlen);
 			}
 		}
+	if (in && in != stdin)
+		fclose(in);
+	if (out && out != stdout)
+		fclose(out);
 	return 0;
 	parse_error:
 	fprintf(stderr, "Error Parsing request file\n");
-	exit(1);
+	return 1;
 	}

 #endif
--- a/fips/dsa/fips_dsa_sign.c
+++ b/fips/dsa/fips_dsa_sign.c
@ -114,4 +114,39 @@ int FIPS_dsa_verify_digest(DSA *dsa,
 	return dsa->meth->dsa_do_verify(dig,dlen,s,dsa);
 	}

+/* Digest msg with mhash and verify the DSA signature s over that digest.
+ * Returns whatever FIPS_dsa_verify_digest() returns, or -1 when the digest
+ * could not be computed (the signature is then never checked).
+ */
+int FIPS_dsa_verify(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, DSA_SIG *s)
+	{
+	int ret=-1;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen=0;
+
+	/* dig and dlen are only meaningful when FIPS_digest() succeeds */
+	if (FIPS_digest(msg, msglen, dig, &dlen, mhash) > 0)
+		ret=FIPS_dsa_verify_digest(dsa, dig, dlen, s);
+	/* wipe the whole buffer: dlen cannot be trusted after a failure */
+	OPENSSL_cleanse(dig, sizeof(dig));
+	return ret;
+	}
+
+/* Digest msg with mhash and sign that digest with dsa.
+ * Returns whatever FIPS_dsa_sign_digest() returns, or NULL when the digest
+ * could not be computed.
+ */
+DSA_SIG * FIPS_dsa_sign(DSA *dsa, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash)
+	{
+	DSA_SIG *s=NULL;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen=0;
+
+	/* never sign an uninitialised buffer if hashing failed */
+	if (FIPS_digest(msg, msglen, dig, &dlen, mhash) > 0)
+		s = FIPS_dsa_sign_digest(dsa, dig, dlen);
+	OPENSSL_cleanse(dig, sizeof(dig));
+	return s;
+	}
+
 #endif
--- a/fips/dsa/fips_dsatest.c
+++ b/fips/dsa/fips_dsatest.c
@ -62,8 +62,10 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#ifndef NO_SYS_TYPES_H
 #include <sys/types.h>
 #include <sys/stat.h>
+#endif

 #include "e_os.h"

@ -154,9 +156,7 @@ int main(int argc, char **argv)
 	unsigned char buf[256];
 	unsigned long h;
 	BN_GENCB cb;
-	EVP_MD_CTX mctx;
 	BN_GENCB_set(&cb, dsa_cb, stderr);
-	FIPS_md_ctx_init(&mctx);

    	fips_algtest_init();

@ -210,19 +210,11 @@ int main(int argc, char **argv)
 		}
 	DSA_generate_key(dsa);

-	if (!FIPS_digestinit(&mctx, EVP_sha1()))
-		goto end;
-	if (!FIPS_digestupdate(&mctx, str1, 20))
-		goto end;
-	sig = FIPS_dsa_sign_ctx(dsa, &mctx);
+	sig = FIPS_dsa_sign(dsa, str1, 20, EVP_sha1());
 	if (!sig)
 		goto end;

-	if (!FIPS_digestinit(&mctx, EVP_sha1()))
-		goto end;
-	if (!FIPS_digestupdate(&mctx, str1, 20))
-		goto end;
-	if (FIPS_dsa_verify_ctx(dsa, &mctx, sig) != 1)
+	if (FIPS_dsa_verify(dsa, str1, 20, EVP_sha1(), sig) != 1)
 		goto end;

 	ret = 1;
@ -231,7 +223,6 @@ end:
 	if (sig)
 		FIPS_dsa_sig_free(sig);
 	if (dsa != NULL) FIPS_dsa_free(dsa);
-	FIPS_md_ctx_cleanup(&mctx);
 #if 0
 	CRYPTO_mem_leaks(bio_err);
 #endif
--- a/fips/dsa/fips_dssvs.c
+++ b/fips/dsa/fips_dssvs.c
@ -46,7 +46,8 @@ static int parse_mod(char *line, int *pdsa2, int *pL, int *pN,
 	if (strcmp(keyword, "L"))
 		return 0;
 	*pL = atoi(value);
-	strcpy(line, p + 1);
+	strcpy(lbuf, p + 1);
+        strcpy(line, lbuf);
 	if (pmd)
 		p = strchr(line, ',');
 	else
@ -199,6 +200,7 @@ static void pqg(FILE *in, FILE *out)
 			{
 			fprintf(out, "counter = %d" RESP_EOL RESP_EOL, counter);
 			}
+		FIPS_dsa_free(dsa);
 		}
 	    }
 	else if(!strcmp(keyword,"P"))
@ -519,6 +521,8 @@ static void keyver(FILE *in, FILE *out)
 	    BN_free(g);
 	if (Y2)
 	    BN_free(Y2);
+	if (ctx)
+	    BN_CTX_free(ctx);
    }

 static void keypair(FILE *in, FILE *out)
@ -575,6 +579,8 @@ static void keypair(FILE *in, FILE *out)
 		do_bn_print_name(out, "Y",dsa->pub_key);
 	    	fputs(RESP_EOL, out);
 		}
+	    if (dsa)
+		FIPS_dsa_free(dsa);
 	    }
 	}
    }
@ -627,9 +633,7 @@ static void siggen(FILE *in, FILE *out)
 	    {
 	    unsigned char msg[1024];
 	    int n;
-	    EVP_MD_CTX mctx;
 	    DSA_SIG *sig;
-	    FIPS_md_ctx_init(&mctx);

 	    n=hex2bin(value,msg);

@ -637,19 +641,16 @@ static void siggen(FILE *in, FILE *out)
 		exit(1);
 	    do_bn_print_name(out, "Y",dsa->pub_key);

-	    FIPS_digestinit(&mctx, md);
-	    FIPS_digestupdate(&mctx, msg, n);
-	    sig = FIPS_dsa_sign_ctx(dsa, &mctx);
+	    sig = FIPS_dsa_sign(dsa, msg, n, md);

 	    do_bn_print_name(out, "R",sig->r);
 	    do_bn_print_name(out, "S",sig->s);
 	    fputs(RESP_EOL, out);
 	    FIPS_dsa_sig_free(sig);
-	    FIPS_md_ctx_cleanup(&mctx);
 	    }
 	}
-	if (dsa)
-		FIPS_dsa_free(dsa);
+    if (dsa)
+	FIPS_dsa_free(dsa);
    }

 static void sigver(FILE *in, FILE *out)
@ -687,37 +688,48 @@ static void sigver(FILE *in, FILE *out)
 	    dsa = FIPS_dsa_new();
 	    }
 	else if(!strcmp(keyword,"P"))
-	    dsa->p=hex2bn(value);
+	    do_hex2bn(&dsa->p, value);
 	else if(!strcmp(keyword,"Q"))
-	    dsa->q=hex2bn(value);
+	    do_hex2bn(&dsa->q, value);
 	else if(!strcmp(keyword,"G"))
-	    dsa->g=hex2bn(value);
+	    do_hex2bn(&dsa->g, value);
 	else if(!strcmp(keyword,"Msg"))
 	    n=hex2bin(value,msg);
 	else if(!strcmp(keyword,"Y"))
-	    dsa->pub_key=hex2bn(value);
+	    do_hex2bn(&dsa->pub_key, value);
 	else if(!strcmp(keyword,"R"))
 	    sig->r=hex2bn(value);
 	else if(!strcmp(keyword,"S"))
 	    {
-	    EVP_MD_CTX mctx;
 	    int r;
-	    FIPS_md_ctx_init(&mctx);
 	    sig->s=hex2bn(value);

-	    FIPS_digestinit(&mctx, md);
-	    FIPS_digestupdate(&mctx, msg, n);
 	    no_err = 1;
-	    r = FIPS_dsa_verify_ctx(dsa, &mctx, sig);
+	    r = FIPS_dsa_verify(dsa, msg, n, md, sig);
 	    no_err = 0;
-	    FIPS_md_ctx_cleanup(&mctx);
+	    if (sig->s)
+		{
+		BN_free(sig->s);
+		sig->s = NULL;
+		}
+	    if (sig->r)
+		{
+		BN_free(sig->r);
+		sig->r = NULL;
+		}
 	
 	    fprintf(out, "Result = %c" RESP_EOL RESP_EOL, r == 1 ? 'P' : 'F');
 	    }
 	}
+	if (dsa)
+	    FIPS_dsa_free(dsa);
    }

-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_dssvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
    {
    FILE *in, *out;
    if (argc == 4)
--- a/fips/ecdh/fips_ecdh_selftest.c
+++ b/fips/ecdh/fips_ecdh_selftest.c
@ -166,6 +166,7 @@ int FIPS_selftest_ecdh(void)
 			rv = -1;
 			goto err;
 			}
+		EC_KEY_set_flags(ec1, EC_FLAG_COFACTOR_ECDH);

 		if (!EC_KEY_set_public_key_affine_coordinates(ec1, x, y))
 			{
@ -194,6 +195,7 @@ int FIPS_selftest_ecdh(void)
 			rv = -1;
 			goto err;
 			}
+		EC_KEY_set_flags(ec1, EC_FLAG_COFACTOR_ECDH);

 		if (!EC_KEY_set_public_key_affine_coordinates(ec2, x, y))
 			{
--- a/fips/ecdh/fips_ecdhvs.c
+++ b/fips/ecdh/fips_ecdhvs.c
@ -76,7 +76,7 @@ int main(int argc, char **argv)

 #include "fips_utl.h"

-static const EVP_MD *parse_md(char *line)
+static const EVP_MD *eparse_md(char *line)
 	{
 	char *p;
 	if (line[0] != '[' || line[1] != 'E')
@ -261,6 +261,7 @@ static void ec_output_Zhash(FILE *out, int exout, EC_GROUP *group,
 	unsigned char chash[EVP_MAX_MD_SIZE];
 	int Zlen;
 	ec = EC_KEY_new();
+	EC_KEY_set_flags(ec, EC_FLAG_COFACTOR_ECDH);
 	EC_KEY_set_group(ec, group);
 	peerkey = make_peer(group, cx, cy);
 	if (rhash == NULL)
@ -301,7 +302,11 @@ static void ec_output_Zhash(FILE *out, int exout, EC_GROUP *group,
 	EC_POINT_free(peerkey);
 	}
 		
-int main(int argc,char **argv)
+#ifdef FIPS_ALGVS
+int fips_ecdhvs_main(int argc, char **argv)
+#else
+int main(int argc, char **argv)
+#endif
 	{
 	char **args = argv + 1;
 	int argn = argc - 1;
@ -315,6 +320,7 @@ int main(int argc,char **argv)
 	EC_GROUP *group = NULL;
 	char *keyword = NULL, *value = NULL;
 	int do_verify = -1, exout = 0;
+	int rv = 1;

 	int curve_nids[5] = {0,0,0,0,0};
 	int param_set = -1;
@ -408,11 +414,16 @@ int main(int argc,char **argv)
 			if (group)
 				EC_GROUP_free(group);
 			group = EC_GROUP_new_by_curve_name(nid);
+			if (!group)
+				{
+				fprintf(stderr, "ERROR: unsupported curve %s\n", buf + 1);
+				return 1;
+				}
 			}

 		if (strlen(buf) > 6 && !strncmp(buf, "[E", 2))
 			{
-			md = parse_md(buf);
+			md = eparse_md(buf);
 			if (md == NULL)
 				goto parse_error;
 			continue;
@ -459,10 +470,27 @@ int main(int argc,char **argv)
 					md, rhash, rhashlen);
 			}
 		}
-	return 0;
+	rv = 0;
 	parse_error:
-	fprintf(stderr, "Error Parsing request file\n");
-	exit(1);
+	if (id)
+		BN_free(id);
+	if (ix)
+		BN_free(ix);
+	if (iy)
+		BN_free(iy);
+	if (cx)
+		BN_free(cx);
+	if (cy)
+		BN_free(cy);
+	if (group)
+		EC_GROUP_free(group);
+	if (in && in != stdin)
+		fclose(in);
+	if (out && out != stdout)
+		fclose(out);
+	if (rv)
+		fprintf(stderr, "Error Parsing request file\n");
+	return rv;
 	}

 #endif
--- a/fips/ecdsa/fips_ecdsa_sign.c
+++ b/fips/ecdsa/fips_ecdsa_sign.c
@ -87,3 +87,39 @@ int FIPS_ecdsa_verify_ctx(EC_KEY *key, EVP_MD_CTX *ctx, ECDSA_SIG *s)
 	return ret;
 	}

+/* Digest msg with mhash and verify the ECDSA signature s over that digest.
+ * Returns whatever FIPS_ecdsa_verify_digest() returns, or -1 when the
+ * digest could not be computed (the signature is then never checked).
+ */
+int FIPS_ecdsa_verify(EC_KEY *key, const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash, ECDSA_SIG *s)
+	{
+	int ret=-1;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen=0;
+
+	/* dig and dlen are only meaningful when FIPS_digest() succeeds */
+	if (FIPS_digest(msg, msglen, dig, &dlen, mhash) > 0)
+		ret=FIPS_ecdsa_verify_digest(key, dig, dlen, s);
+	/* wipe the whole buffer: dlen cannot be trusted after a failure */
+	OPENSSL_cleanse(dig, sizeof(dig));
+	return ret;
+	}
+
+/* Digest msg with mhash and sign that digest with key.
+ * Returns whatever FIPS_ecdsa_sign_digest() returns, or NULL when the
+ * digest could not be computed.
+ */
+ECDSA_SIG * FIPS_ecdsa_sign(EC_KEY *key,
+			const unsigned char *msg, size_t msglen,
+			const EVP_MD *mhash)
+	{
+	ECDSA_SIG *s=NULL;
+	unsigned char dig[EVP_MAX_MD_SIZE];
+	unsigned int dlen=0;
+
+	/* never sign an uninitialised buffer if hashing failed */
+	if (FIPS_digest(msg, msglen, dig, &dlen, mhash) > 0)
+		s = FIPS_ecdsa_sign_digest(key, dig, dlen);
+	OPENSSL_cleanse(dig, sizeof(dig));
+	return s;
+	}
+
--- a/fips/ecdsa/fips_ecdsavs.c
+++ b/fips/ecdsa/fips_ecdsavs.c
@ -75,7 +75,7 @@ int main(int argc, char **argv)
 #include <openssl/objects.h>


-static int lookup_curve(char *in, char *curve_name, const EVP_MD **pmd)
+static int elookup_curve(char *in, char *curve_name, const EVP_MD **pmd)
 	{
 	char *cname, *p;
 	/* Copy buffer as we will change it */
@ -200,7 +200,7 @@ static int KeyPair(FILE *in, FILE *out)
 		if (*buf == '[' && buf[2] == '-')
 			{
 			if (buf[2] == '-')
-			curve_nid = lookup_curve(buf, lbuf, NULL);
+			curve_nid = elookup_curve(buf, lbuf, NULL);
 			fputs(buf, out);
 			continue;
 			}
@ -260,7 +260,7 @@ static int PKV(FILE *in, FILE *out)
 		fputs(buf, out);
 		if (*buf == '[' && buf[2] == '-')
 			{
-			curve_nid = lookup_curve(buf, lbuf, NULL);
+			curve_nid = elookup_curve(buf, lbuf, NULL);
 			if (curve_nid == NID_undef)
 				return 0;
 				
@ -287,10 +287,13 @@ static int PKV(FILE *in, FILE *out)
 			no_err = 1;
 			rv = EC_KEY_set_public_key_affine_coordinates(key, Qx, Qy);
 			no_err = 0;
+			EC_KEY_free(key);
 			fprintf(out, "Result = %s" RESP_EOL, rv ? "P":"F");
 			}

 		}
+	BN_free(Qx);
+	BN_free(Qy);
 	return 1;
 	}

@ -305,8 +308,6 @@ static int SigGen(FILE *in, FILE *out)
 	EC_KEY *key = NULL;
 	ECDSA_SIG *sig = NULL;
 	const EVP_MD *digest = NULL;
-	EVP_MD_CTX mctx;
-	EVP_MD_CTX_init(&mctx);
 	Qx = BN_new();
 	Qy = BN_new();
 	while(fgets(buf, sizeof buf, in) != NULL)
@ -314,7 +315,7 @@ static int SigGen(FILE *in, FILE *out)
 		fputs(buf, out);
 		if (*buf == '[')
 			{
-			curve_nid = lookup_curve(buf, lbuf, &digest);
+			curve_nid = elookup_curve(buf, lbuf, &digest);
 			if (curve_nid == NID_undef)
 				return 0;
 			}
@ -342,9 +343,7 @@ static int SigGen(FILE *in, FILE *out)
 				return 0;
 				}

-			FIPS_digestinit(&mctx, digest);
-			FIPS_digestupdate(&mctx, msg, mlen);
-	    		sig = FIPS_ecdsa_sign_ctx(key, &mctx);
+	    		sig = FIPS_ecdsa_sign(key, msg, mlen, digest);

 			if (!sig)
 				{
@ -358,7 +357,7 @@ static int SigGen(FILE *in, FILE *out)
 			do_bn_print_name(out, "S", sig->s);

 			EC_KEY_free(key);
-
+			OPENSSL_free(msg);
 			FIPS_ecdsa_sig_free(sig);

 			}
@ -366,7 +365,6 @@ static int SigGen(FILE *in, FILE *out)
 		}
 	BN_free(Qx);
 	BN_free(Qy);
-	FIPS_md_ctx_cleanup(&mctx);
 	return 1;
 	}

@ -381,8 +379,6 @@ static int SigVer(FILE *in, FILE *out)
 	EC_KEY *key = NULL;
 	ECDSA_SIG sg, *sig = &sg;
 	const EVP_MD *digest = NULL;
-	EVP_MD_CTX mctx;
-	EVP_MD_CTX_init(&mctx);
 	sig->r = NULL;
 	sig->s = NULL;
 	while(fgets(buf, sizeof buf, in) != NULL)
@ -390,7 +386,7 @@ static int SigVer(FILE *in, FILE *out)
 		fputs(buf, out);
 		if (*buf == '[')
 			{
-			curve_nid = lookup_curve(buf, lbuf, &digest);
+			curve_nid = elookup_curve(buf, lbuf, &digest);
 			if (curve_nid == NID_undef)
 				return 0;
 			}
@ -447,20 +443,32 @@ static int SigVer(FILE *in, FILE *out)
 				return 0;
 				}

-			FIPS_digestinit(&mctx, digest);
-			FIPS_digestupdate(&mctx, msg, mlen);
 			no_err = 1;
-	    		rv = FIPS_ecdsa_verify_ctx(key, &mctx, sig);
+	    		rv = FIPS_ecdsa_verify(key, msg, mlen, digest, sig);
+			EC_KEY_free(key);
+			if (msg)
+				OPENSSL_free(msg);
 			no_err = 0;

 			fprintf(out, "Result = %s" RESP_EOL, rv ? "P":"F");
 			}

 		}
+	if (sig->r)
+		BN_free(sig->r);
+	if (sig->s)
+		BN_free(sig->s);
+	if (Qx)
+		BN_free(Qx);
+	if (Qy)
+		BN_free(Qy);
 	return 1;
 	}
-
+#ifdef FIPS_ALGVS
+int fips_ecdsavs_main(int argc, char **argv)
+#else
 int main(int argc, char **argv)
+#endif
 	{
 	FILE *in = NULL, *out = NULL;
 	const char *cmd = argv[1];
--- a/fips/fips.c
+++ b/fips/fips.c
@ -81,7 +81,7 @@ static int fips_started = 0;
 static int fips_is_owning_thread(void);
 static int fips_set_owning_thread(void);
 static int fips_clear_owning_thread(void);
-static unsigned char *fips_signature_witness(void);
+static const unsigned char *fips_signature_witness(void);

 #define fips_w_lock()	CRYPTO_w_lock(CRYPTO_LOCK_FIPS)
 #define fips_w_unlock()	CRYPTO_w_unlock(CRYPTO_LOCK_FIPS)
@ -148,7 +148,10 @@ void fips_set_selftest_fail(void)

 extern const void         *FIPS_text_start(),  *FIPS_text_end();
 extern const unsigned char FIPS_rodata_start[], FIPS_rodata_end[];
-unsigned char              FIPS_signature [20] = { 0 };
+#ifdef _TMS320C6X
+const
+#endif
+unsigned char              FIPS_signature [20] = { 0, 0xff };
 __fips_constseg
 static const char          FIPS_hmac_key[]="etaonrishdlcupfm";

@ -413,9 +416,8 @@ int fips_clear_owning_thread(void)
 	return ret;
 	}

-unsigned char *fips_signature_witness(void)
+const unsigned char *fips_signature_witness(void)
 	{
-	extern unsigned char FIPS_signature[];
 	return FIPS_signature;
 	}

--- a/Show more
+++ b/Show more
Author	SHA1	Message	Date
Steve Marquess	d11e6a4410	Add linux-mips32be target for new platform Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Andy Polyakov <appro@openssl.org> (Merged from https://github.com/openssl/openssl/pull/3300) (cherry picked from commit `d674242a88`)	2017-08-30 21:45:26 +01:00
Andy Polyakov	e576b67e1a	c6x/* "facelift": - make scripts executable; - "parameterize" platform selection in c6x/do_fips; - add c6x/fips_algvs.mak; - add c6x/run6x.js launcher for more recent CCS versions; Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Stephen Henson <steve@openssl.org> (Merged from https://github.com/openssl/openssl/pull/4265) (cherry picked from commit `781280094a`)	2017-08-30 21:27:46 +01:00
Andy Polyakov	7d91d9ea6b	Add some C64x assembly modules [by minor adjustments of C64x+ modules]. AES, SHA256 and SHA512 modules can actually replace corresponding C64x+ modules. This is because C64x+ instructions don't actually provide "killer-argument" advantage in these modules. As for SHA1, even though its performance exactly same, C64x+ module is more responsive to interrupts, i.e. doesn't inhibit them for as long periods as C64x module. Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Stephen Henson <steve@openssl.org> (Merged from https://github.com/openssl/openssl/pull/4265) (cherry picked from commit `5526e5791f`)	2017-08-30 21:26:43 +01:00
Steve Marquess	e1a9268d81	Update to match latest CAVS Reviewed-by: Stephen Henson <steve@openssl.org>	2016-11-14 17:00:41 -05:00
Andy Polyakov	2e691f8fa8	FIPS MIPS assembly pack refresh. Backport CVE-2014-3570 bug and portability fixes. Reviewed-by: Rich Salz <rsalz@openssl.org> (cherry picked from commit `10fa6736b1`)	2016-11-14 21:32:05 +01:00
Andy Polyakov	894c04aa05	PowerPC assembly pack: add POWER8 support. Reviewed-by: Dr. Stephen Henson <steve@openssl.org> (cherry picked from commit `4577871ca3`)	2016-06-21 23:44:54 +02:00
Steve Marquess	1278ce48a5	Add target for i686 cross compilation Reviewed-by: Stephen Henson <steve@openssl.org>	2016-02-15 10:26:20 -05:00
Steve Marquess	a0f8d282d7	Add new iOS subdirectory Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Stephen Henson <steve@openssl.org>	2015-07-04 15:18:46 -04:00
Andy Polyakov	0f38e9cd78	Add new VxWorks x86 platform Reviewed-by: Rich Salz <rsalz@openssl.org> Reviewed-by: Stephen Henson <steve@openssl.org>	2015-07-04 15:17:45 -04:00
Andy Polyakov	34f39b062c	util/incore update that allows FINGERPRINT_premain-free build. As for complementary fips.c modification. Goal is to ensure that FIPS_signature does not end up in .bss segment, one guaranteed to be zeroed upon program start-up. One would expect explicitly initialized values to end up in .data segment, but it turned out that values explicitly initialized with zeros can end up in .bss. The modification does not affect program flow, because first byte was the only one of significance [to FINGERPRINT_premain]. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:48:08 +02:00
Andy Polyakov	6db8e3bdc9	Add support for Android 5, both 32- and 64-bit cases. Special note about additional -pie flag in android-armv7. The initial reason for adding it is that Android 5 refuses to execute non-PIE binaries. But what about older systems and previously validated platforms? It should be noted that flag is not used when compiling object code, fipscanister.o in this context, only when linking applications, supplementary fips_algvs used during validation procedure. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:47:55 +02:00
Andy Polyakov	50e2a0ea46	Additional vxWorks target. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:47:43 +02:00
Andy Polyakov	3f137e6f1d	fipsalgtest.pl update. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:47:32 +02:00
Andy Polyakov	97fbb0c88c	Configure: add ios-cross target with ARM assembly support. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:47:21 +02:00
Andy Polyakov	5837e90f08	Add iOS-specific armv4cpud.S module. Normally it would be generated from a perlasm module, but doing so would affect existing armv4cpuid.S, which in turn would formally void previously validated platforms. Hence separate module is generated. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:47:10 +02:00
Andy Polyakov	874faf2ffb	Adapt ARM assembly pack for iOS. This is achieved by filtering perlasm output through arm-xlate.pl. But note that it's done only if "flavour" argument is not 'void'. As 'void' is default value for other ARM targets, perlasm output is not actually filtered on previously validated platforms. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:46:58 +02:00
Andy Polyakov	0b45df73d2	crypto/modes/modes_lcl.h: let STRICT_ALIGNMENT be on iOS. While ARMv7 in general is capable of unaligned access, not all instructions actually are. And trouble is that compiler doesn't seem to differentiate those capable and incapable of unaligned access. As result exceptions could be observed in xts128.c and ccm128.c modules. Contemporary Linux kernels handle such exceptions by performing requested operation and resuming execution as if it succeeded. While on iOS exception is fatal. Correct solution is to let STRICT_ALIGNMENT be on all ARM platforms, but doing so is in formal conflict with FIPS maintenance policy. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:46:44 +02:00
Andy Polyakov	2bd3976ed0	Add iOS-specific fips_algvs application. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:46:26 +02:00
Andy Polyakov	c6d109051d	Configure: engage ARMv8 assembly pack in ios64-cross target. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:45:50 +02:00
Andy Polyakov	083ed53def	Engage ARMv8 assembly pack. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:45:07 +02:00
Andy Polyakov	b84813ec01	Add ARMv8 assembly pack. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>	2015-05-13 16:43:25 +02:00
Dr. Stephen Henson	7447e65fcc	support for iOS 7.x/ARMv8 Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Steve Marquess <marquess@openssl.org>	2014-10-24 20:41:49 +01:00
Dr. Stephen Henson	60cd2b7206	Update fipsalgtest.pl to cope with changes in file names and format X9.31 tests need to look in files for '9.31' RSA-PSS tests may contain additional text as well as "salt len: n". We now just look at the start of a filename for a match. Separate ECDSA2 test list. Reorder test to handle new formats: for example PQGVer for DSA2 can be detected based on file format but if this fails revert to PQGVER. For future debugging add a --debug-detect option which prints out more details of the test detection including the first few lines of each request file. Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Steve Marquess <marquess@openssl.org>	2014-10-24 20:32:27 +01:00
Dr. Stephen Henson	7fb7844f3b	Remove Dual EC DRBG again... Dual EC DRBG removal now accepted for 2.0.8 onwards.	2014-07-11 19:14:15 +01:00
Dr. Stephen Henson	005563bbce	Add linux-x86_64-cross target.	2014-05-12 18:38:41 +01:00
Dr. Stephen Henson	3b43568d5b	Revert "Remove Dual EC DRBG from FIPS module." Revert Dual EC DRBG removal commit as it was not accepted for 2.0.7 version of the module. This reverts commit `200f249b8c`.	2014-05-12 18:35:30 +01:00
Dr. Stephen Henson	2659a2aa7c	QNX6-armv4 support.	2013-12-16 21:41:07 +00:00
Dr. Stephen Henson	200f249b8c	Remove Dual EC DRBG from FIPS module.	2013-12-16 19:00:58 +00:00
Dr. Stephen Henson	4089bd6080	eCos ARMv4/5 support	2013-12-16 14:29:20 +00:00
Andy Polyakov	baab0cf780	sha1-armv4-large.pl: comply with ABI. (cherry picked from commit `1a9d60d2e3`)	2013-12-16 14:08:56 +00:00
Dr. Stephen Henson	7e98d95f56	Don't require tag before ciphertext in AESGCM mode (cherry picked from commit `964eaad78c`)	2013-12-16 14:08:30 +00:00
Dr. Stephen Henson	b0ee17ad47	Add MIPS support.	2013-12-16 14:07:18 +00:00
Dr. Stephen Henson	4f6c4c1896	Support for WinEC7.	2013-04-10 15:38:24 +01:00
Dr. Stephen Henson	2d183e4c44	Add BSD-ppc85xx support and avoid copying overlapping buffers in fips_dssvs.c	2012-10-14 12:02:53 +00:00
Dr. Stephen Henson	7049d13c5f	update CHANGES	2012-10-04 14:10:12 +00:00
Dr. Stephen Henson	c616200172	Add support for Windows CE and C64+ to FIPS module.	2012-10-04 13:27:11 +00:00
Dr. Stephen Henson	7b899c10cd	file msincore was added on branch OpenSSL-fips-2_0-stable on 2012-10-04 13:27:10 +0000	2012-05-23 17:07:25 +00:00
Dr. Stephen Henson	6b2e340bdd	file hmac_sha1.pl was added on branch OpenSSL-fips-2_0-stable on 2012-10-04 13:27:10 +0000	2012-05-23 17:07:24 +00:00
Dr. Stephen Henson	1de6a62222	revert fipslink.pl unlink retry change	2012-01-18 15:07:11 +00:00
Dr. Stephen Henson	ac381944ac	give a hand old assemblers assembling loop instruction. (original by Andy)	2012-01-18 14:54:20 +00:00
Dr. Stephen Henson	24fadf2a20	typo	2012-01-03 19:43:06 +00:00
Dr. Stephen Henson	409abd2fec	Prepare RC8	2012-01-03 14:23:54 +00:00
Dr. Stephen Henson	421de62232	unlink target and retry to avoid intermittent Win32 failures	2012-01-03 14:22:45 +00:00
Dr. Stephen Henson	c567812fa6	set version to rc8-dev	2011-12-12 14:02:57 +00:00
Dr. Stephen Henson	49dbcbaa4b	Prepare for RC7.	2011-12-12 13:44:05 +00:00
Dr. Stephen Henson	df0884ffb7	Retry rename operation with a slight delay to workaround problems on some versions of Windows.	2011-12-10 18:06:55 +00:00
Dr. Stephen Henson	0e480d5553	use different names for asm temp files to avoid problems on some platforms	2011-12-10 13:29:23 +00:00
Dr. Stephen Henson	7c0d30038f	Close file streams in FIPS algorithm test utilities.	2011-12-08 15:14:38 +00:00
Dr. Stephen Henson	81fc8cd029	prepare for RC6	2011-12-04 21:29:08 +00:00
Dr. Stephen Henson	1d235039d6	For FIPS builds we don't use the normal test files (and in the restricted tarball some don't exist) so set TEST='' to avoid linking to them. This also avoids problems on platforms that copy instead of symlink.	2011-12-04 15:26:26 +00:00
Dr. Stephen Henson	58886fdefc	use BUILD_ONE_CMD for fips specific links otherwise we effectively do 'make links' twice	2011-12-04 15:14:13 +00:00
Dr. Stephen Henson	61c3085d47	Workaround for VxWorks	2011-12-04 15:11:44 +00:00
Dr. Stephen Henson	32b56fe4d2	avoid use of symlinks on Windows: it causes problems on some build environments	2011-12-04 15:04:20 +00:00
Dr. Stephen Henson	efd031abca	Fix x86cpuid so it doesn't fail for some (currently theoretical) virtual machines.	2011-12-03 21:47:48 +00:00
Dr. Stephen Henson	dd4eefdb7b	Change EVP_MAXCHUNK so it doesn't wraparound to 0 on some platforms (IP32L64).	2011-12-03 21:44:01 +00:00
Dr. Stephen Henson	fcd3e8e97b	Prepare for RC6.	2011-12-03 19:51:52 +00:00
Dr. Stephen Henson	476e7e4972	Add tests to ensure ECDSA key gen and DSA signing fails if DRBG entropy source fails.	2011-12-03 19:41:28 +00:00
Dr. Stephen Henson	5e900f3cef	functions aren't unused: revert	2011-12-03 19:19:34 +00:00
Dr. Stephen Henson	75b250a4ed	remove unused functions from module	2011-12-03 18:27:31 +00:00
Dr. Stephen Henson	44cb365eaf	bn/asm/mips.pl: fix typos [from HEAD], original by Andy	2011-12-03 18:26:26 +00:00
Dr. Stephen Henson	9bd2dde42f	prepare for rc5	2011-11-25 16:27:19 +00:00
Dr. Stephen Henson	31bf5f13e0	return error if counter exceeds limit and seed value supplied	2011-11-25 16:03:27 +00:00
Dr. Stephen Henson	7dcdc0d94d	check counter value against 4 * L, not 4096	2011-11-25 15:00:20 +00:00
Dr. Stephen Henson	6ecd287acc	bump version for rc5-dev: hopefully will never be needed...	2011-11-21 00:05:15 +00:00
Dr. Stephen Henson	0e508c12e0	prepare for rc4	2011-11-19 17:04:28 +00:00
Dr. Stephen Henson	f6385248f6	Add flag to support cofactor ECDH	2011-11-19 17:03:44 +00:00
Dr. Stephen Henson	52876c3100	bump version to rc4-dev	2011-11-18 21:59:36 +00:00
Dr. Stephen Henson	c08128acc2	prepare for RC3	2011-11-18 18:50:57 +00:00
Dr. Stephen Henson	901b9b5c36	In EC_KEY_set_public_key_affine_coordinates include explicit check to see passed components do not exceed field order	2011-11-16 13:28:11 +00:00
Dr. Stephen Henson	9eca2399f1	portability fix for some perl versions	2011-11-11 19:01:11 +00:00
Dr. Stephen Henson	3b4fb53221	fclose streams in fips_drbvs.c Produced error message for unsupported curves in fips_ecdhvs.c	2011-11-09 14:23:17 +00:00
Dr. Stephen Henson	7437036cdf	Prepare for RC3 (which may never happen).	2011-11-08 19:08:40 +00:00
Andy Polyakov	ffa76736fa	Platform update from HEAD.	2011-11-08 14:44:55 +00:00
Dr. Stephen Henson	cbed6cfcaa	add fips_algvs.c to restricted tarball	2011-11-07 13:54:30 +00:00
Dr. Stephen Henson	be6dc7e56b	Prepare for RC2	2011-11-07 13:18:12 +00:00
Dr. Stephen Henson	bb25a72881	MacOS and iOS support	2011-11-07 13:16:55 +00:00
Andy Polyakov	1562ce17cb	fipsld, incore: switch to new cross-compile support [from HEAD].	2011-11-07 00:22:59 +00:00
Andy Polyakov	68b2f55b90	e_aes.c: fold aesni_xts_cipher and [most importantly] fix aes_xts_cipher's return value after custom flag was rightly reverted [from HEAD].	2011-11-06 19:49:58 +00:00
Dr. Stephen Henson	79f2c9d1cd	check for unset entropy and nonce callbacks	2011-11-06 13:08:54 +00:00
Dr. Stephen Henson	8a794abd9d	Update fips_test_suite to take multiple command line options and an induced error checking function.	2011-11-06 12:52:27 +00:00
Dr. Stephen Henson	03eae35352	typo	2011-11-05 18:25:16 +00:00
Dr. Stephen Henson	df64f34e84	make post failure simulation reversible in all cases	2011-11-05 18:15:01 +00:00
Dr. Stephen Henson	21a5cb2696	typo: use key for POST callback	2011-11-05 18:11:16 +00:00
Dr. Stephen Henson	01fc2c1598	fix set but unused warnings	2011-11-05 18:04:50 +00:00
Andy Polyakov	04c8062636	armv4cpuid.S, armv4-gf2m.pl: make newest code compilable by older assembler [from HEAD].	2011-11-05 13:57:02 +00:00
Andy Polyakov	6fcc2bbce8	x86cpuid.pl: don't punish "last-year" OSes on "this-year" CPUs [from HEAD]. PR: 2633	2011-11-05 13:56:10 +00:00
Andy Polyakov	f2b0cf9178	ppc.pl: fix bug in bn_mul_comba4 [from HEAD]. PR: 2636 Submitted by: Charles Bryant	2011-11-05 13:55:20 +00:00
Dr. Stephen Henson	485ef852ac	Add single call public key sign and verify functions.	2011-11-05 01:32:52 +00:00
Dr. Stephen Henson	b7de76b74d	Add support for memory leak checking in fips_algvs. Fix many memory leaks in algorithm test utilities.	2011-11-02 19:16:43 +00:00
Dr. Stephen Henson	8ab0d50c43	Remove duplicate test from health check. Fix memory leaks by uninstantiating DRBG before reinitialising it.	2011-11-02 16:35:24 +00:00
Dr. Stephen Henson	cb47a7107f	Print out an error for "make test" in FIPS builds.	2011-11-02 00:43:45 +00:00
Dr. Stephen Henson	d5939062d7	Replace exit calls with return in fips_test_suite	2011-11-02 00:07:15 +00:00
Dr. Stephen Henson	8b8096d082	Add support for multicall fips_algvs utility combining functionality of all fips test utilities in a single binary and some minimal script parsing for platforms lacking a suitable shell. In order to keep changes to the build system to a minimum it #includes all the utilities C source files (yuck).	2011-11-01 13:45:30 +00:00
Dr. Stephen Henson	9ab6d6813e	PR: 2632 Submitted by: emmanuel.azencot@bull.net Reviewed by: steve Return -1 immediately if not affine coordinates as BN_CTX has not been set up.	2011-10-26 16:46:20 +00:00
Dr. Stephen Henson	45e5f551ac	Prepare for RC2.	2011-10-24 16:58:49 +00:00
Dr. Stephen Henson	51035e733c	prepare for RC1	2011-10-24 16:53:59 +00:00
Dr. Stephen Henson	319c7264b0	typo	2011-10-24 13:24:28 +00:00
cvs2svn	0684e77866	This commit was manufactured by cvs2svn to create branch 'OpenSSL-fips- 2_0-stable'.	2011-10-24 06:00:07 +00:00