Reorganize and speed up MD5.

Submitted by: Andy Polyakov <appro@fy.chalmers.se>
1999-05-13 13:16:42 +00:00 · 1999-05-13 13:16:42 +00:00 · bd3576d2dd
commit bd3576d2dd
parent 7d7d2cbcb0
10 changed files with 1872 additions and 345 deletions
--- a/3
+++ b/3
@ -5,6 +5,9 @@

 Changes between 0.9.2b and 0.9.3

+  *) Reorganize and speed up MD5.
+     [Andy Polyakov <appro@fy.chalmers.se>]
+
  *) VMS support.
     [Richard Levitte <richard@levitte.org>]

--- a/7
+++ b/7
@ -106,7 +106,7 @@ my %table=(
 # Solaris setups
 "solaris-x86-gcc","gcc:-O3 -fomit-frame-pointer -m486 -Wall -DL_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG $x86_gcc_des $x86_gcc_opts:$x86_sol_asm",
 "solaris-sparc-gcc","gcc:-O3 -fomit-frame-pointer -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8.o::",
-"solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::",
+"solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o:::asm/md5-sparcv8plus.o:",
 "debug-solaris-sparc-gcc","gcc:-O3 -g -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:::",
 "debug-solaris-usparc-gcc","gcc:-O3 -g -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::",

@ -115,12 +115,11 @@ my %table=(
 # SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8
 # -fast slows things like DES down quite a lot
 # Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
-# SC5.0 with the compiler common patch works.
 "solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
 "solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
 # SC5.0 note: Compiler common patch 107357-01 or later is required!
-"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
-"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::",
+"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:",
+"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:",

 # Sunos configs, assuming sparc for the gcc one.
 ##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
--- a/crypto/md32_common.h
+++ b/crypto/md32_common.h
@ -0,0 +1,592 @@
+/* crypto/md32_common.h */
+/* ====================================================================
+ * Copyright (c) 1999 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+/*
+ * This is a generic 32 bit "collector" for message digest algorithms.
+ * Whenever needed it collects input character stream into chunks of
+ * 32 bit values and invokes a block function that performs actual hash
+ * calculations.
+ *
+ * Porting guide.
+ *
+ * Obligatory macros:
+ *
+ * DATA_ORDER_IS_BIG_ENDIAN or DATA_ORDER_IS_LITTLE_ENDIAN
+ *	this macro defines byte order of input stream.
+ * HASH_CBLOCK
+ *	size of a unit chunk HASH_BLOCK operates on.
+ * HASH_LONG
+ *	has to be at least 32 bits wide; if it's wider, then
+ *	HASH_LONG_LOG2 *has to* be defined along with it.
+ * HASH_CTX
+ *	context structure that at least contains following
+ *	members:
+ *		typedef struct {
+ *			...
+ *			HASH_LONG	Nl,Nh;
+ *			HASH_LONG	data[HASH_LBLOCK];
+ *			int		num;
+ *			...
+ *			} HASH_CTX;
+ * HASH_UPDATE
+ *	name of "Update" function, implemented here.
+ * HASH_TRANSFORM
+ *	name of "Transform" function, implemented here.
+ * HASH_FINAL
+ *	name of "Final" function, implemented here.
+ * HASH_BLOCK_HOST_ORDER
+ *	name of "block" function treating *aligned* input message
+ *	in host byte order, implemented externally.
+ * HASH_BLOCK_DATA_ORDER
+ *	name of "block" function treating *unaligned* input message
+ *	in original (data) byte order, implemented externally (it
+ *	actually is optional if data and host are of the same
+ *	"endianness").
+ *
+ * Optional macros:
+ *
+ * B_ENDIAN or L_ENDIAN
+ *	defines host byte-order.
+ * HASH_LONG_LOG2
+ *	defaults to 2 if not stated otherwise.
+ * HASH_LBLOCK
+ *	assumed to be HASH_CBLOCK/4 if not stated otherwise.
+ * HASH_BLOCK_DATA_ORDER_ALIGNED
+ *	alternative "block" function capable of treating
+ *	aligned input message in original (data) order,
+ *	implemented externally.
+ *
+ * MD5 example:
+ *
+ *	#define DATA_ORDER_IS_LITTLE_ENDIAN
+ *
+ *	#define HASH_LONG		MD5_LONG
+ *	#define HASH_LONG_LOG2		MD5_LONG_LOG2
+ *	#define HASH_CTX		MD5_CTX
+ *	#define HASH_CBLOCK		MD5_CBLOCK
+ *	#define HASH_LBLOCK		MD5_LBLOCK
+ *	#define HASH_UPDATE		MD5_Update
+ *	#define HASH_TRANSFORM		MD5_Transform
+ *	#define HASH_FINAL		MD5_Final
+ *	#define HASH_BLOCK_HOST_ORDER	md5_block_host_order
+ *	#define HASH_BLOCK_DATA_ORDER	md5_block_data_order
+ *
+ *					<appro@fy.chalmers.se>
+ */
+
+#if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#error "DATA_ORDER must be defined!"
+#endif
+
+#ifndef HASH_CBLOCK
+#error "HASH_CBLOCK must be defined!"
+#endif
+#ifndef HASH_LONG
+#error "HASH_LONG must be defined!"
+#endif
+#ifndef HASH_CTX
+#error "HASH_CTX must be defined!"
+#endif
+
+#ifndef HASH_UPDATE
+#error "HASH_UPDATE must be defined!"
+#endif
+#ifndef HASH_TRANSFORM
+#error "HASH_TRANSFORM must be defined!"
+#endif
+#ifndef HASH_FINAL
+#error "HASH_FINAL must be defined!"
+#endif
+
+#ifndef HASH_BLOCK_HOST_ORDER
+#error "HASH_BLOCK_HOST_ORDER must be defined!"
+#endif
+
+#if 0
+/*
+ * Moved below as it's required only if HASH_BLOCK_DATA_ORDER_ALIGNED
+ * isn't defined.
+ */
+#ifndef HASH_BLOCK_DATA_ORDER
+#error "HASH_BLOCK_DATA_ORDER must be defined!"
+#endif
+#endif
+
+#ifndef HASH_LBLOCK
+#define HASH_LBLOCK	(HASH_CBLOCK/4)
+#endif
+
+#ifndef HASH_LONG_LOG2
+#define HASH_LONG_LOG2	2
+#endif
+
+/*
+ * Engage compiler specific rotate intrinsic function if available.
+ */
+#undef ROTATE
+#ifndef PEDANTIC
+# if defined(_MSC_VER)
+#  define ROTATE(a,n)     _lrotl(a,n)
+# elif defined(__GNUC__) && __GNUC__>=2
+  /*
+   * Some GNU C inline assembler templates. Note that these are
+   * rotates by *constant* number of bits! But that's exactly
+   * what we need here...
+   *
+   * 					<appro@fy.chalmers.se>
+   */
+#  if defined(__i386)
+#   define ROTATE(a,n)	({ register unsigned int ret;	\
+				asm volatile (		\
+				"roll %1,%0"		\
+				: "=r"(ret)		\
+				: "I"(n), "0"(a)	\
+				: "cc");		\
+			   ret;				\
+			})
+#  elif defined(__powerpc)
+#   define ROTATE(a,n)	({ register unsigned int ret;	\
+				asm volatile (		\
+				"rlwinm %0,%1,%2,0,31"	\
+				: "=r"(ret)		\
+				: "r"(a), "I"(n));	\
+			   ret;				\
+			})
+#  endif
+# endif
+
+/*
+ * Engage compiler specific "fetch in reverse byte order"
+ * intrinsic function if available.
+ */
+# if defined(__GNUC__) && __GNUC__>=2
+  /* some GNU C inline assembler templates by <appro@fy.chalmers.se> */
+#  if defined(__i386) && !defined(I386_ONLY)
+#   define BE_FETCH32(a)	({ register unsigned int l=(a);\
+				asm volatile (		\
+				"bswapl %0"		\
+				: "=r"(l) : "0"(l));	\
+			  l;				\
+			})
+#  elif defined(__powerpc)
+#   define LE_FETCH32(a)	({ register unsigned int l;	\
+				asm volatile (		\
+				"lwbrx %0,0,%1"		\
+				: "=r"(l)		\
+				: "r"(a));		\
+			   l;				\
+			})
+
+#  elif defined(__sparc) && defined(ULTRASPARC)
+#  define LE_FETCH32(a)	({ register unsigned int l;		\
+				asm volatile (			\
+				"lda [%1]#ASI_PRIMARY_LITTLE,%0"\
+				: "=r"(l)			\
+				: "r"(a));			\
+			   l;					\
+			})
+#  endif
+# endif
+#endif /* PEDANTIC */
+
+#if HASH_LONG_LOG2==2	/* Engage only if sizeof(HASH_LONG)== 4 */
+/* A nice byte order reversal from Wei Dai <weidai@eskimo.com> */
+#ifdef ROTATE
+/* 5 instructions with rotate instruction, else 9 */
+#define REVERSE_FETCH32(a,l)	(					\
+		l=*(const HASH_LONG *)(a),				\
+		((ROTATE(l,8)&0x00FF00FF)|(ROTATE((l&0x00FF00FF),24)))	\
+				)
+#else
+/* 6 instructions with rotate instruction, else 8 */
+#define REVERSE_FETCH32(a,l)	(				\
+		l=*(const HASH_LONG *)(a),			\
+		l=(((l>>8)&0x00FF00FF)|((l&0x00FF00FF)<<8)),	\
+		ROTATE(l,16)					\
+				)
+/*
+ * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|...
+ * It's rewritten as above for two reasons:
+ *	- RISCs aren't good at long constants and have to explicitly
+ *	  compose 'em with several (well, usually 2) instructions in a
+ *	  register before performing the actual operation and (as you
+ *	  already realized:-) having same constant should inspire the
+ *	  compiler to permanently allocate the only register for it;
+ *	- most modern CPUs have two ALUs, but usually only one has
+ *	  circuitry for shifts:-( this minor tweak inspires compiler
+ *	  to schedule shift instructions in a better way...
+ *
+ *				<appro@fy.chalmers.se>
+ */
+#endif
+#endif
+
+#ifndef ROTATE
+#define ROTATE(a,n)     (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+#endif
+
+/*
+ * Make some obvious choices. E.g., HASH_BLOCK_DATA_ORDER_ALIGNED
+ * and HASH_BLOCK_HOST_ORDER ought to be the same if input data
+ * and host are of the same "endianness". It's possible to mask
+ * this with blank #define HASH_BLOCK_DATA_ORDER though...
+ *
+ *				<appro@fy.chalmers.se>
+ */
+#if defined(B_ENDIAN)
+#  if defined(DATA_ORDER_IS_BIG_ENDIAN)
+#    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
+#      define HASH_BLOCK_DATA_ORDER_ALIGNED	HASH_BLOCK_HOST_ORDER
+#    endif
+#  elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#    ifndef HOST_FETCH32
+#      ifdef LE_FETCH32
+#        define HOST_FETCH32(p,l)	LE_FETCH32(p)
+#      elif defined(REVERSE_FETCH32)
+#        define HOST_FETCH32(p,l)	REVERSE_FETCH32(p,l)
+#      endif
+#    endif
+#  endif
+#elif defined(L_ENDIAN)
+#  if defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+#    if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2
+#      define HASH_BLOCK_DATA_ORDER_ALIGNED	HASH_BLOCK_HOST_ORDER
+#    endif
+#  elif defined(DATA_ORDER_IS_BIG_ENDIAN)
+#    ifndef HOST_FETCH32
+#      ifdef BE_FETCH32
+#        define HOST_FETCH32(p,l)	BE_FETCH32(p)
+#      elif defined(REVERSE_FETCH32)
+#        define HOST_FETCH32(p,l)	REVERSE_FETCH32(p,l)
+#      endif
+#    endif
+#  endif
+#endif
+
+#if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+#ifndef HASH_BLOCK_DATA_ORDER
+#error "HASH_BLOCK_DATA_ORDER must be defined!"
+#endif
+#endif
+
+#if defined(DATA_ORDER_IS_BIG_ENDIAN)
+
+#define HOST_c2l(c,l)	(l =(((unsigned long)(*((c)++)))<<24),		\
+			 l|=(((unsigned long)(*((c)++)))<<16),		\
+			 l|=(((unsigned long)(*((c)++)))<< 8),		\
+			 l|=(((unsigned long)(*((c)++)))    ),		\
+			 l)
+#define HOST_p_c2l(c,l,n)	{					\
+			switch (n) {					\
+			case 0: l =((unsigned long)(*((c)++)))<<24;	\
+			case 1: l|=((unsigned long)(*((c)++)))<<16;	\
+			case 2: l|=((unsigned long)(*((c)++)))<< 8;	\
+			case 3: l|=((unsigned long)(*((c)++)));		\
+				} }
+#define HOST_p_c2l_p(c,l,sc,len) {					\
+			switch (sc) {					\
+			case 0: l =((unsigned long)(*((c)++)))<<24;	\
+				if (--len == 0) break;			\
+			case 1: l|=((unsigned long)(*((c)++)))<<16;	\
+				if (--len == 0) break;			\
+			case 2: l|=((unsigned long)(*((c)++)))<< 8;	\
+				} }
+/* NOTE the pointer is not incremented at the end of this */
+#define HOST_c2l_p(c,l,n)	{					\
+			l=0; (c)+=n;					\
+			switch (n) {					\
+			case 3: l =((unsigned long)(*(--(c))))<< 8;	\
+			case 2: l|=((unsigned long)(*(--(c))))<<16;	\
+			case 1: l|=((unsigned long)(*(--(c))))<<24;	\
+				} }
+#define HOST_l2c(l,c)	(*((c)++)=(unsigned char)(((l)>>24)&0xff),	\
+			 *((c)++)=(unsigned char)(((l)>>16)&0xff),	\
+			 *((c)++)=(unsigned char)(((l)>> 8)&0xff),	\
+			 *((c)++)=(unsigned char)(((l)    )&0xff),	\
+			 l)
+
+#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+
+#define HOST_c2l(c,l)	(l =(((unsigned long)(*((c)++)))    ),		\
+			 l|=(((unsigned long)(*((c)++)))<< 8),		\
+			 l|=(((unsigned long)(*((c)++)))<<16),		\
+			 l|=(((unsigned long)(*((c)++)))<<24),		\
+			 l)
+#define HOST_p_c2l(c,l,n)	{					\
+			switch (n) {					\
+			case 0: l =((unsigned long)(*((c)++)));		\
+			case 1: l|=((unsigned long)(*((c)++)))<< 8;	\
+			case 2: l|=((unsigned long)(*((c)++)))<<16;	\
+			case 3: l|=((unsigned long)(*((c)++)))<<24;	\
+				} }
+#define HOST_p_c2l_p(c,l,sc,len) {					\
+			switch (sc) {					\
+			case 0: l =((unsigned long)(*((c)++)));		\
+				if (--len == 0) break;			\
+			case 1: l|=((unsigned long)(*((c)++)))<< 8;	\
+				if (--len == 0) break;			\
+			case 2: l|=((unsigned long)(*((c)++)))<<16;	\
+				} }
+/* NOTE the pointer is not incremented at the end of this */
+#define HOST_c2l_p(c,l,n)	{					\
+			l=0; (c)+=n;					\
+			switch (n) {					\
+			case 3: l =((unsigned long)(*(--(c))))<<16;	\
+			case 2: l|=((unsigned long)(*(--(c))))<< 8;	\
+			case 1: l|=((unsigned long)(*(--(c))));		\
+				} }
+#define HOST_l2c(l,c)	(*((c)++)=(unsigned char)(((l)    )&0xff),	\
+			 *((c)++)=(unsigned char)(((l)>> 8)&0xff),	\
+			 *((c)++)=(unsigned char)(((l)>>16)&0xff),	\
+			 *((c)++)=(unsigned char)(((l)>>24)&0xff),	\
+			 l)
+
+#endif
+
+/*
+ * Time for some action:-)
+ */
+
+void HASH_UPDATE (HASH_CTX *c, const unsigned char *data, unsigned long len)
+	{
+	register HASH_LONG * p;
+	register unsigned long l;
+	int sw,sc,ew,ec;
+
+	if (len==0) return;
+
+	l=(c->Nl+(len<<3))&0xffffffffL;
+	/* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
+	 * Wei Dai <weidai@eskimo.com> for pointing it out. */
+	if (l < c->Nl) /* overflow */
+		c->Nh++;
+	c->Nh+=(len>>29);
+	c->Nl=l;
+
+	if (c->num != 0)
+		{
+		p=c->data;
+		sw=c->num>>2;
+		sc=c->num&0x03;
+
+		if ((c->num+len) >= HASH_CBLOCK)
+			{
+			l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
+			for (; sw<HASH_LBLOCK; sw++)
+				{
+				HOST_c2l(data,l); p[sw]=l;
+				}
+			HASH_BLOCK_HOST_ORDER (c,p,1);
+			len-=(HASH_CBLOCK-c->num);
+			c->num=0;
+			/* drop through and do the rest */
+			}
+		else
+			{
+			c->num+=len;
+			if ((sc+len) < 4) /* ugly, add char's to a word */
+				{
+				l=p[sw]; HOST_p_c2l_p(data,l,sc,len); p[sw]=l;
+				}
+			else
+				{
+				ew=(c->num>>2);
+				ec=(c->num&0x03);
+				l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l;
+				for (; sw < ew; sw++)
+					{
+					HOST_c2l(data,l); p[sw]=l;
+					}
+				if (ec)
+					{
+					HOST_c2l_p(data,l,ec); p[sw]=l;
+					}
+				}
+			return;
+			}
+		}
+
+	sw=len/HASH_CBLOCK;
+	if (sw > 0)
+		{
+#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+		/*
+		 * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined
+		 * only if sizeof(HASH_LONG)==4.
+		 */
+		if ((((unsigned long)data)%4) == 0)
+			{
+			HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,sw);
+			sw*=HASH_CBLOCK;
+			data+=sw;
+			len-=sw;
+			}
+		else
+#if !defined(HASH_BLOCK_DATA_ORDER)
+			while (sw--)
+				{
+				memcpy (p=c->data,data,HASH_CBLOCK);
+				HASH_BLOCK_DATA_ORDER_ALIGNED(c,p,1);
+				data+=HASH_CBLOCK;
+				len-=HASH_CBLOCK;
+				}
+#endif
+#endif
+#if defined(HASH_BLOCK_DATA_ORDER)
+			{
+			HASH_BLOCK_DATA_ORDER (c,(HASH_LONG *)data,sw);
+			sw*=HASH_CBLOCK;
+			data+=sw;
+			len-=sw;
+			}
+#endif
+		}
+
+	if (len!=0)
+		{
+		p = c->data;
+		c->num = len;
+		ew=len>>2;	/* words to copy */
+		ec=len&0x03;
+		for (; ew; ew--,p++)
+			{
+			HOST_c2l(data,l); *p=l;
+			}
+		HOST_c2l_p(data,l,ec);
+		*p=l;
+		}
+	}
+
+
+void HASH_TRANSFORM (HASH_CTX *c, unsigned char *data)
+	{
+#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1
+	if ((((unsigned long)data)%4) == 0)
+		HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,1);
+	else
+#if !defined(HASH_BLOCK_DATA_ORDER)
+		{
+		memcpy (c->data,data,HASH_CBLOCK);
+		HASH_BLOCK_DATA_ORDER_ALIGNED (c,c->data,1);
+		}
+#endif
+#endif
+#if defined(HASH_BLOCK_DATA_ORDER)
+	HASH_BLOCK_DATA_ORDER (c,(HASH_LONG *)data,1);
+#endif
+	}
+
+
+void HASH_FINAL (unsigned char *md, HASH_CTX *c)
+	{
+	register HASH_LONG *p;
+	register unsigned long l;
+	register int i,j;
+	static const unsigned char end[4]={0x80,0x00,0x00,0x00};
+	const unsigned char *cp=end;
+
+	/* c->num should definitely have room for at least one more byte. */
+	p=c->data;
+	i=c->num>>2;
+	j=c->num&0x03;
+
+#if 0
+	/* purify often complains about the following line as an
+	 * Uninitialized Memory Read.  While this can be true, the
+	 * following p_c2l macro will reset l when that case is true.
+	 * This is because j&0x03 contains the number of 'valid' bytes
+	 * already in p[i].  If and only if j&0x03 == 0, the UMR will
+	 * occur but this is also the only time p_c2l will do
+	 * l= *(cp++) instead of l|= *(cp++)
+	 * Many thanks to Alex Tang <altitude@cic.net> for picking up this
+	 * 'potential bug' */
+#ifdef PURIFY
+	if (j==0) p[i]=0; /* Yeah, but that's not the way to fix it:-) */
+#endif
+	l=p[i];
+#else
+	l = (j==0) ? 0 : p[i];
+#endif
+	HOST_p_c2l(cp,l,j); p[i++]=l; /* i is the next 'undefined word' */
+
+	if (i>(HASH_LBLOCK-2)) /* save room for Nl and Nh */
+		{
+		if (i<HASH_LBLOCK) p[i]=0;
+		HASH_BLOCK_HOST_ORDER (c,p,1);
+		i=0;
+		}
+	for (; i<(HASH_LBLOCK-2); i++)
+		p[i]=0;
+
+#if   defined(DATA_ORDER_IS_BIG_ENDIAN)
+	p[HASH_LBLOCK-2]=c->Nh;
+	p[HASH_LBLOCK-1]=c->Nl;
+#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+	p[HASH_LBLOCK-2]=c->Nl;
+	p[HASH_LBLOCK-1]=c->Nh;
+#endif
+	HASH_BLOCK_HOST_ORDER (c,p,1);
+
+	l=c->A; HOST_l2c(l,md);
+	l=c->B; HOST_l2c(l,md);
+	l=c->C; HOST_l2c(l,md);
+	l=c->D; HOST_l2c(l,md);
+
+	c->num=0;
+	/* clear stuff, HASH_BLOCK may be leaving some stuff on the stack
+	 * but I'm not worried :-)
+	memset((void *)c,0,sizeof(HASH_CTX));
+	 */
+	}
--- a/crypto/md5/Makefile.ssl
+++ b/crypto/md5/Makefile.ssl
@ -66,6 +66,14 @@ asm/mx86bsdi.o: asm/mx86unix.cpp
 asm/mx86unix.cpp: asm/md5-586.pl
 	(cd asm; $(PERL) md5-586.pl cpp >mx86unix.cpp)

+# works for both SC and gcc
+asm/md5-sparcv8plus.o: asm/md5-sparcv9.S
+	$(CPP) -DULTRASPARC -DMD5_BLOCK_DATA_ORDER asm/md5-sparcv9.S | as -xarch=v8plus /dev/fd/0 -o asm/md5-sparcv8plus.o
+
+asm/md5-sparcv9.o: asm/md5-sparcv9.S
+	$(CC) -xarch=v9 -DULTRASPARC -DMD5_BLOCK_DATA_ORDER -c asm/md5-sparcv9.S -o asm/md5-sparcv9.o
+
+
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO

@ -103,5 +111,5 @@ clean:
 # DO NOT DELETE THIS LINE -- make depend depends on it.

 md5_dgst.o: ../../include/openssl/md5.h ../../include/openssl/opensslv.h
-md5_dgst.o: md5_locl.h
+md5_dgst.o: ../md32_common.h md5_locl.h
 md5_one.o: ../../include/openssl/md5.h md5_locl.h
--- a/crypto/md5/asm/md5-586.pl
+++ b/crypto/md5/asm/md5-586.pl
@ -29,7 +29,7 @@ $X="esi";
 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9,	# R3
 );

-&md5_block("md5_block_x86");
+&md5_block("md5_block_asm_host_order");
 &asm_finish();

 sub Np
@ -183,6 +183,7 @@ sub md5_block
 	 &mov($X,	&wparam(1)); # esi
 	&mov($C,	&wparam(2));
 	 &push("ebp");
+	&shl($C,	6);
 	&push("ebx");
 	 &add($C,	$X); # offset we end at
 	&sub($C,	64);
--- a/crypto/md5/asm/md5-sparcv9.S
+++ b/crypto/md5/asm/md5-sparcv9.S
--- a/crypto/md5/md5.h
+++ b/crypto/md5/md5.h
@ -67,23 +67,43 @@ extern "C" {
 #error MD5 is disabled.
 #endif

+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * ! MD5_LONG has to be at least 32 bits wide. If it's wider, then !
+ * ! MD5_LONG_LOG2 has to be defined along.			   !
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#if defined(WIN16) || defined(__LP32__)
+#define MD5_LONG unsigned long
+#elif defined(_CRAY) || defined(__ILP64__)
+#define MD5_LONG unsigned long
+#define MD5_LONG_LOG2 3
+/*
+ * _CRAY note. I could declare short, but I have no idea what impact
+ * it has on performance on non-T3E machines. I could declare
+ * int, but at least on C90 sizeof(int) can be chosen at compile time.
+ * So I've chosen long...
+ *					<appro@fy.chalmers.se>
+ */
+#else
+#define MD5_LONG unsigned int
+#endif
+
 #define MD5_CBLOCK	64
-#define MD5_LBLOCK	16
-#define MD5_BLOCK	16
-#define MD5_LAST_BLOCK  56
-#define MD5_LENGTH_BLOCK 8
+#define MD5_LBLOCK	(MD5_CBLOCK/4)
 #define MD5_DIGEST_LENGTH 16

 typedef struct MD5state_st
 	{
-	unsigned long A,B,C,D;
-	unsigned long Nl,Nh;
-	unsigned long data[MD5_LBLOCK];
+	MD5_LONG A,B,C,D;
+	MD5_LONG Nl,Nh;
+	MD5_LONG data[MD5_LBLOCK];
 	int num;
 	} MD5_CTX;

 void MD5_Init(MD5_CTX *c);
-void MD5_Update(MD5_CTX *c, const void *data, unsigned long len);
+void MD5_Update(MD5_CTX *c, const unsigned char *data, unsigned long len);
 void MD5_Final(unsigned char *md, MD5_CTX *c);
 unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md);
 void MD5_Transform(MD5_CTX *c, unsigned char *b);
--- a/crypto/md5/md5_dgst.c
+++ b/crypto/md5/md5_dgst.c
@ -70,12 +70,6 @@ char *MD5_version="MD5" OPENSSL_VERSION_PTEXT;
 #define INIT_DATA_C (unsigned long)0x98badcfeL
 #define INIT_DATA_D (unsigned long)0x10325476L

-#  ifdef MD5_ASM
-     void md5_block_x86(MD5_CTX *c, unsigned long *p,int num);
-#    define md5_block md5_block_x86
-#  else
-     static void md5_block(MD5_CTX *c, unsigned long *p,int num);
-#  endif
 void MD5_Init(MD5_CTX *c)
 	{
 	c->A=INIT_DATA_A;
@ -87,183 +81,31 @@ void MD5_Init(MD5_CTX *c)
 	c->num=0;
 	}

-void MD5_Update(MD5_CTX *c, const void *_data, unsigned long len)
+#ifndef md5_block_host_order
+void md5_block_host_order (MD5_CTX *c, const MD5_LONG *X, int num)
 	{
-	register const unsigned char *data=_data;
-	register ULONG *p;
-	int sw,sc;
-	ULONG l;
-
-	if (len == 0) return;
-
-	l=(c->Nl+(len<<3))&0xffffffffL;
-	/* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
-	 * Wei Dai <weidai@eskimo.com> for pointing it out. */
-	if (l < c->Nl) /* overflow */
-		c->Nh++;
-	c->Nh+=(len>>29);
-	c->Nl=l;
-
-	if (c->num != 0)
-		{
-		p=c->data;
-		sw=c->num>>2;
-		sc=c->num&0x03;
-
-		if ((c->num+len) >= MD5_CBLOCK)
-			{
-			l= p[sw];
-			p_c2l(data,l,sc);
-			p[sw++]=l;
-			for (; sw<MD5_LBLOCK; sw++)
-				{
-				c2l(data,l);
-				p[sw]=l;
-				}
-			len-=(MD5_CBLOCK-c->num);
-
-			md5_block(c,p,64);
-			c->num=0;
-			/* drop through and do the rest */
-			}
-		else
-			{
-			int ew,ec;
-
-			c->num+=(int)len;
-			if ((sc+len) < 4) /* ugly, add char's to a word */
-				{
-				l= p[sw];
-				p_c2l_p(data,l,sc,len);
-				p[sw]=l;
-				}
-			else
-				{
-				ew=(c->num>>2);
-				ec=(c->num&0x03);
-				l= p[sw];
-				p_c2l(data,l,sc);
-				p[sw++]=l;
-				for (; sw < ew; sw++)
-					{ c2l(data,l); p[sw]=l; }
-				if (ec)
-					{
-					c2l_p(data,l,ec);
-					p[sw]=l;
-					}
-				}
-			return;
-			}
-		}
-	/* we now can process the input data in blocks of MD5_CBLOCK
-	 * chars and save the leftovers to c->data. */
-#ifdef L_ENDIAN
-	if ((((unsigned long)data)%sizeof(ULONG)) == 0)
-		{
-		sw=(int)len/MD5_CBLOCK;
-		if (sw > 0)
-			{
-			sw*=MD5_CBLOCK;
-			md5_block(c,(ULONG *)data,sw);
-			data+=sw;
-			len-=sw;
-			}
-		}
-#endif
-	p=c->data;
-	while (len >= MD5_CBLOCK)
-		{
-#if defined(L_ENDIAN) || defined(B_ENDIAN)
-		if (p != (unsigned long *)data)
-			memcpy(p,data,MD5_CBLOCK);
-		data+=MD5_CBLOCK;
-#ifdef B_ENDIAN
-		for (sw=(MD5_LBLOCK/4); sw; sw--)
-			{
-			Endian_Reverse32(p[0]);
-			Endian_Reverse32(p[1]);
-			Endian_Reverse32(p[2]);
-			Endian_Reverse32(p[3]);
-			p+=4;
-			}
-#endif
-#else
-		for (sw=(MD5_LBLOCK/4); sw; sw--)
-			{
-			c2l(data,l); *(p++)=l;
-			c2l(data,l); *(p++)=l;
-			c2l(data,l); *(p++)=l;
-			c2l(data,l); *(p++)=l; 
-			} 
-#endif
-		p=c->data;
-		md5_block(c,p,64);
-		len-=MD5_CBLOCK;
-		}
-	sc=(int)len;
-	c->num=sc;
-	if (sc)
-		{
-		sw=sc>>2;	/* words to copy */
-#ifdef L_ENDIAN
-		p[sw]=0;
-		memcpy(p,data,sc);
-#else
-		sc&=0x03;
-		for ( ; sw; sw--)
-			{ c2l(data,l); *(p++)=l; }
-		c2l_p(data,l,sc);
-		*p=l;
-#endif
-		}
-	}
-
-void MD5_Transform(MD5_CTX *c, unsigned char *b)
-	{
-	ULONG p[16];
-#if !defined(L_ENDIAN)
-	ULONG *q;
-	int i;
-#endif
-
-#if defined(B_ENDIAN) || defined(L_ENDIAN)
-	memcpy(p,b,64);
-#ifdef B_ENDIAN
-	q=p;
-	for (i=(MD5_LBLOCK/4); i; i--)
-		{
-		Endian_Reverse32(q[0]);
-		Endian_Reverse32(q[1]);
-		Endian_Reverse32(q[2]);
-		Endian_Reverse32(q[3]);
-		q+=4;
-		}
-#endif
-#else
-	q=p;
-	for (i=(MD5_LBLOCK/4); i; i--)
-		{
-		ULONG l;
-		c2l(b,l); *(q++)=l;
-		c2l(b,l); *(q++)=l;
-		c2l(b,l); *(q++)=l;
-		c2l(b,l); *(q++)=l; 
-		} 
-#endif
-	md5_block(c,p,64);
-	}
-
-#ifndef MD5_ASM
-
-static void md5_block(MD5_CTX *c, register ULONG *X, int num)
-	{
-	register ULONG A,B,C,D;
+	register unsigned long A,B,C,D;
+	/*
+	 * In case you wonder why A-D are declared as long and not
+	 * as MD5_LONG. Doing so results in slight performance
+	 * boost on LP64 architectures. The catch is we don't
+	 * really care if 32 MSBs of a 64-bit register get polluted
+	 * with eventual overflows as we *save* only 32 LSBs in
+	 * *either* case. Now declaring 'em long excuses the compiler
+	 * from keeping 32 MSBs zeroed resulting in 13% performance
+	 * improvement under SPARC Solaris7/64 and 5% under AlphaLinux.
+	 * Well, to be honest it should say that this *prevents* 
+	 * performance degradation.
+	 *
+	 *				<appro@fy.chalmers.se>
+	 */

 	A=c->A;
 	B=c->B;
 	C=c->C;
 	D=c->D;
-	for (;;)
+
+	for (;num--;X+=HASH_LBLOCK)
 		{
 	/* Round 0 */
 	R0(A,B,C,D,X[ 0], 7,0xd76aa478L);
@ -334,74 +176,127 @@ static void md5_block(MD5_CTX *c, register ULONG *X, int num)
 	R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
 	R3(B,C,D,A,X[ 9],21,0xeb86d391L);

-	A+=c->A&0xffffffffL;
-	B+=c->B&0xffffffffL;
-	c->A=A;
-	c->B=B;
-	C+=c->C&0xffffffffL;
-	D+=c->D&0xffffffffL;
-	c->C=C;
-	c->D=D;
-	X+=16;
-	num-=64;
-	if (num <= 0) break;
+	A = c->A += A;
+	B = c->B += B;
+	C = c->C += C;
+	D = c->D += D;
 		}
 	}
 #endif

-void MD5_Final(unsigned char *md, MD5_CTX *c)
+#ifndef md5_block_data_order
+void md5_block_data_order (MD5_CTX *c, const unsigned char *data, int num)
 	{
-	register int i,j;
-	register ULONG l;
-	register ULONG *p;
-	static unsigned char end[4]={0x80,0x00,0x00,0x00};
-	unsigned char *cp=end;
+	register unsigned long A,B,C,D,l;
+	/*
+	 * In case you wonder why A-D are declared as long and not
+	 * as MD5_LONG. Doing so results in slight performance
+	 * boost on LP64 architectures. The catch is we don't
+	 * really care if 32 MSBs of a 64-bit register get polluted
+	 * with eventual overflows as we *save* only 32 LSBs in
+	 * *either* case. Now declaring 'em long excuses the compiler
+	 * from keeping 32 MSBs zeroed resulting in 13% performance
+	 * improvement under SPARC Solaris7/64 and 5% under AlphaLinux.
+	 * Well, to be honest it should say that this *prevents* 
+	 * performance degradation.
+	 *
+	 *				<appro@fy.chalmers.se>
+	 */
+	MD5_LONG X[MD5_LBLOCK];
+	/*
+	 * In case you wonder why don't I use c->data for this.
+	 * RISCs usually have a handful of registers and if X is
+	 * declared as automatic array good optimizing compiler
+	 * shall accommodate at least part of it in register bank
+	 * instead of memory.
+	 *
+	 *				<appro@fy.chalmers.se>
+	 */

-	/* c->num should definitly have room for at least one more byte. */
-	p=c->data;
-	j=c->num;
-	i=j>>2;
+	A=c->A;
+	B=c->B;
+	C=c->C;
+	D=c->D;

-	/* purify often complains about the following line as an
-	 * Uninitialized Memory Read.  While this can be true, the
-	 * following p_c2l macro will reset l when that case is true.
-	 * This is because j&0x03 contains the number of 'valid' bytes
-	 * already in p[i].  If and only if j&0x03 == 0, the UMR will
-	 * occur but this is also the only time p_c2l will do
-	 * l= *(cp++) instead of l|= *(cp++)
-	 * Many thanks to Alex Tang <altitude@cic.net> for pickup this
-	 * 'potential bug' */
-#ifdef PURIFY
-	if ((j&0x03) == 0) p[i]=0;
-#endif
-	l=p[i];
-	p_c2l(cp,l,j&0x03);
-	p[i]=l;
-	i++;
-	/* i is the next 'undefined word' */
-	if (c->num >= MD5_LAST_BLOCK)
+	for (;num--;)
 		{
-		for (; i<MD5_LBLOCK; i++)
-			p[i]=0;
-		md5_block(c,p,64);
-		i=0;
-		}
-	for (; i<(MD5_LBLOCK-2); i++)
-		p[i]=0;
-	p[MD5_LBLOCK-2]=c->Nl;
-	p[MD5_LBLOCK-1]=c->Nh;
-	md5_block(c,p,64);
-	cp=md;
-	l=c->A; l2c(l,cp);
-	l=c->B; l2c(l,cp);
-	l=c->C; l2c(l,cp);
-	l=c->D; l2c(l,cp);
+	HOST_c2l(data,l); X[ 0]=l;		HOST_c2l(data,l); X[ 1]=l;
+	/* Round 0 */
+	R0(A,B,C,D,X[ 0], 7,0xd76aa478L);	HOST_c2l(data,l); X[ 2]=l;
+	R0(D,A,B,C,X[ 1],12,0xe8c7b756L);	HOST_c2l(data,l); X[ 3]=l;
+	R0(C,D,A,B,X[ 2],17,0x242070dbL);	HOST_c2l(data,l); X[ 4]=l;
+	R0(B,C,D,A,X[ 3],22,0xc1bdceeeL);	HOST_c2l(data,l); X[ 5]=l;
+	R0(A,B,C,D,X[ 4], 7,0xf57c0fafL);	HOST_c2l(data,l); X[ 6]=l;
+	R0(D,A,B,C,X[ 5],12,0x4787c62aL);	HOST_c2l(data,l); X[ 7]=l;
+	R0(C,D,A,B,X[ 6],17,0xa8304613L);	HOST_c2l(data,l); X[ 8]=l;
+	R0(B,C,D,A,X[ 7],22,0xfd469501L);	HOST_c2l(data,l); X[ 9]=l;
+	R0(A,B,C,D,X[ 8], 7,0x698098d8L);	HOST_c2l(data,l); X[10]=l;
+	R0(D,A,B,C,X[ 9],12,0x8b44f7afL);	HOST_c2l(data,l); X[11]=l;
+	R0(C,D,A,B,X[10],17,0xffff5bb1L);	HOST_c2l(data,l); X[12]=l;
+	R0(B,C,D,A,X[11],22,0x895cd7beL);	HOST_c2l(data,l); X[13]=l;
+	R0(A,B,C,D,X[12], 7,0x6b901122L);	HOST_c2l(data,l); X[14]=l;
+	R0(D,A,B,C,X[13],12,0xfd987193L);	HOST_c2l(data,l); X[15]=l;
+	R0(C,D,A,B,X[14],17,0xa679438eL);
+	R0(B,C,D,A,X[15],22,0x49b40821L);
+	/* Round 1 */
+	R1(A,B,C,D,X[ 1], 5,0xf61e2562L);
+	R1(D,A,B,C,X[ 6], 9,0xc040b340L);
+	R1(C,D,A,B,X[11],14,0x265e5a51L);
+	R1(B,C,D,A,X[ 0],20,0xe9b6c7aaL);
+	R1(A,B,C,D,X[ 5], 5,0xd62f105dL);
+	R1(D,A,B,C,X[10], 9,0x02441453L);
+	R1(C,D,A,B,X[15],14,0xd8a1e681L);
+	R1(B,C,D,A,X[ 4],20,0xe7d3fbc8L);
+	R1(A,B,C,D,X[ 9], 5,0x21e1cde6L);
+	R1(D,A,B,C,X[14], 9,0xc33707d6L);
+	R1(C,D,A,B,X[ 3],14,0xf4d50d87L);
+	R1(B,C,D,A,X[ 8],20,0x455a14edL);
+	R1(A,B,C,D,X[13], 5,0xa9e3e905L);
+	R1(D,A,B,C,X[ 2], 9,0xfcefa3f8L);
+	R1(C,D,A,B,X[ 7],14,0x676f02d9L);
+	R1(B,C,D,A,X[12],20,0x8d2a4c8aL);
+	/* Round 2 */
+	R2(A,B,C,D,X[ 5], 4,0xfffa3942L);
+	R2(D,A,B,C,X[ 8],11,0x8771f681L);
+	R2(C,D,A,B,X[11],16,0x6d9d6122L);
+	R2(B,C,D,A,X[14],23,0xfde5380cL);
+	R2(A,B,C,D,X[ 1], 4,0xa4beea44L);
+	R2(D,A,B,C,X[ 4],11,0x4bdecfa9L);
+	R2(C,D,A,B,X[ 7],16,0xf6bb4b60L);
+	R2(B,C,D,A,X[10],23,0xbebfbc70L);
+	R2(A,B,C,D,X[13], 4,0x289b7ec6L);
+	R2(D,A,B,C,X[ 0],11,0xeaa127faL);
+	R2(C,D,A,B,X[ 3],16,0xd4ef3085L);
+	R2(B,C,D,A,X[ 6],23,0x04881d05L);
+	R2(A,B,C,D,X[ 9], 4,0xd9d4d039L);
+	R2(D,A,B,C,X[12],11,0xe6db99e5L);
+	R2(C,D,A,B,X[15],16,0x1fa27cf8L);
+	R2(B,C,D,A,X[ 2],23,0xc4ac5665L);
+	/* Round 3 */
+	R3(A,B,C,D,X[ 0], 6,0xf4292244L);
+	R3(D,A,B,C,X[ 7],10,0x432aff97L);
+	R3(C,D,A,B,X[14],15,0xab9423a7L);
+	R3(B,C,D,A,X[ 5],21,0xfc93a039L);
+	R3(A,B,C,D,X[12], 6,0x655b59c3L);
+	R3(D,A,B,C,X[ 3],10,0x8f0ccc92L);
+	R3(C,D,A,B,X[10],15,0xffeff47dL);
+	R3(B,C,D,A,X[ 1],21,0x85845dd1L);
+	R3(A,B,C,D,X[ 8], 6,0x6fa87e4fL);
+	R3(D,A,B,C,X[15],10,0xfe2ce6e0L);
+	R3(C,D,A,B,X[ 6],15,0xa3014314L);
+	R3(B,C,D,A,X[13],21,0x4e0811a1L);
+	R3(A,B,C,D,X[ 4], 6,0xf7537e82L);
+	R3(D,A,B,C,X[11],10,0xbd3af235L);
+	R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL);
+	R3(B,C,D,A,X[ 9],21,0xeb86d391L);

-	/* clear stuff, md5_block may be leaving some stuff on the stack
-	 * but I'm not worried :-) */
-	c->num=0;
-/*	memset((char *)&c,0,sizeof(c));*/
+	A = c->A += A;
+	B = c->B += B;
+	C = c->C += C;
+	D = c->D += D;
+		}
 	}
+#endif

 #ifdef undef
 int printit(unsigned long *l)
--- a/crypto/md5/md5_locl.h
+++ b/crypto/md5/md5_locl.h
@ -56,98 +56,79 @@
 * [including the GNU Public Licence.]
 */

-/* On sparc, this actually slows things down :-( */
-#if defined(sun)
-#undef B_ENDIAN
-#endif
-
 #include <stdlib.h>
 #include <string.h>
 #include <openssl/md5.h>

-#define ULONG	unsigned long
-#define UCHAR	unsigned char
-#define UINT	unsigned int
-
-#undef c2l
-#define c2l(c,l)	(l = ((unsigned long)(*((c)++)))     , \
-			 l|=(((unsigned long)(*((c)++)))<< 8), \
-			 l|=(((unsigned long)(*((c)++)))<<16), \
-			 l|=(((unsigned long)(*((c)++)))<<24))
-
-#undef p_c2l
-#define p_c2l(c,l,n)	{ \
-			switch (n) { \
-			case 0: l =((unsigned long)(*((c)++))); \
-			case 1: l|=((unsigned long)(*((c)++)))<< 8; \
-			case 2: l|=((unsigned long)(*((c)++)))<<16; \
-			case 3: l|=((unsigned long)(*((c)++)))<<24; \
-				} \
-			}
-
-/* NOTE the pointer is not incremented at the end of this */
-#undef c2l_p
-#define c2l_p(c,l,n)	{ \
-			l=0; \
-			(c)+=n; \
-			switch (n) { \
-			case 3: l =((unsigned long)(*(--(c))))<<16; \
-			case 2: l|=((unsigned long)(*(--(c))))<< 8; \
-			case 1: l|=((unsigned long)(*(--(c))))    ; \
-				} \
-			}
-
-#undef p_c2l_p
-#define p_c2l_p(c,l,sc,len) { \
-			switch (sc) \
-				{ \
-			case 0: l =((unsigned long)(*((c)++))); \
-				if (--len == 0) break; \
-			case 1: l|=((unsigned long)(*((c)++)))<< 8; \
-				if (--len == 0) break; \
-			case 2: l|=((unsigned long)(*((c)++)))<<16; \
-				} \
-			}
-
-#undef l2c
-#define l2c(l,c)	(*((c)++)=(unsigned char)(((l)    )&0xff), \
-			 *((c)++)=(unsigned char)(((l)>> 8)&0xff), \
-			 *((c)++)=(unsigned char)(((l)>>16)&0xff), \
-			 *((c)++)=(unsigned char)(((l)>>24)&0xff))
-
-/* NOTE - c is not incremented as per l2c */
-#undef l2cn
-#define l2cn(l1,l2,c,n)	{ \
-			c+=n; \
-			switch (n) { \
-			case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
-			case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
-			case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
-			case 5: *(--(c))=(unsigned char)(((l2)    )&0xff); \
-			case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
-			case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
-			case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
-			case 1: *(--(c))=(unsigned char)(((l1)    )&0xff); \
-				} \
-			}
-
-/* A nice byte order reversal from Wei Dai <weidai@eskimo.com> */
-#if defined(WIN32)
-/* 5 instructions with rotate instruction, else 9 */
-#define Endian_Reverse32(a) \
-	{ \
-	unsigned long l=(a); \
-	(a)=((ROTATE(l,8)&0x00FF00FF)|(ROTATE(l,24)&0xFF00FF00)); \
-	}
-#else
-/* 6 instructions with rotate instruction, else 8 */
-#define Endian_Reverse32(a) \
-	{ \
-	unsigned long l=(a); \
-	l=(((l&0xFF00FF00)>>8L)|((l&0x00FF00FF)<<8L)); \
-	(a)=ROTATE(l,16L); \
-	}
+#ifndef MD5_LONG_LOG2
+#define MD5_LONG_LOG2 2 /* default to 32 bits */
 #endif
+
+#ifdef MD5_ASM
+# if defined(__i386) || defined(WIN32)
+#  define md5_block_host_order md5_block_asm_host_order
+# elif defined(__sparc) && defined(ULTRASPARC)
+   void md5_block_asm_data_order_aligned (MD5_CTX *c, const MD5_LONG *p,int num);
+#  define HASH_BLOCK_DATA_ORDER_ALIGNED md5_block_asm_data_order_aligned
+# endif
+#endif
+
+void md5_block_host_order (MD5_CTX *c, const MD5_LONG *p,int num);
+void md5_block_data_order (MD5_CTX *c, const unsigned char *p,int num);
+
+#if defined(__i386)
+/*
+ * *_block_host_order is expected to handle aligned data while
+ * *_block_data_order - unaligned. As algorithm and host (x86)
+ * are in this case of the same "endianness" these two are
+ * otherwise indistinguishable. But normally you don't want to
+ * call the same function because unaligned access in places
+ * where alignment is expected is usually a "Bad Thing". Indeed,
+ * on RISCs you get punished with BUS ERROR signal or *severe*
+ * performance degradation. Intel CPUs are in turn perfectly
+ * capable of loading unaligned data without such drastic side
+ * effect. Yes, they say it's slower than aligned load, but no
+ * exception is generated and therefore performance degradation
+ * is *incomparable* with RISCs. What we should weigh here is
+ * the cost of unaligned access against the cost of aligning data.
+ * According to my measurements allowing unaligned access results
+ * in ~9% performance improvement on Pentium II operating at
+ * 266MHz. I won't be surprised if the difference will be higher
+ * on faster systems:-)
+ *
+ *				<appro@fy.chalmers.se>
+ */
+#define md5_block_data_order	md5_block_host_order
+#endif
+
+#define DATA_ORDER_IS_LITTLE_ENDIAN
+
+#define HASH_LONG		MD5_LONG
+#define HASH_LONG_LOG2		MD5_LONG_LOG2
+#define HASH_CTX		MD5_CTX
+#define HASH_CBLOCK		MD5_CBLOCK
+#define HASH_LBLOCK		MD5_LBLOCK
+#define HASH_UPDATE		MD5_Update
+#define HASH_TRANSFORM		MD5_Transform
+#define HASH_FINAL		MD5_Final
+#define HASH_BLOCK_HOST_ORDER	md5_block_host_order
+#if defined(B_ENDIAN) || defined(md5_block_data_order)
+#define	HASH_BLOCK_DATA_ORDER	md5_block_data_order
+/*
+ * Little-endians (Intel and Alpha) feel better without this.
+ * It looks like memcpy does a better job than the generic
+ * md5_block_data_order at copying-n-aligning input data.
+ * But frankly speaking I didn't expect such a result on Alpha.
+ * On the other hand I've got this with egcs-1.0.2 and if the
+ * program is compiled with another (better?) compiler it
+ * might turn out the other way around.
+ *
+ *				<appro@fy.chalmers.se>
+ */
+#endif
+
+#include "../md32_common.h"
+
 /*
 #define	F(x,y,z)	(((x) & (y))  |  ((~(x)) & (z)))
 #define	G(x,y,z)	(((x) & (z))  |  ((y) & (~(z))))
@ -162,14 +143,6 @@
 #define	H(b,c,d)	((b) ^ (c) ^ (d))
 #define	I(b,c,d)	(((~(d)) | (b)) ^ (c))

-#undef ROTATE
-#if defined(WIN32)
-#define ROTATE(a,n)     _lrotl(a,n)
-#else
-#define ROTATE(a,n)     (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
-#endif
-
-
 #define R0(a,b,c,d,k,s,t) { \
 	a+=((k)+(t)+F((b),(c),(d))); \
 	a=ROTATE(a,s); \
--- a/crypto/md5/md5_one.c
+++ b/crypto/md5/md5_one.c
@ -57,7 +57,8 @@
 */

 #include <stdio.h>
-#include "md5_locl.h"
+#include <string.h>
+#include <openssl/md5.h>

 unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md)
 	{