Support for "multiply high" instruction, see BN_UMULT_HIGH comment in

crypto/bn/bn_lcl.h for further details. It should be noted that for
the moment of this writing the code was tested only on Alpha. If
compiled with DEC C the C implementation exhibits 12% performance
improvement over the crypto/bn/asm/alpha.s (on EV56 box running
AlphaLinux). GNU C is (unfortunately) 8% behind the assembler
implementation. But it's OpenVMS Alpha users who *may* benefit most
as 'apps/openssl speed rsa' exhibits 6 (six) times performance
improvement over the original VMS bignum implementation. Where "*may*"
means "as soon as code is enabled though #define SIXTY_FOUR_BIT and
crypto/bn/asm/vms.mar is skipped."
This commit is contained in:
Andy Polyakov 2000-02-02 16:18:12 +00:00
parent 54a34aecc3
commit fb81ac5e6b
3 changed files with 151 additions and 37 deletions

View file

@ -60,7 +60,7 @@
#include "cryptlib.h"
#include "bn_lcl.h"
#ifdef BN_LLONG
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
{
@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
bn_check_num(num);
if (num <= 0) return(c1);
for (;;)
while (num&~3)
{
mul_add(rp[0],ap[0],w,c1);
if (--num == 0) break;
mul_add(rp[1],ap[1],w,c1);
if (--num == 0) break;
mul_add(rp[2],ap[2],w,c1);
if (--num == 0) break;
mul_add(rp[3],ap[3],w,c1);
if (--num == 0) break;
ap+=4;
rp+=4;
ap+=4; rp+=4; num-=4;
}
if (num)
{
mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
mul_add(rp[2],ap[2],w,c1); return c1;
}
return(c1);
@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
bn_check_num(num);
if (num <= 0) return(c1);
/* for (;;) */
while (1) /* circumvent egcs-1.1.2 bug */
while (num&~3)
{
mul(rp[0],ap[0],w,c1);
if (--num == 0) break;
mul(rp[1],ap[1],w,c1);
if (--num == 0) break;
mul(rp[2],ap[2],w,c1);
if (--num == 0) break;
mul(rp[3],ap[3],w,c1);
if (--num == 0) break;
ap+=4;
rp+=4;
ap+=4; rp+=4; num-=4;
}
if (num)
{
mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
mul(rp[2],ap[2],w,c1);
}
return(c1);
}
@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
{
bn_check_num(n);
if (n <= 0) return;
for (;;)
while (n&~3)
{
BN_ULLONG t;
t=(BN_ULLONG)(a[0])*(a[0]);
r[0]=Lw(t); r[1]=Hw(t);
if (--n == 0) break;
t=(BN_ULLONG)(a[1])*(a[1]);
r[2]=Lw(t); r[3]=Hw(t);
if (--n == 0) break;
t=(BN_ULLONG)(a[2])*(a[2]);
r[4]=Lw(t); r[5]=Hw(t);
if (--n == 0) break;
t=(BN_ULLONG)(a[3])*(a[3]);
r[6]=Lw(t); r[7]=Hw(t);
if (--n == 0) break;
a+=4;
r+=8;
sqr(r[0],r[1],a[0]);
sqr(r[2],r[3],a[1]);
sqr(r[4],r[5],a[2]);
sqr(r[6],r[7],a[3]);
a+=4; r+=8; n-=4;
}
if (n)
{
sqr(r[0],r[1],a[0]); if (--n == 0) return;
sqr(r[2],r[3],a[1]); if (--n == 0) return;
sqr(r[4],r[5],a[2]);
}
}
@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
#define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#elif defined(BN_UMULT_HIGH)
#define mul_add_c(a,b,c0,c1,c2) { \
BN_ULONG ta=(a),tb=(b); \
t1 = ta * tb; \
t2 = BN_UMULT_HIGH(ta,tb); \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#define mul_add_c2(a,b,c0,c1,c2) { \
BN_ULONG ta=(a),tb=(b),t0; \
t1 = BN_UMULT_HIGH(ta,tb); \
t0 = ta * tb; \
t2 = t1+t1; c2 += (t2<t1)?1:0; \
t1 = t0+t0; t2 += (t1<t0)?1:0; \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#define sqr_add_c(a,i,c0,c1,c2) { \
BN_ULONG ta=(a)[i]; \
t1 = ta * ta; \
t2 = BN_UMULT_HIGH(ta,ta); \
c0 += t1; t2 += (c0<t1)?1:0; \
c1 += t2; c2 += (c1<t2)?1:0; \
}
#define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
#else
#define mul_add_c(a,b,c0,c1,c2) \
t1=LBITS(a); t2=HBITS(a); \

View file

@ -280,9 +280,14 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
*/
rem=(n1-q*d0)&BN_MASK2;
#endif
#ifdef BN_UMULT_HIGH
t2l = d1 * q;
t2h = BN_UMULT_HIGH(d1,q);
#else
t2l=LBITS(d1); t2h=HBITS(d1);
ql =LBITS(q); qh =HBITS(q);
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
#endif
for (;;)
{

View file

@ -86,6 +86,54 @@ extern "C" {
#endif
#endif
#if !defined(NO_ASM) && !defined(PEDANTIC)
/*
* BN_UMULT_HIGH section.
*
* No, I'm not trying to overwhelm you when stating that the
* product of N-bit numbers is 2*N bits wide:-) No, I don't expect
* you to be impressed when I say that if the compiler doesn't
* support 2*N integer type, then you have to replace every N*N
* multiplication with 4 (N/2)*(N/2) accompanied by some shifts
* and additions which unavoidably results in severe performance
* penalties. Of course provided that the hardware is capable of
* producing 2*N result... That's when you normally start
* considering assembler implementation. However! It should be
* pointed out that some CPUs (most notably Alpha, PowerPC and
* upcoming IA-64 family:-) provide *separate* instruction
* calculating the upper half of the product placing the result
* into a general purpose register. Now *if* the compiler supports
* inline assembler, then it's not impossible to implement the
* "bignum" routines (and have the compiler optimize 'em)
* exhibiting "native" performance in C. That's what BN_UMULT_HIGH
* macro is about:-)
*
* <appro@fy.chalmers.se>
*/
# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
# if defined(__DECC)
# include <c_asm.h>
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
# elif defined(__GNUC__)
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("umulh %1,%2,%0" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
# if defined(__GNUC__)
# define BN_UMULT_HIGH(a,b) ({ \
register BN_ULONG ret; \
asm ("mulhdu %0,%1,%2" \
: "=r"(ret) \
: "r"(a), "r"(b)); \
ret; })
# endif /* compiler */
# endif /* cpu */
#endif /* NO_ASM */
/*************************************************************
* Using the long long type
*/
@ -151,6 +199,43 @@ extern "C" {
(c)= Hw(t); \
}
#define sqr(r0,r1,a) { \
BN_ULLONG t; \
t=(BN_ULLONG)(a)*(a); \
(r0)=Lw(t); \
(r1)=Hw(t); ]
}
#elif defined(BN_UMULT_HIGH)
#define mul_add(r,a,w,c) { \
BN_ULONG high,low,ret,tmp=(a); \
ret = (r); \
high= BN_UMULT_HIGH(w,tmp); \
ret += (c); \
low = (w) * tmp; \
(c) = (ret<(c))?1:0; \
(c) += high; \
ret += low; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
#define mul(r,a,w,c) { \
BN_ULONG high,low,ret,ta=(a); \
low = (w) * ta; \
high= BN_UMULT_HIGH(w,ta); \
ret = low + (c); \
(c) = high; \
(c) += (ret<low)?1:0; \
(r) = ret; \
}
#define sqr(r0,r1,a) { \
BN_ULONG tmp=(a); \
(r0) = tmp * tmp; \
(r1) = BN_UMULT_HIGH(tmp,tmp); \
}
#else
/*************************************************************
* No long long type