Support for "multiply high" instruction, see BN_UMULT_HIGH comment in
crypto/bn/bn_lcl.h for further details. It should be noted that for the moment of this writing the code was tested only on Alpha. If compiled with DEC C the C implementation exhibits 12% performance improvement over the crypto/bn/asm/alpha.s (on EV56 box running AlphaLinux). GNU C is (unfortunately) 8% behind the assembler implementation. But it's OpenVMS Alpha users who *may* benefit most as 'apps/openssl speed rsa' exhibits 6 (six) times performance improvement over the original VMS bignum implementation. Where "*may*" means "as soon as code is enabled though #define SIXTY_FOUR_BIT and crypto/bn/asm/vms.mar is skipped."
This commit is contained in:
parent
54a34aecc3
commit
fb81ac5e6b
3 changed files with 151 additions and 37 deletions
|
@ -60,7 +60,7 @@
|
|||
#include "cryptlib.h"
|
||||
#include "bn_lcl.h"
|
||||
|
||||
#ifdef BN_LLONG
|
||||
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
|
||||
|
||||
BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
||||
{
|
||||
|
@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
|||
bn_check_num(num);
|
||||
if (num <= 0) return(c1);
|
||||
|
||||
for (;;)
|
||||
while (num&~3)
|
||||
{
|
||||
mul_add(rp[0],ap[0],w,c1);
|
||||
if (--num == 0) break;
|
||||
mul_add(rp[1],ap[1],w,c1);
|
||||
if (--num == 0) break;
|
||||
mul_add(rp[2],ap[2],w,c1);
|
||||
if (--num == 0) break;
|
||||
mul_add(rp[3],ap[3],w,c1);
|
||||
if (--num == 0) break;
|
||||
ap+=4;
|
||||
rp+=4;
|
||||
ap+=4; rp+=4; num-=4;
|
||||
}
|
||||
if (num)
|
||||
{
|
||||
mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
|
||||
mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
|
||||
mul_add(rp[2],ap[2],w,c1); return c1;
|
||||
}
|
||||
|
||||
return(c1);
|
||||
|
@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
|
|||
bn_check_num(num);
|
||||
if (num <= 0) return(c1);
|
||||
|
||||
/* for (;;) */
|
||||
while (1) /* circumvent egcs-1.1.2 bug */
|
||||
while (num&~3)
|
||||
{
|
||||
mul(rp[0],ap[0],w,c1);
|
||||
if (--num == 0) break;
|
||||
mul(rp[1],ap[1],w,c1);
|
||||
if (--num == 0) break;
|
||||
mul(rp[2],ap[2],w,c1);
|
||||
if (--num == 0) break;
|
||||
mul(rp[3],ap[3],w,c1);
|
||||
if (--num == 0) break;
|
||||
ap+=4;
|
||||
rp+=4;
|
||||
ap+=4; rp+=4; num-=4;
|
||||
}
|
||||
if (num)
|
||||
{
|
||||
mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
|
||||
mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
|
||||
mul(rp[2],ap[2],w,c1);
|
||||
}
|
||||
return(c1);
|
||||
}
|
||||
|
@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
|
|||
{
|
||||
bn_check_num(n);
|
||||
if (n <= 0) return;
|
||||
for (;;)
|
||||
while (n&~3)
|
||||
{
|
||||
BN_ULLONG t;
|
||||
|
||||
t=(BN_ULLONG)(a[0])*(a[0]);
|
||||
r[0]=Lw(t); r[1]=Hw(t);
|
||||
if (--n == 0) break;
|
||||
|
||||
t=(BN_ULLONG)(a[1])*(a[1]);
|
||||
r[2]=Lw(t); r[3]=Hw(t);
|
||||
if (--n == 0) break;
|
||||
|
||||
t=(BN_ULLONG)(a[2])*(a[2]);
|
||||
r[4]=Lw(t); r[5]=Hw(t);
|
||||
if (--n == 0) break;
|
||||
|
||||
t=(BN_ULLONG)(a[3])*(a[3]);
|
||||
r[6]=Lw(t); r[7]=Hw(t);
|
||||
if (--n == 0) break;
|
||||
|
||||
a+=4;
|
||||
r+=8;
|
||||
sqr(r[0],r[1],a[0]);
|
||||
sqr(r[2],r[3],a[1]);
|
||||
sqr(r[4],r[5],a[2]);
|
||||
sqr(r[6],r[7],a[3]);
|
||||
a+=4; r+=8; n-=4;
|
||||
}
|
||||
if (n)
|
||||
{
|
||||
sqr(r[0],r[1],a[0]); if (--n == 0) return;
|
||||
sqr(r[2],r[3],a[1]); if (--n == 0) return;
|
||||
sqr(r[4],r[5],a[2]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
|
|||
|
||||
#define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
||||
#elif defined(BN_UMULT_HIGH)
|
||||
|
||||
#define mul_add_c(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b); \
|
||||
t1 = ta * tb; \
|
||||
t2 = BN_UMULT_HIGH(ta,tb); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
|
||||
#define mul_add_c2(a,b,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a),tb=(b),t0; \
|
||||
t1 = BN_UMULT_HIGH(ta,tb); \
|
||||
t0 = ta * tb; \
|
||||
t2 = t1+t1; c2 += (t2<t1)?1:0; \
|
||||
t1 = t0+t0; t2 += (t1<t0)?1:0; \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
|
||||
#define sqr_add_c(a,i,c0,c1,c2) { \
|
||||
BN_ULONG ta=(a)[i]; \
|
||||
t1 = ta * ta; \
|
||||
t2 = BN_UMULT_HIGH(ta,ta); \
|
||||
c0 += t1; t2 += (c0<t1)?1:0; \
|
||||
c1 += t2; c2 += (c1<t2)?1:0; \
|
||||
}
|
||||
|
||||
#define sqr_add_c2(a,i,j,c0,c1,c2) \
|
||||
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
|
||||
|
||||
#else
|
||||
#define mul_add_c(a,b,c0,c1,c2) \
|
||||
t1=LBITS(a); t2=HBITS(a); \
|
||||
|
|
|
@ -280,9 +280,14 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
|
|||
*/
|
||||
rem=(n1-q*d0)&BN_MASK2;
|
||||
#endif
|
||||
#ifdef BN_UMULT_HIGH
|
||||
t2l = d1 * q;
|
||||
t2h = BN_UMULT_HIGH(d1,q);
|
||||
#else
|
||||
t2l=LBITS(d1); t2h=HBITS(d1);
|
||||
ql =LBITS(q); qh =HBITS(q);
|
||||
mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
|
||||
#endif
|
||||
|
||||
for (;;)
|
||||
{
|
||||
|
|
|
@ -86,6 +86,54 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(NO_ASM) && !defined(PEDANTIC)
|
||||
/*
|
||||
* BN_UMULT_HIGH section.
|
||||
*
|
||||
* No, I'm not trying to overwhelm you when stating that the
|
||||
* product of N-bit numbers is 2*N bits wide:-) No, I don't expect
|
||||
* you to be impressed when I say that if the compiler doesn't
|
||||
* support 2*N integer type, then you have to replace every N*N
|
||||
* multiplication with 4 (N/2)*(N/2) accompanied by some shifts
|
||||
* and additions which unavoidably results in severe performance
|
||||
* penalties. Of course provided that the hardware is capable of
|
||||
* producing 2*N result... That's when you normally start
|
||||
* considering assembler implementation. However! It should be
|
||||
* pointed out that some CPUs (most notably Alpha, PowerPC and
|
||||
* upcoming IA-64 family:-) provide *separate* instruction
|
||||
* calculating the upper half of the product placing the result
|
||||
* into a general purpose register. Now *if* the compiler supports
|
||||
* inline assembler, then it's not impossible to implement the
|
||||
* "bignum" routines (and have the compiler optimize 'em)
|
||||
* exhibiting "native" performance in C. That's what BN_UMULT_HIGH
|
||||
* macro is about:-)
|
||||
*
|
||||
* <appro@fy.chalmers.se>
|
||||
*/
|
||||
# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
|
||||
# if defined(__DECC)
|
||||
# include <c_asm.h>
|
||||
# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
|
||||
# elif defined(__GNUC__)
|
||||
# define BN_UMULT_HIGH(a,b) ({ \
|
||||
register BN_ULONG ret; \
|
||||
asm ("umulh %1,%2,%0" \
|
||||
: "=r"(ret) \
|
||||
: "r"(a), "r"(b)); \
|
||||
ret; })
|
||||
# endif /* compiler */
|
||||
# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
|
||||
# if defined(__GNUC__)
|
||||
# define BN_UMULT_HIGH(a,b) ({ \
|
||||
register BN_ULONG ret; \
|
||||
asm ("mulhdu %0,%1,%2" \
|
||||
: "=r"(ret) \
|
||||
: "r"(a), "r"(b)); \
|
||||
ret; })
|
||||
# endif /* compiler */
|
||||
# endif /* cpu */
|
||||
#endif /* NO_ASM */
|
||||
|
||||
/*************************************************************
|
||||
* Using the long long type
|
||||
*/
|
||||
|
@ -151,6 +199,43 @@ extern "C" {
|
|||
(c)= Hw(t); \
|
||||
}
|
||||
|
||||
#define sqr(r0,r1,a) { \
|
||||
BN_ULLONG t; \
|
||||
t=(BN_ULLONG)(a)*(a); \
|
||||
(r0)=Lw(t); \
|
||||
(r1)=Hw(t); ]
|
||||
}
|
||||
|
||||
#elif defined(BN_UMULT_HIGH)
|
||||
#define mul_add(r,a,w,c) { \
|
||||
BN_ULONG high,low,ret,tmp=(a); \
|
||||
ret = (r); \
|
||||
high= BN_UMULT_HIGH(w,tmp); \
|
||||
ret += (c); \
|
||||
low = (w) * tmp; \
|
||||
(c) = (ret<(c))?1:0; \
|
||||
(c) += high; \
|
||||
ret += low; \
|
||||
(c) += (ret<low)?1:0; \
|
||||
(r) = ret; \
|
||||
}
|
||||
|
||||
#define mul(r,a,w,c) { \
|
||||
BN_ULONG high,low,ret,ta=(a); \
|
||||
low = (w) * ta; \
|
||||
high= BN_UMULT_HIGH(w,ta); \
|
||||
ret = low + (c); \
|
||||
(c) = high; \
|
||||
(c) += (ret<low)?1:0; \
|
||||
(r) = ret; \
|
||||
}
|
||||
|
||||
#define sqr(r0,r1,a) { \
|
||||
BN_ULONG tmp=(a); \
|
||||
(r0) = tmp * tmp; \
|
||||
(r1) = BN_UMULT_HIGH(tmp,tmp); \
|
||||
}
|
||||
|
||||
#else
|
||||
/*************************************************************
|
||||
* No long long type
|
||||
|
|
Loading…
Reference in a new issue