3-4 times better RSA/DSA performance on the WIN64A target. Well, that is on an
AMD64 CPU; EM64T will hardly exhibit better performance...
parent 19bd66fe74
commit 11de71b04c
2 changed files with 64 additions and 0 deletions
@@ -459,6 +459,34 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_LOHI)

#define mul_add_c(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b);		\
	BN_UMULT_LOHI(t1,t2,ta,tb);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define mul_add_c2(a,b,c0,c1,c2) {	\
	BN_ULONG ta=(a),tb=(b),t0;	\
	BN_UMULT_LOHI(t0,t1,ta,tb);	\
	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c(a,i,c0,c1,c2) {	\
	BN_ULONG ta=(a)[i];		\
	BN_UMULT_LOHI(t1,t2,ta,ta);	\
	c0 += t1; t2 += (c0<t1)?1:0;	\
	c1 += t2; c2 += (c1<t2)?1:0;	\
	}

#define sqr_add_c2(a,i,j,c0,c1,c2) \
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)

#elif defined(BN_UMULT_HIGH)

#define mul_add_c(a,b,c0,c1,c2) {	\
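The hunk above adds a BN_UMULT_LOHI branch to the comba-style multiplication and squaring macros: each step splits the double-width product a*b into low/high words and folds them into a three-word accumulator (c2,c1,c0) with explicit carry checks. A minimal standalone sketch of that carry chain follows; it uses GCC/Clang's unsigned __int128 as a stand-in for BN_UMULT_LOHI, which is an assumption for illustration only, not the committed MSVC code path.

/* Illustrative only: the __int128 split below stands in for BN_UMULT_LOHI. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t BN_ULONG;

/* Same carry chain as mul_add_c above: (c2,c1,c0) += a*b. */
static void mul_add_c_demo(BN_ULONG a, BN_ULONG b,
                           BN_ULONG *c0, BN_ULONG *c1, BN_ULONG *c2)
{
	unsigned __int128 p = (unsigned __int128)a * b;
	BN_ULONG t1 = (BN_ULONG)p;          /* low word, as _umul128 would return */
	BN_ULONG t2 = (BN_ULONG)(p >> 64);  /* high word, as __umulh would return */

	*c0 += t1; t2  += (*c0 < t1) ? 1 : 0;  /* add low word, carry into t2  */
	*c1 += t2; *c2 += (*c1 < t2) ? 1 : 0;  /* add high word, carry into c2 */
}

int main(void)
{
	BN_ULONG c0 = 0, c1 = 0, c2 = 0;
	mul_add_c_demo(~(BN_ULONG)0, ~(BN_ULONG)0, &c0, &c1, &c2);  /* (2^64-1)^2 */
	printf("c2=%llu c1=%llu c0=%llu\n",
	       (unsigned long long)c2, (unsigned long long)c1, (unsigned long long)c0);
	return 0;
}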
@@ -270,6 +270,15 @@ extern "C" {
	: "a"(a),"g"(b)	\
	: "cc");
# endif
# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
# if defined(_MSC_VER) && _MSC_VER>=1400
  unsigned __int64 __umulh (unsigned __int64 a,unsigned __int64 b);
  unsigned __int64 _umul128 (unsigned __int64 a,unsigned __int64 b,
                             unsigned __int64 *h);
# pragma intrinsic(__umulh,_umul128)
# define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
# define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
# endif
# endif		/* cpu */
#endif		/* OPENSSL_NO_ASM */

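The header hunk above is what enables the faster path on 64-bit Windows: it declares the MSVC __umulh and _umul128 intrinsics and maps BN_UMULT_HIGH and BN_UMULT_LOHI onto them. The sketch below shows how those intrinsics behave, assuming cl.exe targeting x64 with _MSC_VER >= 1400; it pulls the prototypes from <intrin.h> instead of the hand-written declarations in the diff.

/* Illustration of the intrinsics the new macros map to (MSVC x64 assumed). */
#include <intrin.h>
#include <stdio.h>

int main(void)
{
	unsigned __int64 a = 0xFFFFFFFFFFFFFFFFULL, b = 3, high, low;

	/* What BN_UMULT_LOHI(low,high,a,b) expands to: low word returned,
	 * high word written through the pointer argument. */
	low = _umul128(a, b, &high);
	printf("low=%llu high=%llu\n", low, high);

	/* What BN_UMULT_HIGH(a,b) expands to: high word only. */
	printf("high only=%llu\n", __umulh(a, b));
	return 0;
}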
@@ -313,6 +322,33 @@ extern "C" {
	(r1)=Hw(t); \
	}

#elif defined(BN_UMULT_LOHI)
#define mul_add(r,a,w,c) {		\
	BN_ULONG high,low,ret,tmp=(a);	\
	ret = (r);			\
	BN_UMULT_LOHI(low,high,w,tmp);	\
	ret += (c);			\
	(c) = (ret<(c))?1:0;		\
	(c) += high;			\
	ret += low;			\
	(c) += (ret<low)?1:0;		\
	(r) = ret;			\
	}

#define mul(r,a,w,c) {			\
	BN_ULONG high,low,ret,ta=(a);	\
	BN_UMULT_LOHI(low,high,w,ta);	\
	ret = low + (c);		\
	(c) = high;			\
	(c) += (ret<low)?1:0;		\
	(r) = ret;			\
	}

#define sqr(r0,r1,a) {			\
	BN_ULONG tmp=(a);		\
	BN_UMULT_LOHI(r0,r1,tmp,tmp);	\
	}

#elif defined(BN_UMULT_HIGH)
#define mul_add(r,a,w,c) {		\
	BN_ULONG high,low,ret,tmp=(a);	\
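The last hunk provides the word-level mul_add, mul and sqr primitives used by the generic bignum loops when BN_UMULT_LOHI is available; mul_add computes r + a*w + c, leaving the low word in r and the high word (the outgoing carry) in c. A short sketch of that contract follows, again with unsigned __int128 standing in for BN_UMULT_LOHI as an illustrative assumption rather than the committed intrinsic path.

/* Illustrative equivalent of the mul_add contract: (c,r) = r + a*w + c. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t BN_ULONG;

static void mul_add_demo(BN_ULONG *r, BN_ULONG a, BN_ULONG w, BN_ULONG *c)
{
	/* The full sum always fits in 128 bits: its maximum value is 2^128 - 1. */
	unsigned __int128 t = (unsigned __int128)a * w + *r + *c;
	*r = (BN_ULONG)t;          /* low word back into the result limb */
	*c = (BN_ULONG)(t >> 64);  /* high word becomes the carry        */
}

int main(void)
{
	BN_ULONG r = 5, c = 1;
	mul_add_demo(&r, ~(BN_ULONG)0, 2, &c);  /* 5 + (2^64-1)*2 + 1 = 2^65 + 4 */
	printf("r=%llu c=%llu\n", (unsigned long long)r, (unsigned long long)c);
	return 0;
}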