ec/curve25519.c: facilitate assembly implementations.
Currently it's limited to 64-bit platforms only, as the minimum radix expected in assembly is 2^51.

Reviewed-by: Rich Salz <rsalz@openssl.org>
Reviewed-by: Paul Dale <paul.dale@oracle.com>
(Merged from https://github.com/openssl/openssl/pull/5408)
This commit is contained in:
parent 42efffcb70
commit c521e4392f
1 changed file with 287 additions and 32 deletions
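For orientation before the diff (an explanatory sketch, not part of the change): in the radix-2^51 code a field element modulo 2^255 - 19 is carried as five 64-bit limbs of roughly 51 bits each, which is the minimum radix the commit message says the assembly interface expects. The typedef and mask below appear verbatim in the file; only the comment is added here:

    /* value = f[0] + f[1]*2^51 + f[2]*2^102 + f[3]*2^153 + f[4]*2^204  (mod 2^255 - 19) */
    typedef uint64_t fe51[5];
    static const uint64_t MASK51 = 0x7ffffffffffff;    /* 2^51 - 1 */

The roughly 13 spare bits per limb absorb carries during multiplication and squaring before a single propagation pass, which is what makes this radix convenient for both the C and the assembly code.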
@@ -11,16 +11,19 @@
 #include "ec_lcl.h"
 #include <openssl/sha.h>

-#if !defined(PEDANTIC) && \
-    !defined(__sparc__) && \
-    (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16)
+#if defined(X25519_ASM) \
+    || ( !defined(PEDANTIC) && \
+         !defined(__sparc__) && \
+         (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16) )
 /*
  * Base 2^51 implementation.
  */
 # define BASE_2_51_IMPLEMENTED

 typedef uint64_t fe51[5];
+# if !defined(X25519_ASM)
 typedef unsigned __int128 u128;
+# endif

 static const uint64_t MASK51 = 0x7ffffffffffff;

@@ -132,6 +135,250 @@ static void fe51_tobytes(uint8_t *s, const fe51 h)
     s[31] = h4 >> 44;
 }

+# ifdef X25519_ASM
+void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g);
+void x25519_fe51_sqr(fe51 h, const fe51 f);
+void x25519_fe51_mul121666(fe51 h, fe51 f);
+#  define fe51_mul x25519_fe51_mul
+#  define fe51_sq x25519_fe51_sqr
+#  define fe51_mul121666 x25519_fe51_mul121666
+
+#  if defined(__x86_64) || defined(__x86_64__) || \
+      defined(_M_AMD64) || defined(_M_X64)
+
+#   define BASE_2_64_IMPLEMENTED
+
+typedef uint64_t fe64[4];
+
+int x25519_fe64_eligible();
+
+/*
+ * There are no reference C implementations for this radix.
+ */
+void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g);
+void x25519_fe64_sqr(fe64 h, const fe64 f);
+void x25519_fe64_mul121666(fe64 h, fe64 f);
+void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g);
+void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g);
+void x25519_fe64_tobytes(uint8_t *s, const fe64 f);
+#   define fe64_mul x25519_fe64_mul
+#   define fe64_sqr x25519_fe64_sqr
+#   define fe64_mul121666 x25519_fe64_mul121666
+#   define fe64_add x25519_fe64_add
+#   define fe64_sub x25519_fe64_sub
+#   define fe64_tobytes x25519_fe64_tobytes
+
+static uint64_t load_8(const uint8_t *in)
+{
+    uint64_t result;
+
+    result = in[0];
+    result |= ((uint64_t)in[1]) << 8;
+    result |= ((uint64_t)in[2]) << 16;
+    result |= ((uint64_t)in[3]) << 24;
+    result |= ((uint64_t)in[4]) << 32;
+    result |= ((uint64_t)in[5]) << 40;
+    result |= ((uint64_t)in[6]) << 48;
+    result |= ((uint64_t)in[7]) << 56;
+
+    return result;
+}
+
+static void fe64_frombytes(fe64 h, const uint8_t *s)
+{
+    h[0] = load_8(s);
+    h[1] = load_8(s + 8);
+    h[2] = load_8(s + 16);
+    h[3] = load_8(s + 24) & 0x7fffffffffffffff;
+}
+
+static void fe64_0(fe64 h)
+{
+    h[0] = 0;
+    h[1] = 0;
+    h[2] = 0;
+    h[3] = 0;
+}
+
+static void fe64_1(fe64 h)
+{
+    h[0] = 1;
+    h[1] = 0;
+    h[2] = 0;
+    h[3] = 0;
+}
+
+static void fe64_copy(fe64 h, const fe64 f)
+{
+    h[0] = f[0];
+    h[1] = f[1];
+    h[2] = f[2];
+    h[3] = f[3];
+}
+
+static void fe64_cswap(fe64 f, fe64 g, unsigned int b)
+{
+    int i;
+    uint64_t mask = 0 - (uint64_t)b;
+
+    for (i = 0; i < 4; i++) {
+        uint64_t x = f[i] ^ g[i];
+        x &= mask;
+        f[i] ^= x;
+        g[i] ^= x;
+    }
+}
+
+static void fe64_invert(fe64 out, const fe64 z)
+{
+    fe64 t0;
+    fe64 t1;
+    fe64 t2;
+    fe64 t3;
+    int i;
+
+    /*
+     * Compute z ** -1 = z ** (2 ** 255 - 19 - 2) with the exponent as
+     * 2 ** 255 - 21 = (2 ** 5) * (2 ** 250 - 1) + 11.
+     */
+
+    /* t0 = z ** 2 */
+    fe64_sqr(t0, z);
+
+    /* t1 = t0 ** (2 ** 2) = z ** 8 */
+    fe64_sqr(t1, t0);
+    fe64_sqr(t1, t1);
+
+    /* t1 = z * t1 = z ** 9 */
+    fe64_mul(t1, z, t1);
+    /* t0 = t0 * t1 = z ** 11 -- stash t0 away for the end. */
+    fe64_mul(t0, t0, t1);
+
+    /* t2 = t0 ** 2 = z ** 22 */
+    fe64_sqr(t2, t0);
+
+    /* t1 = t1 * t2 = z ** (2 ** 5 - 1) */
+    fe64_mul(t1, t1, t2);
+
+    /* t2 = t1 ** (2 ** 5) = z ** ((2 ** 5) * (2 ** 5 - 1)) */
+    fe64_sqr(t2, t1);
+    for (i = 1; i < 5; ++i)
+        fe64_sqr(t2, t2);
+
+    /* t1 = t1 * t2 = z ** ((2 ** 5 + 1) * (2 ** 5 - 1)) = z ** (2 ** 10 - 1) */
+    fe64_mul(t1, t2, t1);
+
+    /* Continuing similarly... */
+
+    /* t2 = z ** (2 ** 20 - 1) */
+    fe64_sqr(t2, t1);
+    for (i = 1; i < 10; ++i)
+        fe64_sqr(t2, t2);
+
+    fe64_mul(t2, t2, t1);
+
+    /* t2 = z ** (2 ** 40 - 1) */
+    fe64_sqr(t3, t2);
+    for (i = 1; i < 20; ++i)
+        fe64_sqr(t3, t3);
+
+    fe64_mul(t2, t3, t2);
+
+    /* t2 = z ** (2 ** 10) * (2 ** 40 - 1) */
+    for (i = 0; i < 10; ++i)
+        fe64_sqr(t2, t2);
+
+    /* t1 = z ** (2 ** 50 - 1) */
+    fe64_mul(t1, t2, t1);
+
+    /* t2 = z ** (2 ** 100 - 1) */
+    fe64_sqr(t2, t1);
+    for (i = 1; i < 50; ++i)
+        fe64_sqr(t2, t2);
+
+    fe64_mul(t2, t2, t1);
+
+    /* t2 = z ** (2 ** 200 - 1) */
+    fe64_sqr(t3, t2);
+    for (i = 1; i < 100; ++i)
+        fe64_sqr(t3, t3);
+
+    fe64_mul(t2, t3, t2);
+
+    /* t2 = z ** ((2 ** 50) * (2 ** 200 - 1) */
+    for (i = 0; i < 50; ++i)
+        fe64_sqr(t2, t2);
+
+    /* t1 = z ** (2 ** 250 - 1) */
+    fe64_mul(t1, t2, t1);
+
+    /* t1 = z ** ((2 ** 5) * (2 ** 250 - 1)) */
+    for (i = 0; i < 5; ++i)
+        fe64_sqr(t1, t1);
+
+    /* Recall t0 = z ** 11; out = z ** (2 ** 255 - 21) */
+    fe64_mul(out, t1, t0);
+}
+
+/*
+ * Duplicate of original x25519_scalar_mult_generic, but using
+ * fe64_* subroutines.
+ */
+static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32],
+                               const uint8_t point[32])
+{
+    fe64 x1, x2, z2, x3, z3, tmp0, tmp1;
+    uint8_t e[32];
+    unsigned swap = 0;
+    int pos;
+
+    memcpy(e, scalar, 32);
+    e[0] &= 0xf8;
+    e[31] &= 0x7f;
+    e[31] |= 0x40;
+    fe64_frombytes(x1, point);
+    fe64_1(x2);
+    fe64_0(z2);
+    fe64_copy(x3, x1);
+    fe64_1(z3);
+
+    for (pos = 254; pos >= 0; --pos) {
+        unsigned int b = 1 & (e[pos / 8] >> (pos & 7));
+
+        swap ^= b;
+        fe64_cswap(x2, x3, swap);
+        fe64_cswap(z2, z3, swap);
+        swap = b;
+        fe64_sub(tmp0, x3, z3);
+        fe64_sub(tmp1, x2, z2);
+        fe64_add(x2, x2, z2);
+        fe64_add(z2, x3, z3);
+        fe64_mul(z3, x2, tmp0);
+        fe64_mul(z2, z2, tmp1);
+        fe64_sqr(tmp0, tmp1);
+        fe64_sqr(tmp1, x2);
+        fe64_add(x3, z3, z2);
+        fe64_sub(z2, z3, z2);
+        fe64_mul(x2, tmp1, tmp0);
+        fe64_sub(tmp1, tmp1, tmp0);
+        fe64_sqr(z2, z2);
+        fe64_mul121666(z3, tmp1);
+        fe64_sqr(x3, x3);
+        fe64_add(tmp0, tmp0, z3);
+        fe64_mul(z3, x1, z2);
+        fe64_mul(z2, tmp1, tmp0);
+    }
+
+    fe64_invert(z2, z2);
+    fe64_mul(x2, x2, z2);
+    fe64_tobytes(out, x2);
+
+    OPENSSL_cleanse(e, sizeof(e));
+}
+#  endif
+
+# else
+
 static void fe51_mul(fe51 h, const fe51 f, const fe51 g)
 {
     u128 h0, h1, h2, h3, h4;
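Two reading aids for the block above (explanatory sketch, not part of the diff). A fe64 element is plain radix 2^64, value = h[0] + h[1]*2^64 + h[2]*2^128 + h[3]*2^192 reduced modulo 2^255 - 19, and x25519_fe64_eligible() is presumably a runtime probe exported by the assembly module that reports whether the current CPU can run the radix-2^64 code, since no C fallback exists for that radix. The exponent decomposition in fe64_invert is Fermat inversion, and the arithmetic checks out:

    /*
     * (2^5) * (2^250 - 1) + 11 = (2^255 - 2^5) + 11
     *                          = 2^255 - 32 + 11
     *                          = 2^255 - 21
     *                          = p - 2,  where p = 2^255 - 19,
     *
     * so z ** (p - 2) == z ** -1 (mod p) for any nonzero z.
     */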
@@ -192,9 +439,9 @@ static void fe51_mul(fe51 h, const fe51 f, const fe51 g)

 static void fe51_sq(fe51 h, const fe51 f)
 {
-# if defined(OPENSSL_SMALL_FOOTPRINT)
+#  if defined(OPENSSL_SMALL_FOOTPRINT)
     fe51_mul(h, f, f);
-# else
+#  else
     /* dedicated squaring gives 16-25% overall improvement */
     uint64_t g0 = f[0];
     uint64_t g1 = f[1];
@@ -241,9 +488,36 @@ static void fe51_sq(fe51 h, const fe51 f)
     h[2] = g2;
     h[3] = g3;
     h[4] = g4;
-# endif
+#  endif
 }

+static void fe51_mul121666(fe51 h, fe51 f)
+{
+    u128 h0 = f[0] * (u128)121666;
+    u128 h1 = f[1] * (u128)121666;
+    u128 h2 = f[2] * (u128)121666;
+    u128 h3 = f[3] * (u128)121666;
+    u128 h4 = f[4] * (u128)121666;
+    uint64_t g0, g1, g2, g3, g4;
+
+    h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51;
+    h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51;
+
+    h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51;
+    g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51;
+
+    g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51;
+    g3 += g2 >> 51; g2 &= MASK51;
+    g1 += g0 >> 51; g0 &= MASK51;
+
+    h[0] = g0;
+    h[1] = g1;
+    h[2] = g2;
+    h[3] = g3;
+    h[4] = g4;
+}
+# endif
+
 static void fe51_add(fe51 h, const fe51 f, const fe51 g)
 {
     h[0] = f[0] + g[0];
@@ -397,32 +671,6 @@ static void fe51_invert(fe51 out, const fe51 z)
     fe51_mul(out, t1, t0);
 }

-static void fe51_mul121666(fe51 h, fe51 f)
-{
-    u128 h0 = f[0] * (u128)121666;
-    u128 h1 = f[1] * (u128)121666;
-    u128 h2 = f[2] * (u128)121666;
-    u128 h3 = f[3] * (u128)121666;
-    u128 h4 = f[4] * (u128)121666;
-    uint64_t g0, g1, g2, g3, g4;
-
-    h3 += (uint64_t)(h2 >> 51); g2 = (uint64_t)h2 & MASK51;
-    h1 += (uint64_t)(h0 >> 51); g0 = (uint64_t)h0 & MASK51;
-
-    h4 += (uint64_t)(h3 >> 51); g3 = (uint64_t)h3 & MASK51;
-    g2 += (uint64_t)(h1 >> 51); g1 = (uint64_t)h1 & MASK51;
-
-    g0 += (uint64_t)(h4 >> 51) * 19; g4 = (uint64_t)h4 & MASK51;
-    g3 += g2 >> 51; g2 &= MASK51;
-    g1 += g0 >> 51; g0 &= MASK51;
-
-    h[0] = g0;
-    h[1] = g1;
-    h[2] = g2;
-    h[3] = g3;
-    h[4] = g4;
-}
-
 /*
  * Duplicate of original x25519_scalar_mult_generic, but using
  * fe51_* subroutines.
@@ -435,6 +683,13 @@ static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
     unsigned swap = 0;
     int pos;

+# ifdef BASE_2_64_IMPLEMENTED
+    if (x25519_fe64_eligible()) {
+        x25519_scalar_mulx(out, scalar, point);
+        return;
+    }
+# endif
+
     memcpy(e, scalar, 32);
     e[0] &= 0xf8;
     e[31] &= 0x7f;
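For calling context (a sketch assuming the file's existing X25519() entry point, which this commit does not touch; the real function may differ in detail): the dispatcher above is reached through a wrapper that also rejects the all-zero shared secret produced by low-order inputs, roughly:

    int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32],
               const uint8_t peer_public_value[32])
    {
        static const uint8_t kZeros[32] = { 0 };

        x25519_scalar_mult(out_shared_key, private_key, peer_public_value);
        /* Constant-time check that the result is not all zero. */
        return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
    }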