From 3c849bc901fa191fc517bc20d905783e6e428de5 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 12 Jul 2018 11:53:16 +0200 Subject: [PATCH] ec/curve25519.c: reorganize for better accessibility. Move the base 2^64 code to its own #if section. It was nested in the base 2^51 section, which was arguably tricky to follow. Reviewed-by: Rich Salz (Merged from https://github.com/openssl/openssl/pull/6699) --- crypto/ec/curve25519.c | 293 +++++++++++++++++++++-------------------- 1 file changed, 150 insertions(+), 143 deletions(-) diff --git a/crypto/ec/curve25519.c b/crypto/ec/curve25519.c index 9666de1201..abe9b9cbf6 100644 --- a/crypto/ec/curve25519.c +++ b/crypto/ec/curve25519.c @@ -11,149 +11,23 @@ #include "ec_lcl.h" #include -#if defined(X25519_ASM) \ - || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ == 16) \ - && !defined(__sparc__) \ - && !(defined(__ANDROID__) && !defined(__clang__)) ) -/* - * Base 2^51 implementation. - */ -# define BASE_2_51_IMPLEMENTED +#if defined(X25519_ASM) && (defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64)) -typedef uint64_t fe51[5]; -# if !defined(X25519_ASM) -typedef __uint128_t u128; -# endif - -static const uint64_t MASK51 = 0x7ffffffffffff; - -static uint64_t load_7(const uint8_t *in) -{ - uint64_t result; - - result = in[0]; - result |= ((uint64_t)in[1]) << 8; - result |= ((uint64_t)in[2]) << 16; - result |= ((uint64_t)in[3]) << 24; - result |= ((uint64_t)in[4]) << 32; - result |= ((uint64_t)in[5]) << 40; - result |= ((uint64_t)in[6]) << 48; - - return result; -} - -static uint64_t load_6(const uint8_t *in) -{ - uint64_t result; - - result = in[0]; - result |= ((uint64_t)in[1]) << 8; - result |= ((uint64_t)in[2]) << 16; - result |= ((uint64_t)in[3]) << 24; - result |= ((uint64_t)in[4]) << 32; - result |= ((uint64_t)in[5]) << 40; - - return result; -} - -static void fe51_frombytes(fe51 h, const uint8_t *s) -{ - uint64_t h0 = load_7(s); /* 56 bits */ - uint64_t h1 = load_6(s + 7) << 5; 
/* 53 bits */ - uint64_t h2 = load_7(s + 13) << 2; /* 58 bits */ - uint64_t h3 = load_6(s + 20) << 7; /* 55 bits */ - uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4; /* 51 bits */ - - h1 |= h0 >> 51; h0 &= MASK51; - h2 |= h1 >> 51; h1 &= MASK51; - h3 |= h2 >> 51; h2 &= MASK51; - h4 |= h3 >> 51; h3 &= MASK51; - - h[0] = h0; - h[1] = h1; - h[2] = h2; - h[3] = h3; - h[4] = h4; -} - -static void fe51_tobytes(uint8_t *s, const fe51 h) -{ - uint64_t h0 = h[0]; - uint64_t h1 = h[1]; - uint64_t h2 = h[2]; - uint64_t h3 = h[3]; - uint64_t h4 = h[4]; - uint64_t q; - - /* compare to modulus */ - q = (h0 + 19) >> 51; - q = (h1 + q) >> 51; - q = (h2 + q) >> 51; - q = (h3 + q) >> 51; - q = (h4 + q) >> 51; - - /* full reduce */ - h0 += 19 * q; - h1 += h0 >> 51; h0 &= MASK51; - h2 += h1 >> 51; h1 &= MASK51; - h3 += h2 >> 51; h2 &= MASK51; - h4 += h3 >> 51; h3 &= MASK51; - h4 &= MASK51; - - /* smash */ - s[0] = (uint8_t)(h0 >> 0); - s[1] = (uint8_t)(h0 >> 8); - s[2] = (uint8_t)(h0 >> 16); - s[3] = (uint8_t)(h0 >> 24); - s[4] = (uint8_t)(h0 >> 32); - s[5] = (uint8_t)(h0 >> 40); - s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3)); - s[7] = (uint8_t)(h1 >> 5); - s[8] = (uint8_t)(h1 >> 13); - s[9] = (uint8_t)(h1 >> 21); - s[10] = (uint8_t)(h1 >> 29); - s[11] = (uint8_t)(h1 >> 37); - s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6)); - s[13] = (uint8_t)(h2 >> 2); - s[14] = (uint8_t)(h2 >> 10); - s[15] = (uint8_t)(h2 >> 18); - s[16] = (uint8_t)(h2 >> 26); - s[17] = (uint8_t)(h2 >> 34); - s[18] = (uint8_t)(h2 >> 42); - s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1)); - s[20] = (uint8_t)(h3 >> 7); - s[21] = (uint8_t)(h3 >> 15); - s[22] = (uint8_t)(h3 >> 23); - s[23] = (uint8_t)(h3 >> 31); - s[24] = (uint8_t)(h3 >> 39); - s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4)); - s[26] = (uint8_t)(h4 >> 4); - s[27] = (uint8_t)(h4 >> 12); - s[28] = (uint8_t)(h4 >> 20); - s[29] = (uint8_t)(h4 >> 28); - s[30] = (uint8_t)(h4 >> 36); - s[31] = (uint8_t)(h4 >> 44); -} - -# ifdef 
X25519_ASM -void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); -void x25519_fe51_sqr(fe51 h, const fe51 f); -void x25519_fe51_mul121666(fe51 h, fe51 f); -# define fe51_mul x25519_fe51_mul -# define fe51_sq x25519_fe51_sqr -# define fe51_mul121666 x25519_fe51_mul121666 - -# if defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_AMD64) || defined(_M_X64) - -# define BASE_2_64_IMPLEMENTED +# define BASE_2_64_IMPLEMENTED typedef uint64_t fe64[4]; int x25519_fe64_eligible(void); /* - * There are no reference C implementations for this radix. + * Following subroutines perform corresponding operations modulo + * 2^256-38, i.e. double the curve modulus. However, inputs and + * outputs are permitted to be partially reduced, i.e. to remain + * in [0..2^256) range. It's all tied up in final fe64_tobytes + * that performs full reduction modulo 2^255-19. + * + * There are no reference C implementations for these. */ void x25519_fe64_mul(fe64 h, const fe64 f, const fe64 g); void x25519_fe64_sqr(fe64 h, const fe64 f); @@ -161,12 +35,12 @@ void x25519_fe64_mul121666(fe64 h, fe64 f); void x25519_fe64_add(fe64 h, const fe64 f, const fe64 g); void x25519_fe64_sub(fe64 h, const fe64 f, const fe64 g); void x25519_fe64_tobytes(uint8_t *s, const fe64 f); -# define fe64_mul x25519_fe64_mul -# define fe64_sqr x25519_fe64_sqr -# define fe64_mul121666 x25519_fe64_mul121666 -# define fe64_add x25519_fe64_add -# define fe64_sub x25519_fe64_sub -# define fe64_tobytes x25519_fe64_tobytes +# define fe64_mul x25519_fe64_mul +# define fe64_sqr x25519_fe64_sqr +# define fe64_mul121666 x25519_fe64_mul121666 +# define fe64_add x25519_fe64_add +# define fe64_sub x25519_fe64_sub +# define fe64_tobytes x25519_fe64_tobytes static uint64_t load_8(const uint8_t *in) { @@ -375,10 +249,143 @@ static void x25519_scalar_mulx(uint8_t out[32], const uint8_t scalar[32], OPENSSL_cleanse(e, sizeof(e)); } -# endif +#endif +#if defined(X25519_ASM) \ + || ( (defined(__SIZEOF_INT128__) && __SIZEOF_INT128__ 
== 16) \ + && !defined(__sparc__) \ + && !(defined(__ANDROID__) && !defined(__clang__)) ) +/* + * Base 2^51 implementation. It's virtually no different from reference + * base 2^25.5 implementation in respect to lax boundary conditions for + * intermediate values and even individual limbs. So that whatever you + * know about the reference, applies even here... + */ +# define BASE_2_51_IMPLEMENTED + +typedef uint64_t fe51[5]; + +static const uint64_t MASK51 = 0x7ffffffffffff; + +static uint64_t load_7(const uint8_t *in) +{ + uint64_t result; + + result = in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + result |= ((uint64_t)in[4]) << 32; + result |= ((uint64_t)in[5]) << 40; + result |= ((uint64_t)in[6]) << 48; + + return result; +} + +static uint64_t load_6(const uint8_t *in) +{ + uint64_t result; + + result = in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + result |= ((uint64_t)in[4]) << 32; + result |= ((uint64_t)in[5]) << 40; + + return result; +} + +static void fe51_frombytes(fe51 h, const uint8_t *s) +{ + uint64_t h0 = load_7(s); /* 56 bits */ + uint64_t h1 = load_6(s + 7) << 5; /* 53 bits */ + uint64_t h2 = load_7(s + 13) << 2; /* 58 bits */ + uint64_t h3 = load_6(s + 20) << 7; /* 55 bits */ + uint64_t h4 = (load_6(s + 26) & 0x7fffffffffff) << 4; /* 51 bits */ + + h1 |= h0 >> 51; h0 &= MASK51; + h2 |= h1 >> 51; h1 &= MASK51; + h3 |= h2 >> 51; h2 &= MASK51; + h4 |= h3 >> 51; h3 &= MASK51; + + h[0] = h0; + h[1] = h1; + h[2] = h2; + h[3] = h3; + h[4] = h4; +} + +static void fe51_tobytes(uint8_t *s, const fe51 h) +{ + uint64_t h0 = h[0]; + uint64_t h1 = h[1]; + uint64_t h2 = h[2]; + uint64_t h3 = h[3]; + uint64_t h4 = h[4]; + uint64_t q; + + /* compare to modulus */ + q = (h0 + 19) >> 51; + q = (h1 + q) >> 51; + q = (h2 + q) >> 51; + q = (h3 + q) >> 51; + q = (h4 + q) >> 51; + + /* full reduce */ + h0 += 19 * q; + h1 += h0 >> 
51; h0 &= MASK51; + h2 += h1 >> 51; h1 &= MASK51; + h3 += h2 >> 51; h2 &= MASK51; + h4 += h3 >> 51; h3 &= MASK51; + h4 &= MASK51; + + /* smash */ + s[0] = (uint8_t)(h0 >> 0); + s[1] = (uint8_t)(h0 >> 8); + s[2] = (uint8_t)(h0 >> 16); + s[3] = (uint8_t)(h0 >> 24); + s[4] = (uint8_t)(h0 >> 32); + s[5] = (uint8_t)(h0 >> 40); + s[6] = (uint8_t)((h0 >> 48) | ((uint32_t)h1 << 3)); + s[7] = (uint8_t)(h1 >> 5); + s[8] = (uint8_t)(h1 >> 13); + s[9] = (uint8_t)(h1 >> 21); + s[10] = (uint8_t)(h1 >> 29); + s[11] = (uint8_t)(h1 >> 37); + s[12] = (uint8_t)((h1 >> 45) | ((uint32_t)h2 << 6)); + s[13] = (uint8_t)(h2 >> 2); + s[14] = (uint8_t)(h2 >> 10); + s[15] = (uint8_t)(h2 >> 18); + s[16] = (uint8_t)(h2 >> 26); + s[17] = (uint8_t)(h2 >> 34); + s[18] = (uint8_t)(h2 >> 42); + s[19] = (uint8_t)((h2 >> 50) | ((uint32_t)h3 << 1)); + s[20] = (uint8_t)(h3 >> 7); + s[21] = (uint8_t)(h3 >> 15); + s[22] = (uint8_t)(h3 >> 23); + s[23] = (uint8_t)(h3 >> 31); + s[24] = (uint8_t)(h3 >> 39); + s[25] = (uint8_t)((h3 >> 47) | ((uint32_t)h4 << 4)); + s[26] = (uint8_t)(h4 >> 4); + s[27] = (uint8_t)(h4 >> 12); + s[28] = (uint8_t)(h4 >> 20); + s[29] = (uint8_t)(h4 >> 28); + s[30] = (uint8_t)(h4 >> 36); + s[31] = (uint8_t)(h4 >> 44); +} + +# if defined(X25519_ASM) +void x25519_fe51_mul(fe51 h, const fe51 f, const fe51 g); +void x25519_fe51_sqr(fe51 h, const fe51 f); +void x25519_fe51_mul121666(fe51 h, fe51 f); +# define fe51_mul x25519_fe51_mul +# define fe51_sq x25519_fe51_sqr +# define fe51_mul121666 x25519_fe51_mul121666 # else +typedef __uint128_t u128; + static void fe51_mul(fe51 h, const fe51 f, const fe51 g) { u128 h0, h1, h2, h3, h4;