/*
 * (ab8a4e54db) Moved GCM definitions into this internal header so the
 * EVP GCM code can embed the context directly, avoiding a separate
 * allocation.
 */
/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use is governed by OpenSSL license.
 * ====================================================================
 */
#include <openssl/modes.h>
/*
 * 64-bit integer types and the U64() literal-suffix macro, chosen per
 * compiler: MSVC (but not MinGW) spells the type __int64 with a UI64
 * literal suffix; targets defining __arch64__ can use plain long
 * (NOTE(review): __arch64__ is a compiler-provided 64-bit-arch macro,
 * e.g. on SPARC — confirm for new platforms); everything else falls
 * back to long long / ULL.
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef __int64 i64;
typedef unsigned __int64 u64;
#define U64(C) C##UI64
#elif defined(__arch64__)
typedef long i64;
typedef unsigned long u64;
#define U64(C) C##UL
#else
typedef long long i64;
typedef unsigned long long u64;
#define U64(C) C##ULL
#endif

typedef unsigned int u32;       /* assumed to be exactly 32 bits on supported targets */
typedef unsigned char u8;
/*
 * STRICT_ALIGNMENT is set by default and cleared only for architectures
 * known to tolerate misaligned 32-bit loads/stores (x86, x86_64, s390).
 * When cleared (and a BSWAP4 primitive exists), GETU32/PUTU32 below use
 * direct word access plus byte swap instead of per-byte assembly.
 */
#define STRICT_ALIGNMENT 1
#if defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
    defined(__s390__) || defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif
/*
 * Byte-swap primitives: BSWAP8 reverses the bytes of a 64-bit value,
 * BSWAP4 of a 32-bit one. They are provided as GCC extended inline
 * assembler or MSVC intrinsics, and only when inline assembler is
 * acceptable to the build.
 *
 * BUG FIX: the guard previously tested OPNESSL_NO_INLINE_ASM (typo),
 * so defining OPENSSL_NO_INLINE_ASM did NOT suppress the inline-asm
 * versions as intended; corrected to OPENSSL_NO_INLINE_ASM.
 */
#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
#if defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
#  define BSWAP8(x) ({ u64 ret=(x);                     \
                        asm volatile ("bswapq %0"       \
                        : "+r"(ret));   ret;            })
#  define BSWAP4(x) ({ u32 ret=(x);                     \
                        asm volatile ("bswapl %0"       \
                        : "+r"(ret));   ret;            })
# elif (defined(__i386) || defined(__i386__))
   /* 32-bit x86: swap each half with bswapl, then exchange halves. */
#  define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);      \
                        asm volatile ("bswapl %0; bswapl %1"    \
                        : "+r"(hi),"+r"(lo));           \
                        (u64)hi<<32|lo; })
#  define BSWAP4(x) ({ u32 ret=(x);                     \
                        asm volatile ("bswapl %0"       \
                        : "+r"(ret));   ret;            })
# endif
#elif defined(_MSC_VER)
# if _MSC_VER>=1300
#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
#  define BSWAP8(x)     _byteswap_uint64((u64)(x))
#  define BSWAP4(x)     _byteswap_ulong((u32)(x))
# elif defined(_M_IX86)
   /* Pre-VC7 x86: value is left in EAX, which doubles as the return
    * register, hence no explicit return statement. */
   __inline u32 _bswap4(u32 val) {
        _asm mov eax,val
        _asm bswap eax
   }
#  define BSWAP4(x)     _bswap4(x)
# endif
#endif
#endif
/*
 * 32-bit big-endian load/store. When a BSWAP4 primitive exists and the
 * target permits misaligned word access, read or write the word
 * directly and byte-swap it. Otherwise assemble/disassemble the value
 * one byte at a time, which is endian- and alignment-safe on any
 * platform.
 */
#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#else
#define GETU32(p)       ((u32)(p)[3] | (u32)(p)[2]<<8 | (u32)(p)[1]<<16 | (u32)(p)[0]<<24)
#define PUTU32(p,v)     ((p)[3]=(u8)(v), (p)[2]=(u8)((v)>>8), (p)[1]=(u8)((v)>>16), (p)[0]=(u8)((v)>>24))
#endif
/* GCM definitions */

/* 128-bit quantity, used for the hash key H and its precomputed table. */
typedef struct { u64 hi,lo; } u128;

/* Force our choice of TABLE_BITS, overriding any external definition. */
#ifdef TABLE_BITS
#undef TABLE_BITS
#endif
/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it is trivial to see that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it is
 * possible to attempt to deduce the secret parameter H and, if
 * successful, tamper with messages [which is nothing but trivial in CTR
 * mode]. In the "Shoup's" case it is not as trivial, but there is no
 * reason to believe that it is resistant to cache-timing attacks
 * either. And the thing about the "8-bit" implementation is that it
 * consumes 16 (sixteen) times more memory, 4KB per individual key +
 * 1KB shared. On the pro side it should be about twice as fast as the
 * "4-bit" version; for gcc-generated x86[_64] code the "8-bit" version
 * was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security-performance balance and
 * adequate all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 */
#define TABLE_BITS 4
/*
 * GCM working state shared between gcm128.c and the EVP GCM code.
 * NOTE(review): external (possibly assembler) modules may depend on the
 * exact field layout — confirm before reordering anything here.
 */
struct gcm128_context {
        /* Following 6 names follow names in GCM specification */
        union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
                                                Xi,H,len;
        /* Pre-computed table used by gcm_gmult_* */
#if TABLE_BITS==8
        u128 Htable[256];
#else
        u128 Htable[16];
        /* Implementation hooks: single-block GHASH multiply and bulk hash. */
        void (*gmult)(u64 Xi[2],const u128 Htable[16]);
        void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
        /* presumably residue counters for partial message/AAD blocks —
         * confirm against the gcm128 implementation */
        unsigned int mres, ares;
        block128_f block;       /* block cipher callback (type from <openssl/modes.h>) */
        void *key;              /* opaque cipher key schedule passed to block() */
};