openssl/crypto/modes/modes_lcl.h

/* ====================================================================
* Copyright (c) 2010 The OpenSSL Project. All rights reserved.
*
* Redistribution and use is governed by OpenSSL license.
* ====================================================================
*/
#include <openssl/modes.h>
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
typedef __int64 i64;
typedef unsigned __int64 u64;
#define U64(C) C##UI64
#elif defined(__arch64__)
typedef long i64;
typedef unsigned long u64;
#define U64(C) C##UL
#else
typedef long long i64;
typedef unsigned long long u64;
#define U64(C) C##ULL
#endif
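/*
 * U64 appends whichever 64-bit constant suffix the compiler expects, so a
 * 64-bit literal such as the GHASH reduction constant can be written
 * portably as U64(0xe100000000000000) across all three branches above.
 */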
typedef unsigned int u32;
typedef unsigned char u8;
#define STRICT_ALIGNMENT 1
#if defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
defined(__s390__) || defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif
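/*
 * The architectures listed above tolerate misaligned word accesses, so
 * STRICT_ALIGNMENT is lifted there; this enables the word-at-a-time
 * GETU32/PUTU32 variants further down.
 */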
#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
#if defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
#  define BSWAP8(x) ({ u64 ret=(x);                    \
                        asm volatile ("bswapq %0"       \
                        : "+r"(ret)); ret; })
#  define BSWAP4(x) ({ u32 ret=(x);                    \
                        asm volatile ("bswapl %0"       \
                        : "+r"(ret)); ret; })
# elif (defined(__i386) || defined(__i386__))
   /*
    * "hi" and "lo" are named for their position in the result: each
    * 32-bit half is byte-swapped in place and the halves are exchanged
    * on reassembly.
    */
#  define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x);     \
                        asm volatile ("bswapl %0; bswapl %1" \
                        : "+r"(hi),"+r"(lo));           \
                        (u64)hi<<32|lo; })
#  define BSWAP4(x) ({ u32 ret=(x);                    \
                        asm volatile ("bswapl %0"       \
                        : "+r"(ret)); ret; })
# endif
#elif defined(_MSC_VER)
# if _MSC_VER>=1300
# pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
# define BSWAP8(x) _byteswap_uint64((u64)(x))
# define BSWAP4(x) _byteswap_ulong((u32)(x))
# elif defined(_M_IX86)
   __inline u32 _bswap4(u32 val) {
        _asm mov eax,val
        _asm bswap eax
   }    /* no explicit return: MSVC returns a u32 in eax */
# define BSWAP4(x) _bswap4(x)
# endif
#endif
#endif
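/*
 * When available, BSWAP4/BSWAP8 reverse byte order within a register,
 * e.g. BSWAP4(0x01020304) yields 0x04030201.
 */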
#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
#define GETU32(p) BSWAP4(*(const u32 *)(p))
#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
#else
#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
#endif
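/*
 * Either way GETU32/PUTU32 implement a big-endian 32-bit load/store:
 * GETU32 over the bytes {0x01,0x02,0x03,0x04} yields 0x01020304 on any
 * host that reaches this code.
 */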
/* GCM definitions */
typedef struct { u64 hi,lo; } u128;
#ifdef TABLE_BITS
#undef TABLE_BITS
#endif
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that cache
 * timing information can reveal a fair portion of the intermediate
 * hash value. Given that the ciphertext is always available to an
 * attacker, this lets him attempt to deduce the secret parameter H
 * and, if successful, tamper with messages [which is trivial in CTR
 * mode]. In "Shoup's" case the attack is not as easy, but there is no
 * reason to believe the method is resistant to cache-timing attacks
 * either. As for the "8-bit" implementation: it consumes 16 (sixteen)
 * times more memory, 4KB per individual key + 1KB shared. On the pro
 * side it should be about twice as fast as the "4-bit" version, and
 * for gcc-generated x86[_64] code the "8-bit" version was observed to
 * run ~75% faster, closer to 100% for commercial compilers... Yet the
 * "4-bit" procedure is preferred, because it is believed to provide a
 * better security-performance balance and adequate all-round
 * performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 */
#define TABLE_BITS 4
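/*
 * Memory check: with TABLE_BITS==4 the per-key Htable below holds
 * 2^4 = 16 u128 entries, i.e. 16*16 = 256 bytes; the "8-bit" variant
 * would hold 2^8 = 256 entries, i.e. the 4KB per key quoted above.
 */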
struct gcm128_context {
	/* The following six names match the names used in the GCM specification */
	union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
						Xi,H,len;
/* Pre-computed table used by gcm_gmult_* */
#if TABLE_BITS==8
u128 Htable[256];
#else
u128 Htable[16];
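	/* GF(2^128) multiply-by-H and bulk-hash routines selected at init time */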
void (*gmult)(u64 Xi[2],const u128 Htable[16]);
void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
	unsigned int mres, ares;	/* byte counts of pending partial
					 * message and AAD blocks */
	block128_f block;		/* underlying block cipher */
	void *key;			/* key schedule handed to block() */
};
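
/*
 * A minimal usage sketch (not part of this header), assuming the
 * CRYPTO_gcm128_* interface declared in openssl/modes.h with AES as the
 * underlying block128_f and error checks elided:
 *
 *	AES_KEY ks;
 *	GCM128_CONTEXT ctx;
 *	unsigned char tag[16];
 *
 *	AES_set_encrypt_key(key, 128, &ks);
 *	CRYPTO_gcm128_init(&ctx, &ks, (block128_f)AES_encrypt);
 *	CRYPTO_gcm128_setiv(&ctx, iv, iv_len);
 *	CRYPTO_gcm128_aad(&ctx, aad, aad_len);
 *	CRYPTO_gcm128_encrypt(&ctx, plaintext, ciphertext, pt_len);
 *	CRYPTO_gcm128_tag(&ctx, tag, sizeof(tag));
 *
 * key, iv, aad, plaintext and the corresponding lengths are assumed to
 * be supplied by the caller.
 */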