diff --git a/crypto/ec/curve448/arch_32/f_impl.c b/crypto/ec/curve448/arch_32/f_impl.c index 8ba7c630e7..423e8a803a 100644 --- a/crypto/ec/curve448/arch_32/f_impl.c +++ b/crypto/ec/curve448/arch_32/f_impl.c @@ -12,14 +12,6 @@ #include "field.h" -#if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !defined(I_HATE_UNROLLED_LOOPS)) \ - || defined(C448_FORCE_UNROLL) -# define REPEAT8(_x) _x _x _x _x _x _x _x _x -# define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0) -#else -# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0) -#endif - void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; @@ -34,30 +26,28 @@ void gf_mul(gf_s * RESTRICT cs, const gf as, const gf bs) bb[i] = b[i] + b[i + 8]; } - FOR_LIMB(j, 0, 8, { - accum2 = 0; - FOR_LIMB(i, 0, j + 1, { - accum2 += widemul(a[j - i], b[i]); - accum1 += widemul(aa[j - i], bb[i]); - accum0 += widemul(a[8 + j - i], b[8 + i]); - } - ); accum1 -= accum2; accum0 += accum2; - accum2 = 0; - FOR_LIMB(i, j + 1, 8, { - accum0 -= - widemul(a[8 + j - i], b[i]); - accum2 += - widemul(aa[8 + j - i], - bb[i]); - accum1 += widemul(a[16 + j - i], b[8 + i]); - } - ); - accum1 += accum2; - accum0 += accum2; - c[j] = ((uint32_t)(accum0)) & mask; - c[j + 8] = ((uint32_t)(accum1)) & mask; - accum0 >>= 28; accum1 >>= 28; - }); + for (j = 0; j < 8; j++) { + accum2 = 0; + for (i = 0; i < j + 1; i++) { + accum2 += widemul(a[j - i], b[i]); + accum1 += widemul(aa[j - i], bb[i]); + accum0 += widemul(a[8 + j - i], b[8 + i]); + } + accum1 -= accum2; + accum0 += accum2; + accum2 = 0; + for (i = j + 1; i < 8; i++) { + accum0 -= widemul(a[8 + j - i], b[i]); + accum2 += widemul(aa[8 + j - i], bb[i]); + accum1 += widemul(a[16 + j - i], b[8 + i]); + } + accum1 += accum2; + accum0 += accum2; + c[j] = ((uint32_t)(accum0)) & mask; + c[j + 8] = ((uint32_t)(accum1)) & mask; + accum0 >>= 28; + accum1 >>= 28; + } accum0 += accum1; accum0 += c[8]; @@ -81,11 +71,14 @@ void gf_mulw_unsigned(gf_s * RESTRICT cs, const gf as, uint32_t b) assert(b < 1 << 28); - FOR_LIMB(i, 0, 8, { - accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]); - c[i] = accum0 & mask; accum0 >>= 28; - c[i + 8] = accum8 & mask; accum8 >>= 28; - }); + for (i = 0; i < 8; i++) { + accum0 += widemul(b, a[i]); + accum8 += widemul(b, a[i + 8]); + c[i] = accum0 & mask; + accum0 >>= 28; + c[i + 8] = accum8 & mask; + accum8 >>= 28; + } accum0 += accum8 + c[8]; c[8] = ((uint32_t)accum0) & mask; diff --git a/crypto/ec/curve448/f_generic.c b/crypto/ec/curve448/f_generic.c index 341eb3f3b0..6babea6e41 100644 --- a/crypto/ec/curve448/f_generic.c +++ b/crypto/ec/curve448/f_generic.c @@ -30,7 +30,7 @@ void gf_serialize(uint8_t serial[SER_BYTES], const gf x, int with_hibit) if (!with_hibit) assert(gf_hibit(red) == 0); - UNROLL for (i = 0; i < (with_hibit ? X_SER_BYTES : SER_BYTES); i++) { + for (i = 0; i < (with_hibit ? X_SER_BYTES : SER_BYTES); i++) { if (fill < 8 && j < NLIMBS) { buffer |= ((dword_t) red->limb[LIMBPERM(j)]) << fill; fill += LIMB_PLACE_VALUE(LIMBPERM(j)); @@ -73,9 +73,11 @@ mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit, unsigned int i; mask_t succ; - UNROLL for (i = 0; i < NLIMBS; i++) { - UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < nbytes) { - uint8_t sj = serial[j]; + for (i = 0; i < NLIMBS; i++) { + while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < nbytes) { + uint8_t sj; + + sj = serial[j]; if (j == nbytes - 1) sj &= ~hi_nmask; buffer |= ((dword_t) sj) << fill; diff --git a/crypto/ec/curve448/word.h b/crypto/ec/curve448/word.h index 6067404e65..c739b70d51 100644 --- a/crypto/ec/curve448/word.h +++ b/crypto/ec/curve448/word.h @@ -59,18 +59,6 @@ typedef int64_t dsword_t; # error "For now we only support 32- and 64-bit architectures." # endif - -/* PERF: vectorize vs unroll */ -# ifdef __clang__ -# if 100*__clang_major__ + __clang_minor__ > 305 -# define UNROLL _Pragma("clang loop unroll(full)") -# endif -# endif - -# ifndef UNROLL -# define UNROLL -# endif - /* * The plan on booleans: The external interface uses c448_bool_t, but this * might be a different size than our particular arch's word_t (and thus