From d6e4287c9726691e800bff221be71edd894a3c6a Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 17 Apr 2019 21:30:39 +0200 Subject: [PATCH] aes/asm/aesv8-armx.pl: ~20% improvement on ThunderX2. Reviewed-by: Tim Hudson Reviewed-by: Richard Levitte (Merged from https://github.com/openssl/openssl/pull/8776) --- crypto/aes/asm/aesv8-armx.pl | 395 ++++++++++++++++++++++++++++++++++- 1 file changed, 389 insertions(+), 6 deletions(-) diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl index b708a61d50..3b3a53bf30 100755 --- a/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/aes/asm/aesv8-armx.pl @@ -27,19 +27,34 @@ # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... # +# April 2019 +# +# Key to performance of parallelize-able modes is round instruction +# interleaving. But which factor to use? There is optimal one for +# each combination of instruction latency and issue rate, beyond +# which increasing interleave factor doesn't pay off. While on cons +# side we have code size increase and resource waste on platforms for +# which interleave factor is too high. In other words you want it to +# be just right. So far interleave factor of 3x was serving well all +# platforms. But for ThunderX2 optimal interleave factor was measured +# to be 5x... +# # Performance in cycles per byte processed with 128-bit key: # # CBC enc CBC dec CTR # Apple A7 2.39 1.20 1.20 -# Cortex-A53 1.32 1.29 1.46 -# Cortex-A57(*) 1.95 0.85 0.93 -# Denver 1.96 0.86 0.80 -# Mongoose 1.33 1.20 1.20 -# Kryo 1.26 0.94 1.00 -# ThunderX2 5.95 1.53 1.55 +# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 +# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 +# Cortex-A72 1.33 0.85/0.88 0.92/0.96 +# Denver 1.96 0.65/0.86 0.76/0.80 +# Mongoose 1.33 1.23/1.20 1.30/1.20 +# Kryo 1.26 0.87/0.94 1.00/1.00 +# ThunderX2 5.95 1.25 1.30 # # (*) original 3.64/1.34/1.32 results were for r0p0 revision # and are still same even for updated module; +# (**) numbers after slash are for 32-bit code, which is 3x- +# interleaved; $flavour = shift; $output = shift; @@ -524,6 +539,13 @@ $code.=<<___; ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + +my ($dat3,$in3,$tmp3); # used only in 64-bit mode +my ($dat4,$in4,$tmp4); +if ($flavour =~ /64/) { + ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); +} + $code.=<<___; .align 5 .Lcbc_dec: @@ -540,7 +562,196 @@ $code.=<<___; vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 vorr $in2,$dat2,$dat2 +___ +$code.=<<___ if ($flavour =~ /64/); + cmp $len,#32 + b.lo .Loop3x_cbc_dec + vld1.8 {$dat3},[$inp],#16 + vld1.8 {$dat4},[$inp],#16 + sub $len,$len,#32 // bias + mov $cnt,$rounds + vorr $in3,$dat3,$dat3 + vorr $in4,$dat4,$dat4 + +.Loop5x_cbc_dec: + aesd $dat0,q8 + aesimc $dat0,$dat0 + aesd $dat1,q8 + aesimc $dat1,$dat1 + aesd $dat2,q8 + aesimc $dat2,$dat2 + aesd $dat3,q8 + aesimc $dat3,$dat3 + aesd $dat4,q8 + aesimc $dat4,$dat4 + vld1.32 {q8},[$key_],#16 + subs $cnt,$cnt,#2 + aesd $dat0,q9 + aesimc $dat0,$dat0 + aesd $dat1,q9 + aesimc $dat1,$dat1 + aesd $dat2,q9 + aesimc $dat2,$dat2 + aesd $dat3,q9 + aesimc $dat3,$dat3 + aesd $dat4,q9 + aesimc $dat4,$dat4 + vld1.32 {q9},[$key_],#16 + b.gt .Loop5x_cbc_dec + + aesd $dat0,q8 + aesimc $dat0,$dat0 + aesd $dat1,q8 + aesimc $dat1,$dat1 + aesd $dat2,q8 + aesimc $dat2,$dat2 + aesd $dat3,q8 + aesimc $dat3,$dat3 + aesd $dat4,q8 + aesimc $dat4,$dat4 + cmp $len,#0x40 // because .Lcbc_tail4x + sub $len,$len,#0x50 + + aesd $dat0,q9 + aesimc $dat0,$dat0 + aesd $dat1,q9 + aesimc $dat1,$dat1 + aesd $dat2,q9 + aesimc $dat2,$dat2 + aesd $dat3,q9 + aesimc $dat3,$dat3 + aesd $dat4,q9 + aesimc $dat4,$dat4 + csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo + mov $key_,$key + + aesd $dat0,q10 + aesimc $dat0,$dat0 + aesd $dat1,q10 + aesimc $dat1,$dat1 + aesd $dat2,q10 + aesimc $dat2,$dat2 + aesd $dat3,q10 + aesimc $dat3,$dat3 + aesd $dat4,q10 + aesimc $dat4,$dat4 + add $inp,$inp,x6 // $inp is adjusted in such way that + // at exit from the loop $dat1-$dat4 + // are loaded with last "words" + add x6,$len,#0x60 // because .Lcbc_tail4x + + aesd $dat0,q11 + aesimc $dat0,$dat0 + aesd $dat1,q11 + aesimc $dat1,$dat1 + aesd $dat2,q11 + aesimc $dat2,$dat2 + aesd $dat3,q11 + aesimc $dat3,$dat3 + aesd $dat4,q11 + aesimc $dat4,$dat4 + + aesd $dat0,q12 + aesimc $dat0,$dat0 + aesd $dat1,q12 + aesimc $dat1,$dat1 + aesd $dat2,q12 + aesimc $dat2,$dat2 + aesd $dat3,q12 + aesimc $dat3,$dat3 + aesd $dat4,q12 + aesimc $dat4,$dat4 + + aesd $dat0,q13 + aesimc $dat0,$dat0 + aesd $dat1,q13 + aesimc $dat1,$dat1 + aesd $dat2,q13 + aesimc $dat2,$dat2 + aesd $dat3,q13 + aesimc $dat3,$dat3 + aesd $dat4,q13 + aesimc $dat4,$dat4 + + aesd $dat0,q14 + aesimc $dat0,$dat0 + aesd $dat1,q14 + aesimc $dat1,$dat1 + aesd $dat2,q14 + aesimc $dat2,$dat2 + aesd $dat3,q14 + aesimc $dat3,$dat3 + aesd $dat4,q14 + aesimc $dat4,$dat4 + + veor $tmp0,$ivec,$rndlast + aesd $dat0,q15 + veor $tmp1,$in0,$rndlast + vld1.8 {$in0},[$inp],#16 + aesd $dat1,q15 + veor $tmp2,$in1,$rndlast + vld1.8 {$in1},[$inp],#16 + aesd $dat2,q15 + veor $tmp3,$in2,$rndlast + vld1.8 {$in2},[$inp],#16 + aesd $dat3,q15 + veor $tmp4,$in3,$rndlast + vld1.8 {$in3},[$inp],#16 + aesd $dat4,q15 + vorr $ivec,$in4,$in4 + vld1.8 {$in4},[$inp],#16 + cbz x6,.Lcbc_tail4x + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + veor $tmp0,$tmp0,$dat0 + vorr $dat0,$in0,$in0 + veor $tmp1,$tmp1,$dat1 + vorr $dat1,$in1,$in1 + veor $tmp2,$tmp2,$dat2 + vorr $dat2,$in2,$in2 + veor $tmp3,$tmp3,$dat3 + vorr $dat3,$in3,$in3 + veor $tmp4,$tmp4,$dat4 + vst1.8 {$tmp0},[$out],#16 + vorr $dat4,$in4,$in4 + vst1.8 {$tmp1},[$out],#16 + mov $cnt,$rounds + vst1.8 {$tmp2},[$out],#16 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$tmp3},[$out],#16 + vst1.8 {$tmp4},[$out],#16 + b.hs .Loop5x_cbc_dec + + add $len,$len,#0x50 + cbz $len,.Lcbc_done + + add $cnt,$rounds,#2 + subs $len,$len,#0x30 + vorr $dat0,$in2,$in2 + vorr $in0,$in2,$in2 + vorr $dat1,$in3,$in3 + vorr $in1,$in3,$in3 + vorr $dat2,$in4,$in4 + vorr $in2,$in4,$in4 + b.lo .Lcbc_dec_tail + + b .Loop3x_cbc_dec + +.align 4 +.Lcbc_tail4x: + veor $tmp1,$tmp0,$dat1 + veor $tmp2,$tmp2,$dat2 + veor $tmp3,$tmp3,$dat3 + veor $tmp4,$tmp4,$dat4 + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$tmp2},[$out],#16 + vst1.8 {$tmp3},[$out],#16 + vst1.8 {$tmp4},[$out],#16 + + b .Lcbc_done +.align 4 +___ +$code.=<<___; .Loop3x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 @@ -701,6 +912,9 @@ my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); +# used only in 64-bit mode... +my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); + my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule @@ -753,6 +967,175 @@ $code.=<<___; rev $tctr2, $ctr sub $len,$len,#3 // bias vmov.32 ${dat2}[3],$tctr2 +___ +$code.=<<___ if ($flavour =~ /64/); + cmp $len,#2 + b.lo .Loop3x_ctr32 + + add w13,$ctr,#1 + add w14,$ctr,#2 + vorr $dat3,$dat0,$dat0 + rev w13,w13 + vorr $dat4,$dat0,$dat0 + rev w14,w14 + vmov.32 ${dat3}[3],w13 + sub $len,$len,#2 // bias + vmov.32 ${dat4}[3],w14 + add $ctr,$ctr,#2 + b .Loop5x_ctr32 + +.align 4 +.Loop5x_ctr32: + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + aese $dat2,q8 + aesmc $dat2,$dat2 + aese $dat3,q8 + aesmc $dat3,$dat3 + aese $dat4,q8 + aesmc $dat4,$dat4 + vld1.32 {q8},[$key_],#16 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + aese $dat2,q9 + aesmc $dat2,$dat2 + aese $dat3,q9 + aesmc $dat3,$dat3 + aese $dat4,q9 + aesmc $dat4,$dat4 + vld1.32 {q9},[$key_],#16 + b.gt .Loop5x_ctr32 + + mov $key_,$key + aese $dat0,q8 + aesmc $dat0,$dat0 + aese $dat1,q8 + aesmc $dat1,$dat1 + aese $dat2,q8 + aesmc $dat2,$dat2 + aese $dat3,q8 + aesmc $dat3,$dat3 + aese $dat4,q8 + aesmc $dat4,$dat4 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + + aese $dat0,q9 + aesmc $dat0,$dat0 + aese $dat1,q9 + aesmc $dat1,$dat1 + aese $dat2,q9 + aesmc $dat2,$dat2 + aese $dat3,q9 + aesmc $dat3,$dat3 + aese $dat4,q9 + aesmc $dat4,$dat4 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + + aese $dat0,q12 + aesmc $dat0,$dat0 + add $tctr0,$ctr,#1 + add $tctr1,$ctr,#2 + aese $dat1,q12 + aesmc $dat1,$dat1 + add $tctr2,$ctr,#3 + add w13,$ctr,#4 + aese $dat2,q12 + aesmc $dat2,$dat2 + add w14,$ctr,#5 + rev $tctr0,$tctr0 + aese $dat3,q12 + aesmc $dat3,$dat3 + rev $tctr1,$tctr1 + rev $tctr2,$tctr2 + aese $dat4,q12 + aesmc $dat4,$dat4 + rev w13,w13 + rev w14,w14 + + aese $dat0,q13 + aesmc $dat0,$dat0 + aese $dat1,q13 + aesmc $dat1,$dat1 + aese $dat2,q13 + aesmc $dat2,$dat2 + aese $dat3,q13 + aesmc $dat3,$dat3 + aese $dat4,q13 + aesmc $dat4,$dat4 + + aese $dat0,q14 + aesmc $dat0,$dat0 + vld1.8 {$in0},[$inp],#16 + aese $dat1,q14 + aesmc $dat1,$dat1 + vld1.8 {$in1},[$inp],#16 + aese $dat2,q14 + aesmc $dat2,$dat2 + vld1.8 {$in2},[$inp],#16 + aese $dat3,q14 + aesmc $dat3,$dat3 + vld1.8 {$in3},[$inp],#16 + aese $dat4,q14 + aesmc $dat4,$dat4 + vld1.8 {$in4},[$inp],#16 + + aese $dat0,q15 + veor $in0,$in0,$rndlast + aese $dat1,q15 + veor $in1,$in1,$rndlast + aese $dat2,q15 + veor $in2,$in2,$rndlast + aese $dat3,q15 + veor $in3,$in3,$rndlast + aese $dat4,q15 + veor $in4,$in4,$rndlast + + veor $in0,$in0,$dat0 + vorr $dat0,$ivec,$ivec + veor $in1,$in1,$dat1 + vorr $dat1,$ivec,$ivec + veor $in2,$in2,$dat2 + vorr $dat2,$ivec,$ivec + veor $in3,$in3,$dat3 + vorr $dat3,$ivec,$ivec + veor $in4,$in4,$dat4 + vorr $dat4,$ivec,$ivec + + vst1.8 {$in0},[$out],#16 + vmov.32 ${dat0}[3],$tctr0 + vst1.8 {$in1},[$out],#16 + vmov.32 ${dat1}[3],$tctr1 + vst1.8 {$in2},[$out],#16 + vmov.32 ${dat2}[3],$tctr2 + vst1.8 {$in3},[$out],#16 + vmov.32 ${dat3}[3],w13 + vst1.8 {$in4},[$out],#16 + vmov.32 ${dat4}[3],w14 + + mov $cnt,$rounds + cbz $len,.Lctr32_done + + add $ctr,$ctr,#5 + subs $len,$len,#5 + b.hs .Loop5x_ctr32 + + add $len,$len,#5 + sub $ctr,$ctr,#5 + + cmp $len,#2 + mov $step,#16 + cclr $step,lo + b.ls .Lctr32_tail + + sub $len,$len,#3 // bias + add $ctr,$ctr,#3 +___ +$code.=<<___; b .Loop3x_ctr32 .align 4