diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index ef7c74798d..2498a1d646 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -423,10 +423,7 @@ gcm_ghash_v8_4x:
 #ifndef __ARMEB__
 	vrev64.8	$Xl,$Xl
 #endif
-	b	.Loop4x
 
-.align	4
-.Loop4x:
 	vld1.64		{$I0-$j3},[$inp],#64
 #ifndef __ARMEB__
 	vrev64.8	$j1,$j1
@@ -461,6 +458,13 @@ gcm_ghash_v8_4x:
 	veor		$Yh,$Yh,$I1
 	veor		$Ym,$Ym,$j1
 
+	subs		$len,$len,#64
+	b.eq		.Ltail4x
+
+	b	.Loop4x
+
+.align	4
+.Loop4x:
 	veor		$t0,$I0,$Xl
 	vext.8		$IN,$t0,$t0,#8
 
@@ -489,9 +493,72 @@ gcm_ghash_v8_4x:
 	veor		$Xl,$Xl,$t2
 	vext.8		$Xl,$Xl,$Xl,#8
 
+	vld1.64		{$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+	vrev64.8	$j1,$j1
+	vrev64.8	$j2,$j2
+	vrev64.8	$j3,$j3
+	vrev64.8	$I0,$I0
+#endif
+	vext.8		$I3,$j3,$j3,#8
+	vext.8		$I2,$j2,$j2,#8
+	vext.8		$I1,$j1,$j1,#8
+
+	vpmull.p64	$Yl,$H,$I3		@ H·Ii+3
+	veor		$j3,$j3,$I3
+	vpmull2.p64	$Yh,$H,$I3
+	vpmull.p64	$Ym,$Hhl,$j3
+
+	vpmull.p64	$t0,$H2,$I2		@ H^2·Ii+2
+	veor		$j2,$j2,$I2
+	vpmull2.p64	$I2,$H2,$I2
+	vpmull2.p64	$j2,$Hhl,$j2
+
+	veor		$Yl,$Yl,$t0
+	veor		$Yh,$Yh,$I2
+	veor		$Ym,$Ym,$j2
+
+	vpmull.p64	$j3,$H3,$I1		@ H^3·Ii+1
+	veor		$j1,$j1,$I1
+	vpmull2.p64	$I1,$H3,$I1
+	vpmull.p64	$j1,$H34,$j1
+
+	veor		$Yl,$Yl,$j3
+	veor		$Yh,$Yh,$I1
+	veor		$Ym,$Ym,$j1
+
 	subs		$len,$len,#64
 	b.ne		.Loop4x
 
+.Ltail4x:
+	veor		$t0,$I0,$Xl
+	vext.8		$IN,$t0,$t0,#8
+
+	vpmull.p64	$Xl,$H4,$IN		@ H^4·(Xi+Ii)
+	veor		$t0,$t0,$IN
+	vpmull2.p64	$Xh,$H4,$IN
+	vpmull2.p64	$Xm,$H34,$t0
+
+	veor		$Xl,$Xl,$Yl
+	veor		$Xh,$Xh,$Yh
+	veor		$Xm,$Xm,$Ym
+
+	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
+	veor		$t2,$Xl,$Xh
+	veor		$Xm,$Xm,$t1
+	veor		$Xm,$Xm,$t2
+
+	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction
+	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
+	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
+	veor		$Xl,$Xm,$t2
+
+	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
+	vpmull.p64	$Xl,$Xl,$xC2
+	veor		$t2,$t2,$Xh
+	veor		$Xl,$Xl,$t2
+	vext.8		$Xl,$Xl,$Xl,#8
+
 #ifndef __ARMEB__
 	vrev64.8	$Xl,$Xl
 #endif
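
The diff above software-pipelines gcm_ghash_v8_4x: the first 64-byte group's H..H^4 partial products are now computed before entering .Loop4x, each loop iteration folds the previously prepared products into $Xl and reduces while it loads and pre-multiplies the next 64 bytes, and the new .Ltail4x epilogue finishes the final group without reading past the end of the input. The C program below is a minimal sketch of that control-flow change only, not OpenSSL code: premul() and fold() are hypothetical stand-ins (toy arithmetic) for the per-block multiplications and the combine-and-reduce step, and the input length is assumed to be a nonzero multiple of 64.

/* Sketch of the modulo-scheduled loop structure; the block math is a
 * toy stand-in, not GHASH. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for computing the partial products of one 64-byte group. */
static uint64_t premul(const uint8_t blk[64])
{
    uint64_t y = 0;
    for (size_t i = 0; i < 64; i++)
        y = (y << 1 | y >> 63) ^ blk[i];          /* toy mixing */
    return y;
}

/* Stand-in for folding the prepared products into Xl and reducing. */
static uint64_t fold(uint64_t Xl, uint64_t Y)
{
    return (Xl ^ Y) * 0x9E3779B97F4A7C15ULL;       /* toy reduction */
}

/* Mirrors the new structure: preamble prepares group 0, the loop
 * finishes group i while preparing group i+1, the tail finishes the
 * last group.  len must be a nonzero multiple of 64. */
static uint64_t ghash4x_like(uint64_t Xl, const uint8_t *inp, size_t len)
{
    uint64_t Y = premul(inp);          /* preamble: first 64 bytes      */
    inp += 64;
    len -= 64;                         /* subs $len,$len,#64            */
    while (len != 0) {                 /* .Loop4x                       */
        Xl = fold(Xl, Y);              /* finish previously loaded group */
        Y = premul(inp);               /* load + pre-multiply next group */
        inp += 64;
        len -= 64;
    }
    return fold(Xl, Y);                /* .Ltail4x: final group         */
}

int main(void)
{
    uint8_t buf[256];
    for (size_t i = 0; i < sizeof(buf); i++)
        buf[i] = (uint8_t)(i * 7 + 1);
    printf("%016llx\n",
           (unsigned long long)ghash4x_like(0, buf, sizeof(buf)));
    return 0;
}

The point of the restructuring is the same as in the assembly: the loads and vpmull work for the next group overlap with the reduction of the current one, and the data-free .Ltail4x path lets the loop run without a trailing over-read.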