poly1305/asm/poly1305-x86_64.pl: add poly1305_blocks_vpmadd52_4x.
As hinted by its name, the new subroutine processes 4 input blocks in parallel. It still operates on 256-bit registers and is just another step toward a full-blown AVX512IFMA procedure. Reviewed-by: Rich Salz <rsalz@openssl.org>
This commit is contained in:
parent
a25cef89fd
commit
c2b935904a
1 changed files with 474 additions and 2 deletions
|
@ -2716,6 +2716,17 @@ if ($avx>3) {
|
|||
# path longer. In other words, even though base 2^44 reduction might
|
||||
# look less elegant, overall critical path is actually shorter...
|
||||
|
||||
########################################################################
|
||||
# The layout of the opaque area is as follows.
|
||||
#
|
||||
# unsigned __int64 h[3]; # current hash value base 2^44
|
||||
# unsigned __int64 s[2]; # key value*20 base 2^44
|
||||
# unsigned __int64 r[3]; # key value base 2^44
|
||||
# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
|
||||
# # r^n positions reflect
|
||||
# # placement in register, not
|
||||
# # memory, R[3] is R[1]*20
|
||||
|
||||
$code.=<<___;
|
||||
.type poly1305_init_base2_44,\@function,3
|
||||
.align 32
|
||||
|
@ -2748,6 +2759,7 @@ poly1305_init_base2_44:
|
|||
shl \$2,%rcx # magic <<2
|
||||
mov %rax,24($ctx) # s1
|
||||
mov %rcx,32($ctx) # s2
|
||||
movq \$-1,64($ctx) # write impossible value
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /elf32/);
|
||||
mov %r10,0(%rdx)
|
||||
|
@ -2774,11 +2786,29 @@ poly1305_blocks_vpmadd52:
|
|||
shr \$4,$len
|
||||
jz .Lno_data_vpmadd52 # too short
|
||||
|
||||
shl \$40,$padbit
|
||||
mov 64($ctx),%r8 # peek on power of the key
|
||||
|
||||
# if powers of the key are not calculated yet, process up to 3
|
||||
# blocks with this single-block subroutine, otherwise ensure that
|
||||
# length is divisible by 2 blocks and pass the rest down to next
|
||||
# subroutine...
|
||||
|
||||
mov \$3,%rax
|
||||
mov \$1,%r10
|
||||
cmp \$4,$len # is input long
|
||||
cmovae %r10,%rax
|
||||
test %r8,%r8 # is power value impossible?
|
||||
cmovns %r10,%rax
|
||||
|
||||
and $len,%rax # is input of favourable length?
|
||||
jz .Lblocks_vpmadd52_4x
|
||||
|
||||
sub %rax,$len
|
||||
mov \$7,%r10d
|
||||
mov \$1,%r11d
|
||||
kmovw %r10d,%k7
|
||||
lea .L2_44_inp_permd(%rip),%r10
|
||||
shl \$40,$padbit
|
||||
kmovw %r11d,%k1
|
||||
|
||||
vmovq $padbit,%x#$PAD
|
||||
|
@ -2849,16 +2879,451 @@ poly1305_blocks_vpmadd52:
|
|||
|
||||
vpaddq $T0,$Dlo,$Dlo
|
||||
|
||||
dec $len # len-=16
|
||||
dec %rax # one fewer block left for this single-block loop
|
||||
jnz .Loop_vpmadd52
|
||||
|
||||
vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
|
||||
|
||||
test $len,$len
|
||||
jnz .Lblocks_vpmadd52_4x
|
||||
|
||||
.Lno_data_vpmadd52:
|
||||
ret
|
||||
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
|
||||
___
|
||||
}
|
||||
{
|
||||
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
|
||||
my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
|
||||
my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
|
||||
|
||||
$code.=<<___;
|
||||
.type poly1305_blocks_vpmadd52_4x,\@function,4
|
||||
.align 32
|
||||
poly1305_blocks_vpmadd52_4x:
|
||||
shr \$4,$len
|
||||
jz .Lno_data_vpmadd52_4x # too short
|
||||
|
||||
shl \$40,$padbit
|
||||
mov 64($ctx),%r8 # peek on power of the key
|
||||
|
||||
.Lblocks_vpmadd52_4x:
|
||||
vpbroadcastq $padbit,$PAD
|
||||
|
||||
vmovdqa64 .Lx_mask44(%rip),$mask44
|
||||
mov \$5,%eax
|
||||
vmovdqa64 .Lx_mask42(%rip),$mask42
|
||||
kmovw %eax,%k1 # used in 2x path
|
||||
|
||||
test %r8,%r8 # is power value impossible?
|
||||
js .Linit_vpmadd52 # if it is, then init R[4]
|
||||
|
||||
vmovq 0($ctx),%x#$H0 # load current hash value
|
||||
vmovq 8($ctx),%x#$H1
|
||||
vmovq 16($ctx),%x#$H2
|
||||
|
||||
test \$3,$len # is length 4*n+2?
|
||||
jnz .Lblocks_vpmadd52_2x_do
|
||||
|
||||
.Lblocks_vpmadd52_4x_do:
|
||||
vpbroadcastq 64($ctx),$R0 # load 4th power of the key
|
||||
vpbroadcastq 96($ctx),$R1
|
||||
vpbroadcastq 128($ctx),$R2
|
||||
vpbroadcastq 160($ctx),$S1
|
||||
|
||||
.Lblocks_vpmadd52_4x_key_loaded:
|
||||
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
|
||||
vpaddq $R2,$S2,$S2
|
||||
vpsllq \$2,$S2,$S2
|
||||
|
||||
vmovdqu64 16*0($inp),$T2 # load data
|
||||
vmovdqu64 16*2($inp),$T3
|
||||
lea 16*4($inp),$inp
|
||||
|
||||
vpunpcklqdq $T3,$T2,$T1 # transpose data
|
||||
vpunpckhqdq $T3,$T2,$T3
|
||||
|
||||
# at this point 64-bit lanes are ordered as 3-1-2-0
|
||||
|
||||
vpsrlq \$24,$T3,$T2 # splat the data
|
||||
vporq $PAD,$T2,$T2
|
||||
vpaddq $T2,$H2,$H2 # accumulate input
|
||||
vpandq $mask44,$T1,$T0
|
||||
vpsrlq \$44,$T1,$T1
|
||||
vpsllq \$20,$T3,$T3
|
||||
vporq $T3,$T1,$T1
|
||||
vpandq $mask44,$T1,$T1
|
||||
|
||||
sub \$4,$len
|
||||
jz .Ltail_vpmadd52_4x
|
||||
jmp .Loop_vpmadd52_4x
|
||||
ud2
|
||||
|
||||
.align 32
|
||||
.Linit_vpmadd52:
|
||||
vmovq 24($ctx),%x#$S1 # load key
|
||||
vmovq 56($ctx),%x#$H2
|
||||
vmovq 32($ctx),%x#$S2
|
||||
vmovq 40($ctx),%x#$R0
|
||||
vmovq 48($ctx),%x#$R1
|
||||
|
||||
vmovdqa $R0,$H0
|
||||
vmovdqa $R1,$H1
|
||||
vmovdqa $H2,$R2
|
||||
|
||||
mov \$2,%eax
|
||||
|
||||
.Lmul_init_vpmadd52:
|
||||
vpxorq $D0lo,$D0lo,$D0lo
|
||||
vpmadd52luq $H2,$S1,$D0lo
|
||||
vpxorq $D0hi,$D0hi,$D0hi
|
||||
vpmadd52huq $H2,$S1,$D0hi
|
||||
vpxorq $D1lo,$D1lo,$D1lo
|
||||
vpmadd52luq $H2,$S2,$D1lo
|
||||
vpxorq $D1hi,$D1hi,$D1hi
|
||||
vpmadd52huq $H2,$S2,$D1hi
|
||||
vpxorq $D2lo,$D2lo,$D2lo
|
||||
vpmadd52luq $H2,$R0,$D2lo
|
||||
vpxorq $D2hi,$D2hi,$D2hi
|
||||
vpmadd52huq $H2,$R0,$D2hi
|
||||
|
||||
vpmadd52luq $H0,$R0,$D0lo
|
||||
vpmadd52huq $H0,$R0,$D0hi
|
||||
vpmadd52luq $H0,$R1,$D1lo
|
||||
vpmadd52huq $H0,$R1,$D1hi
|
||||
vpmadd52luq $H0,$R2,$D2lo
|
||||
vpmadd52huq $H0,$R2,$D2hi
|
||||
|
||||
vpmadd52luq $H1,$S2,$D0lo
|
||||
vpmadd52huq $H1,$S2,$D0hi
|
||||
vpmadd52luq $H1,$R0,$D1lo
|
||||
vpmadd52huq $H1,$R0,$D1hi
|
||||
vpmadd52luq $H1,$R1,$D2lo
|
||||
vpmadd52huq $H1,$R1,$D2hi
|
||||
|
||||
################################################################
|
||||
# partial reduction
|
||||
vpsrlq \$44,$D0lo,$tmp
|
||||
vpsllq \$8,$D0hi,$D0hi
|
||||
vpandq $mask44,$D0lo,$H0
|
||||
vpaddq $tmp,$D0hi,$D0hi
|
||||
|
||||
vpaddq $D0hi,$D1lo,$D1lo
|
||||
|
||||
vpsrlq \$44,$D1lo,$tmp
|
||||
vpsllq \$8,$D1hi,$D1hi
|
||||
vpandq $mask44,$D1lo,$H1
|
||||
vpaddq $tmp,$D1hi,$D1hi
|
||||
|
||||
vpaddq $D1hi,$D2lo,$D2lo
|
||||
|
||||
vpsrlq \$42,$D2lo,$tmp
|
||||
vpsllq \$10,$D2hi,$D2hi
|
||||
vpandq $mask42,$D2lo,$H2
|
||||
vpaddq $tmp,$D2hi,$D2hi
|
||||
|
||||
vpaddq $D2hi,$H0,$H0
|
||||
vpsllq \$2,$D2hi,$D2hi
|
||||
|
||||
vpaddq $D2hi,$H0,$H0
|
||||
|
||||
vpsrlq \$44,$H0,$tmp # additional step
|
||||
vpandq $mask44,$H0,$H0
|
||||
|
||||
vpaddq $tmp,$H1,$H1
|
||||
|
||||
dec %eax
|
||||
jz .Ldone_init_vpmadd52
|
||||
|
||||
vpunpcklqdq $R1,$H1,$R1 # 1,2
|
||||
vpbroadcastq %x#$H1,%x#$H1 # 2,2
|
||||
vpunpcklqdq $R2,$H2,$R2
|
||||
vpbroadcastq %x#$H2,%x#$H2
|
||||
vpunpcklqdq $R0,$H0,$R0
|
||||
vpbroadcastq %x#$H0,%x#$H0
|
||||
|
||||
vpsllq \$2,$R1,$S1 # S1 = R1*5*4
|
||||
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
|
||||
vpaddq $R1,$S1,$S1
|
||||
vpaddq $R2,$S2,$S2
|
||||
vpsllq \$2,$S1,$S1
|
||||
vpsllq \$2,$S2,$S2
|
||||
|
||||
jmp .Lmul_init_vpmadd52
|
||||
ud2
|
||||
|
||||
.align 32
|
||||
.Ldone_init_vpmadd52:
|
||||
vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
|
||||
vinserti128 \$1,%x#$R2,$H2,$R2
|
||||
vinserti128 \$1,%x#$R0,$H0,$R0
|
||||
|
||||
vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
|
||||
vpermq \$0b11011000,$R2,$R2
|
||||
vpermq \$0b11011000,$R0,$R0
|
||||
|
||||
vpsllq \$2,$R1,$S1 # S1 = R1*5*4
|
||||
vpaddq $R1,$S1,$S1
|
||||
vpsllq \$2,$S1,$S1
|
||||
|
||||
vmovq 0($ctx),%x#$H0 # load current hash value
|
||||
vmovq 8($ctx),%x#$H1
|
||||
vmovq 16($ctx),%x#$H2
|
||||
|
||||
test \$3,$len # is length 4*n+2?
|
||||
jnz .Ldone_init_vpmadd52_2x
|
||||
|
||||
vmovdqu64 $R0,64($ctx) # save key powers
|
||||
vpbroadcastq %x#$R0,$R0 # broadcast 4th power
|
||||
vmovdqu64 $R1,96($ctx)
|
||||
vpbroadcastq %x#$R1,$R1
|
||||
vmovdqu64 $R2,128($ctx)
|
||||
vpbroadcastq %x#$R2,$R2
|
||||
vmovdqu64 $S1,160($ctx)
|
||||
vpbroadcastq %x#$S1,$S1
|
||||
|
||||
jmp .Lblocks_vpmadd52_4x_key_loaded
|
||||
ud2
|
||||
|
||||
.align 32
|
||||
.Ldone_init_vpmadd52_2x:
|
||||
vmovdqu64 $R0,64($ctx) # save key powers
|
||||
vpsrldq \$8,$R0,$R0 # 0-1-0-2
|
||||
vmovdqu64 $R1,96($ctx)
|
||||
vpsrldq \$8,$R1,$R1
|
||||
vmovdqu64 $R2,128($ctx)
|
||||
vpsrldq \$8,$R2,$R2
|
||||
vmovdqu64 $S1,160($ctx)
|
||||
vpsrldq \$8,$S1,$S1
|
||||
jmp .Lblocks_vpmadd52_2x_key_loaded
|
||||
ud2
|
||||
|
||||
.align 32
|
||||
.Lblocks_vpmadd52_2x_do:
|
||||
vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
|
||||
vmovdqu64 160+8($ctx),${S1}{%k1}{z}
|
||||
vmovdqu64 64+8($ctx),${R0}{%k1}{z}
|
||||
vmovdqu64 96+8($ctx),${R1}{%k1}{z}
|
||||
|
||||
.Lblocks_vpmadd52_2x_key_loaded:
|
||||
vmovdqu64 16*0($inp),$T2 # load data
|
||||
vpxorq $T3,$T3,$T3
|
||||
lea 16*2($inp),$inp
|
||||
|
||||
vpunpcklqdq $T3,$T2,$T1 # transpose data
|
||||
vpunpckhqdq $T3,$T2,$T3
|
||||
|
||||
# at this point 64-bit lanes are ordered as x-1-x-0
|
||||
|
||||
vpsrlq \$24,$T3,$T2 # splat the data
|
||||
vporq $PAD,$T2,$T2
|
||||
vpaddq $T2,$H2,$H2 # accumulate input
|
||||
vpandq $mask44,$T1,$T0
|
||||
vpsrlq \$44,$T1,$T1
|
||||
vpsllq \$20,$T3,$T3
|
||||
vporq $T3,$T1,$T1
|
||||
vpandq $mask44,$T1,$T1
|
||||
|
||||
jmp .Ltail_vpmadd52_2x
|
||||
ud2
|
||||
|
||||
.align 32
|
||||
.Loop_vpmadd52_4x:
|
||||
#vpaddq $T2,$H2,$H2 # accumulate input
|
||||
vpaddq $T0,$H0,$H0
|
||||
vpaddq $T1,$H1,$H1
|
||||
|
||||
vpxorq $D0lo,$D0lo,$D0lo
|
||||
vpmadd52luq $H2,$S1,$D0lo
|
||||
vpxorq $D0hi,$D0hi,$D0hi
|
||||
vpmadd52huq $H2,$S1,$D0hi
|
||||
vpxorq $D1lo,$D1lo,$D1lo
|
||||
vpmadd52luq $H2,$S2,$D1lo
|
||||
vpxorq $D1hi,$D1hi,$D1hi
|
||||
vpmadd52huq $H2,$S2,$D1hi
|
||||
vpxorq $D2lo,$D2lo,$D2lo
|
||||
vpmadd52luq $H2,$R0,$D2lo
|
||||
vpxorq $D2hi,$D2hi,$D2hi
|
||||
vpmadd52huq $H2,$R0,$D2hi
|
||||
|
||||
vmovdqu64 16*0($inp),$T2 # load data
|
||||
vmovdqu64 16*2($inp),$T3
|
||||
lea 16*4($inp),$inp
|
||||
vpmadd52luq $H0,$R0,$D0lo
|
||||
vpmadd52huq $H0,$R0,$D0hi
|
||||
vpmadd52luq $H0,$R1,$D1lo
|
||||
vpmadd52huq $H0,$R1,$D1hi
|
||||
vpmadd52luq $H0,$R2,$D2lo
|
||||
vpmadd52huq $H0,$R2,$D2hi
|
||||
|
||||
vpunpcklqdq $T3,$T2,$T1 # transpose data
|
||||
vpunpckhqdq $T3,$T2,$T3
|
||||
vpmadd52luq $H1,$S2,$D0lo
|
||||
vpmadd52huq $H1,$S2,$D0hi
|
||||
vpmadd52luq $H1,$R0,$D1lo
|
||||
vpmadd52huq $H1,$R0,$D1hi
|
||||
vpmadd52luq $H1,$R1,$D2lo
|
||||
vpmadd52huq $H1,$R1,$D2hi
|
||||
|
||||
################################################################
|
||||
# partial reduction (interleaved with data splat)
|
||||
vpsrlq \$44,$D0lo,$tmp
|
||||
vpsllq \$8,$D0hi,$D0hi
|
||||
vpandq $mask44,$D0lo,$H0
|
||||
vpaddq $tmp,$D0hi,$D0hi
|
||||
|
||||
vpsrlq \$24,$T3,$T2
|
||||
vporq $PAD,$T2,$T2
|
||||
vpaddq $D0hi,$D1lo,$D1lo
|
||||
|
||||
vpsrlq \$44,$D1lo,$tmp
|
||||
vpsllq \$8,$D1hi,$D1hi
|
||||
vpandq $mask44,$D1lo,$H1
|
||||
vpaddq $tmp,$D1hi,$D1hi
|
||||
|
||||
vpandq $mask44,$T1,$T0
|
||||
vpsrlq \$44,$T1,$T1
|
||||
vpsllq \$20,$T3,$T3
|
||||
vpaddq $D1hi,$D2lo,$D2lo
|
||||
|
||||
vpsrlq \$42,$D2lo,$tmp
|
||||
vpsllq \$10,$D2hi,$D2hi
|
||||
vpandq $mask42,$D2lo,$H2
|
||||
vpaddq $tmp,$D2hi,$D2hi
|
||||
|
||||
vpaddq $T2,$H2,$H2 # accumulate input
|
||||
vpaddq $D2hi,$H0,$H0
|
||||
vpsllq \$2,$D2hi,$D2hi
|
||||
|
||||
vpaddq $D2hi,$H0,$H0
|
||||
vporq $T3,$T1,$T1
|
||||
vpandq $mask44,$T1,$T1
|
||||
|
||||
vpsrlq \$44,$H0,$tmp # additional step
|
||||
vpandq $mask44,$H0,$H0
|
||||
|
||||
vpaddq $tmp,$H1,$H1
|
||||
|
||||
sub \$4,$len # len-=64
|
||||
jnz .Loop_vpmadd52_4x
|
||||
|
||||
.Ltail_vpmadd52_4x:
|
||||
vmovdqu64 128($ctx),$R2 # load all key powers
|
||||
vmovdqu64 160($ctx),$S1
|
||||
vmovdqu64 64($ctx),$R0
|
||||
vmovdqu64 96($ctx),$R1
|
||||
|
||||
.Ltail_vpmadd52_2x:
|
||||
vpsllq \$2,$R2,$S2 # S2 = R2*5*4
|
||||
vpaddq $R2,$S2,$S2
|
||||
vpsllq \$2,$S2,$S2
|
||||
|
||||
#vpaddq $T2,$H2,$H2 # accumulate input
|
||||
vpaddq $T0,$H0,$H0
|
||||
vpaddq $T1,$H1,$H1
|
||||
|
||||
vpxorq $D0lo,$D0lo,$D0lo
|
||||
vpmadd52luq $H2,$S1,$D0lo
|
||||
vpxorq $D0hi,$D0hi,$D0hi
|
||||
vpmadd52huq $H2,$S1,$D0hi
|
||||
vpxorq $D1lo,$D1lo,$D1lo
|
||||
vpmadd52luq $H2,$S2,$D1lo
|
||||
vpxorq $D1hi,$D1hi,$D1hi
|
||||
vpmadd52huq $H2,$S2,$D1hi
|
||||
vpxorq $D2lo,$D2lo,$D2lo
|
||||
vpmadd52luq $H2,$R0,$D2lo
|
||||
vpxorq $D2hi,$D2hi,$D2hi
|
||||
vpmadd52huq $H2,$R0,$D2hi
|
||||
|
||||
vpmadd52luq $H0,$R0,$D0lo
|
||||
vpmadd52huq $H0,$R0,$D0hi
|
||||
vpmadd52luq $H0,$R1,$D1lo
|
||||
vpmadd52huq $H0,$R1,$D1hi
|
||||
vpmadd52luq $H0,$R2,$D2lo
|
||||
vpmadd52huq $H0,$R2,$D2hi
|
||||
|
||||
vpmadd52luq $H1,$S2,$D0lo
|
||||
vpmadd52huq $H1,$S2,$D0hi
|
||||
vpmadd52luq $H1,$R0,$D1lo
|
||||
vpmadd52huq $H1,$R0,$D1hi
|
||||
vpmadd52luq $H1,$R1,$D2lo
|
||||
vpmadd52huq $H1,$R1,$D2hi
|
||||
|
||||
################################################################
|
||||
# horizontal addition
|
||||
|
||||
mov \$1,%eax
|
||||
kmovw %eax,%k1
|
||||
vpsrldq \$8,$D0lo,$T0
|
||||
vpsrldq \$8,$D0hi,$H0
|
||||
vpsrldq \$8,$D1lo,$T1
|
||||
vpsrldq \$8,$D1hi,$H1
|
||||
vpaddq $T0,$D0lo,$D0lo
|
||||
vpaddq $H0,$D0hi,$D0hi
|
||||
vpsrldq \$8,$D2lo,$T2
|
||||
vpsrldq \$8,$D2hi,$H2
|
||||
vpaddq $T1,$D1lo,$D1lo
|
||||
vpaddq $H1,$D1hi,$D1hi
|
||||
vpermq \$0x2,$D0lo,$T0
|
||||
vpermq \$0x2,$D0hi,$H0
|
||||
vpaddq $T2,$D2lo,$D2lo
|
||||
vpaddq $H2,$D2hi,$D2hi
|
||||
|
||||
vpermq \$0x2,$D1lo,$T1
|
||||
vpermq \$0x2,$D1hi,$H1
|
||||
vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
|
||||
vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
|
||||
vpermq \$0x2,$D2lo,$T2
|
||||
vpermq \$0x2,$D2hi,$H2
|
||||
vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
|
||||
vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
|
||||
vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
|
||||
vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
|
||||
|
||||
################################################################
|
||||
# partial reduction
|
||||
vpsrlq \$44,$D0lo,$tmp
|
||||
vpsllq \$8,$D0hi,$D0hi
|
||||
vpandq $mask44,$D0lo,$H0
|
||||
vpaddq $tmp,$D0hi,$D0hi
|
||||
|
||||
vpaddq $D0hi,$D1lo,$D1lo
|
||||
|
||||
vpsrlq \$44,$D1lo,$tmp
|
||||
vpsllq \$8,$D1hi,$D1hi
|
||||
vpandq $mask44,$D1lo,$H1
|
||||
vpaddq $tmp,$D1hi,$D1hi
|
||||
|
||||
vpaddq $D1hi,$D2lo,$D2lo
|
||||
|
||||
vpsrlq \$42,$D2lo,$tmp
|
||||
vpsllq \$10,$D2hi,$D2hi
|
||||
vpandq $mask42,$D2lo,$H2
|
||||
vpaddq $tmp,$D2hi,$D2hi
|
||||
|
||||
vpaddq $D2hi,$H0,$H0
|
||||
vpsllq \$2,$D2hi,$D2hi
|
||||
|
||||
vpaddq $D2hi,$H0,$H0
|
||||
|
||||
vpsrlq \$44,$H0,$tmp # additional step
|
||||
vpandq $mask44,$H0,$H0
|
||||
|
||||
vpaddq $tmp,$H1,$H1
|
||||
# at this point $len is
|
||||
# either 4*n+2 or 0...
|
||||
sub \$2,$len # len-=32
|
||||
ja .Lblocks_vpmadd52_4x_do
|
||||
|
||||
vmovq %x#$H0,0($ctx)
|
||||
vmovq %x#$H1,8($ctx)
|
||||
vmovq %x#$H2,16($ctx)
|
||||
|
||||
.Lno_data_vpmadd52_4x:
|
||||
ret
|
||||
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.type poly1305_emit_base2_44,\@function,3
|
||||
.align 32
|
||||
|
@ -2920,6 +3385,13 @@ $code.=<<___;
|
|||
.quad 44,44,42,64
|
||||
.L2_44_shift_lft:
|
||||
.quad 8,8,10,64
|
||||
|
||||
.Lx_mask44:
|
||||
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
|
||||
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
|
||||
.Lx_mask42:
|
||||
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
|
||||
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
|
||||
___
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue