aes/asm/vpaes-ppc.pl: eliminate overhung stores in misaligned cases.

Reviewed-by: Richard Levitte <levitte@openssl.org>
(cherry picked from commit ce24d2ed23)

parent bef7693860
commit bc2ab6ed2f

1 changed file with 137 additions and 63 deletions
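For context: the "overhung" stores are the old unaligned-output pattern visible in the removed lines below, which loads a full 16-byte vector from the destination (lvx $outhead), merges the computed block in with vsel, and writes 16 bytes back with stvx. Since that trailing store always covers a whole quadword, it can read and write up to 15 bytes beyond the caller's buffer. The new code tests the low bits of the output pointer instead: a single stvx when the destination is 16-byte aligned, otherwise a counted stvebx loop that stores one byte at a time and never steps outside the buffer. A rough scalar sketch of the idea in C (the function and its names are illustrative, not from OpenSSL):

#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch only: flush one 16-byte result block to dst
 * without touching any byte outside dst[0..15].  The AltiVec code in
 * the patch makes the same split: stvx for a 16-byte-aligned dst,
 * a bdnz loop of stvebx single-byte stores otherwise.
 */
static void store_block_safely(unsigned char *dst, const unsigned char blk[16])
{
    if (((uintptr_t)dst & 15) == 0) {
        memcpy(dst, blk, 16);           /* aligned: one full-width store */
    } else {
        for (int i = 0; i < 16; i++)    /* misaligned: byte-wise stores  */
            dst[i] = blk[i];
    }
}

Because the misaligned path no longer reads the destination first, the lvx $outhead loads also disappear from the affected hunks.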
@@ -337,24 +337,27 @@ Lenc_entry:
 	addi $inp, $inp, 15 # 15 is not a typo
 	?lvsr $outperm, 0, $out
 	?lvsl $keyperm, 0, $key # prepare for unaligned access
-	vnor $outmask, v7, v7 # 0xff..ff
 	lvx $inptail, 0, $inp # redundant in aligned case
-	?vperm $outmask, v7, $outmask, $outperm
-	lvx $outhead, 0, $out
 	?vperm v0, v0, $inptail, $inpperm
 
 	bl _vpaes_encrypt_core
 
-	vperm v0, v0, v0, $outperm # rotate right/left
-	vsel v1, $outhead, v0, $outmask
-	vmr $outhead, v0
-	stvx v1, 0, $out
-	addi $out, $out, 15 # 15 is not a typo
-	########
+	andi. r8, $out, 15
+	li r9, 16
+	beq Lenc_out_aligned
 
-	lvx v1, 0, $out # redundant in aligned case
-	vsel v1, $outhead, v1, $outmask
-	stvx v1, 0, $out
+	vperm v0, v0, v0, $outperm # rotate right/left
+	mtctr r9
+Lenc_out_unaligned:
+	stvebx v0, 0, $out
+	addi $out, $out, 1
+	bdnz Lenc_out_unaligned
+	b Lenc_done
+
+.align 4
+Lenc_out_aligned:
+	stvx v0, 0, $out
+Lenc_done:
 
 	li r10,`15+6*$SIZE_T`
 	li r11,`31+6*$SIZE_T`
@@ -566,24 +569,27 @@ Ldec_entry:
 	addi $inp, $inp, 15 # 15 is not a typo
 	?lvsr $outperm, 0, $out
 	?lvsl $keyperm, 0, $key
-	vnor $outmask, v7, v7 # 0xff..ff
 	lvx $inptail, 0, $inp # redundant in aligned case
-	?vperm $outmask, v7, $outmask, $outperm
-	lvx $outhead, 0, $out
 	?vperm v0, v0, $inptail, $inpperm
 
 	bl _vpaes_decrypt_core
 
-	vperm v0, v0, v0, $outperm # rotate right/left
-	vsel v1, $outhead, v0, $outmask
-	vmr $outhead, v0
-	stvx v1, 0, $out
-	addi $out, $out, 15 # 15 is not a typo
-	########
+	andi. r8, $out, 15
+	li r9, 16
+	beq Ldec_out_aligned
 
-	lvx v1, 0, $out # redundant in aligned case
-	vsel v1, $outhead, v1, $outmask
-	stvx v1, 0, $out
+	vperm v0, v0, v0, $outperm # rotate right/left
+	mtctr r9
+Ldec_out_unaligned:
+	stvebx v0, 0, $out
+	addi $out, $out, 1
+	bdnz Ldec_out_unaligned
+	b Ldec_done
+
+.align 4
+Ldec_out_aligned:
+	stvx v0, 0, $out
+Ldec_done:
 
 	li r10,`15+6*$SIZE_T`
 	li r11,`31+6*$SIZE_T`
@@ -658,11 +664,11 @@ Ldec_entry:
 	$PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
 
 	and r30, r5, r9 # copy length&-16
+	andi. r9, $out, 15 # is $out aligned?
 	mr r5, r6 # copy pointer to key
 	mr r31, r7 # copy pointer to iv
-	blt Lcbc_abort
-	cmpwi r8, 0 # test direction
 	li r6, -1
+	mcrf cr1, cr0 # put aside $out alignment flag
 	mr r7, r12 # copy vrsave
 	mtspr 256, r6 # preserve all AltiVec registers
 
@@ -672,6 +678,7 @@ Ldec_entry:
 	lvx v25, r9, r31
 	?vperm v24, v24, v25, $inpperm
 
+	cmpwi r8, 0 # test direction
 	neg r8, $inp # prepare for unaligned access
 	vxor v7, v7, v7
 	?lvsl $keyperm, 0, $key
@@ -681,13 +688,37 @@ Ldec_entry:
 	lvx $inptail, 0, $inp
 	?vperm $outmask, v7, $outmask, $outperm
 	addi $inp, $inp, 15 # 15 is not a typo
-	lvx $outhead, 0, $out
 
 	beq Lcbc_decrypt
 
 	bl _vpaes_encrypt_preheat
 	li r0, 16
 
+	beq cr1, Lcbc_enc_loop # $out is aligned
+
+	vmr v0, $inptail
+	lvx $inptail, 0, $inp
+	addi $inp, $inp, 16
+	?vperm v0, v0, $inptail, $inpperm
+	vxor v0, v0, v24 # ^= iv
+
+	bl _vpaes_encrypt_core
+
+	andi. r8, $out, 15
+	vmr v24, v0 # put aside iv
+	sub r9, $out, r8
+	vperm $outhead, v0, v0, $outperm # rotate right/left
+
+Lcbc_enc_head:
+	stvebx $outhead, r8, r9
+	cmpwi r8, 15
+	addi r8, r8, 1
+	bne Lcbc_enc_head
+
+	sub. r30, r30, r0 # len -= 16
+	addi $out, $out, 16
+	beq Lcbc_unaligned_done
+
 Lcbc_enc_loop:
 	vmr v0, $inptail
 	lvx $inptail, 0, $inp
@@ -713,6 +744,32 @@ Lcbc_decrypt:
 	bl _vpaes_decrypt_preheat
 	li r0, 16
 
+	beq cr1, Lcbc_dec_loop # $out is aligned
+
+	vmr v0, $inptail
+	lvx $inptail, 0, $inp
+	addi $inp, $inp, 16
+	?vperm v0, v0, $inptail, $inpperm
+	vmr v25, v0 # put aside input
+
+	bl _vpaes_decrypt_core
+
+	andi. r8, $out, 15
+	vxor v0, v0, v24 # ^= iv
+	vmr v24, v25
+	sub r9, $out, r8
+	vperm $outhead, v0, v0, $outperm # rotate right/left
+
+Lcbc_dec_head:
+	stvebx $outhead, r8, r9
+	cmpwi r8, 15
+	addi r8, r8, 1
+	bne Lcbc_dec_head
+
+	sub. r30, r30, r0 # len -= 16
+	addi $out, $out, 16
+	beq Lcbc_unaligned_done
+
 Lcbc_dec_loop:
 	vmr v0, $inptail
 	lvx $inptail, 0, $inp
@@ -733,23 +790,29 @@ Lcbc_dec_loop:
 	bne Lcbc_dec_loop
 
 Lcbc_done:
-	addi $out, $out, -1
-	lvx v1, 0, $out # redundant in aligned case
-	vsel v1, $outhead, v1, $outmask
-	stvx v1, 0, $out
+	beq cr1, Lcbc_write_iv # $out is aligned
 
+Lcbc_unaligned_done:
+	andi. r8, $out, 15
+	sub $out, $out, r8
+	li r9, 0
+Lcbc_tail:
+	stvebx $outhead, r9, $out
+	addi r9, r9, 1
+	cmpw r9, r8
+	bne Lcbc_tail
+
+Lcbc_write_iv:
 	neg r8, r31 # write [potentially unaligned] iv
+	li r10, 4
 	?lvsl $outperm, 0, r8
-	li r6, 15
-	vnor $outmask, v7, v7 # 0xff..ff
-	?vperm $outmask, v7, $outmask, $outperm
-	lvx $outhead, 0, r31
+	li r11, 8
+	li r12, 12
 	vperm v24, v24, v24, $outperm # rotate right/left
-	vsel v0, $outhead, v24, $outmask
-	lvx v1, r6, r31
-	stvx v0, 0, r31
-	vsel v1, v24, v1, $outmask
-	stvx v1, r6, r31
+	stvewx v24, 0, r31 # ivp is at least 32-bit aligned
+	stvewx v24, r10, r31
+	stvewx v24, r11, r31
+	stvewx v24, r12, r31
 
 	mtspr 256, r7 # restore vrsave
 	li r10,`15+6*$SIZE_T`
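The IV write at the end of the hunk above goes with its comment that ivp is at least 32-bit aligned: instead of the old masked lvx/vsel/stvx sequence spanning two quadwords, the 16-byte IV is written as four stvewx word stores at offsets 0, 4, 8 and 12, so nothing outside the 16-byte IV is touched. The key-schedule hunks below apply the same word-store pattern to the round keys (hence the "some are superfluous" / "some (or all) are redundant" comments). A minimal C analogue (names are illustrative, not from the patch):

#include <stdint.h>

/*
 * Illustrative sketch only: write a 16-byte value through a pointer that
 * is known to be at least 4-byte aligned, as four 32-bit stores -- the
 * scalar analogue of the stvewx stores at offsets 0, 4, 8 and 12.
 */
static void store_16_as_words(uint32_t *dst, const uint32_t v[4])
{
    dst[0] = v[0];  /* offset 0  */
    dst[1] = v[1];  /* offset 4  */
    dst[2] = v[2];  /* offset 8  */
    dst[3] = v[3];  /* offset 12 */
}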
@@ -872,18 +935,21 @@ _vpaes_schedule_core:
 
 	# encrypting, output zeroth round key after transform
 	li r8, 0x30 # mov \$0x30,%r8d
-	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+	li r9, 4
+	li r10, 8
+	li r11, 12
 
 	?lvsr $outperm, 0, $out # prepare for unaligned access
 	vnor $outmask, v9, v9 # 0xff..ff
-	lvx $outhead, 0, $out
 	?vperm $outmask, v9, $outmask, $outperm
 
 	#stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
-	vperm v1, v0, v0, $outperm # rotate right/left
-	vsel v2, $outhead, v1, $outmask
-	vmr $outhead, v1
-	stvx v2, 0, $out
+	vperm $outhead, v0, v0, $outperm # rotate right/left
+	stvewx $outhead, 0, $out # some are superfluous
+	stvewx $outhead, r9, $out
+	stvewx $outhead, r10, $out
+	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+	stvewx $outhead, r11, $out
 	b Lschedule_go
 
 Lschedule_am_decrypting:
@@ -893,20 +959,24 @@ Lschedule_am_decrypting:
 	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
 	# decrypting, output zeroth round key after shiftrows
 	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
+	li r9, 4
+	li r10, 8
+	li r11, 12
 	vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
 
 	neg r0, $out # prepare for unaligned access
 	?lvsl $outperm, 0, r0
-	addi $out, $out, 15 # 15 is not typo
 	vnor $outmask, v9, v9 # 0xff..ff
-	lvx $outhead, 0, $out
 	?vperm $outmask, $outmask, v9, $outperm
 
 	#stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
-	vperm v4, v4, v4, $outperm # rotate right/left
-	vsel v2, $outhead, v4, $outmask
-	vmr $outhead, v4
-	stvx v2, 0, $out
+	vperm $outhead, v4, v4, $outperm # rotate right/left
+	stvewx $outhead, 0, $out # some are superfluous
+	stvewx $outhead, r9, $out
+	stvewx $outhead, r10, $out
+	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
+	stvewx $outhead, r11, $out
+	addi $out, $out, 15 # 15 is not typo
 	xori r8, r8, 0x30 # xor \$0x30, %r8
 
 Lschedule_go:
@@ -1038,14 +1108,15 @@ Lschedule_mangle_last:
 
 	#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
 	vperm v0, v0, v0, $outperm # rotate right/left
+	li r10, 4
 	vsel v2, $outhead, v0, $outmask
-	vmr $outhead, v0
+	li r11, 8
 	stvx v2, 0, $out
-	addi $out, $out, 15 # 15 is not typo
-	lvx v1, 0, $out # redundant in aligned case
-	vsel v1, $outhead, v1, $outmask
-	stvx v1, 0, $out
+	li r12, 12
+	stvewx v0, 0, $out # some (or all) are redundant
+	stvewx v0, r10, $out
+	stvewx v0, r11, $out
+	stvewx v0, r12, $out
 	b Lschedule_mangle_done
 
 .align 4
@@ -1057,15 +1128,18 @@ Lschedule_mangle_last_dec:
 	bl _vpaes_schedule_transform # output transform
 
 	#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
+	addi r9, $out, -15 # -15 is not typo
 	vperm v0, v0, v0, $outperm # rotate right/left
+	li r10, 4
 	vsel v2, $outhead, v0, $outmask
-	vmr $outhead, v0
+	li r11, 8
 	stvx v2, 0, $out
+	li r12, 12
+	stvewx v0, 0, r9 # some (or all) are redundant
+	stvewx v0, r10, r9
+	stvewx v0, r11, r9
+	stvewx v0, r12, r9
 
-	addi $out, $out, -15 # -15 is not typo
-	lvx v1, 0, $out # redundant in aligned case
-	vsel v1, $outhead, v1, $outmask
-	stvx v1, 0, $out
 
 Lschedule_mangle_done:
 	mtlr r7