bsaes-x86_64.pl: optimize key conversion.

commit f9ef874a21
parent 442c9f13d4
Author: Andy Polyakov
Date:   2012-03-16 21:44:19 +00:00


@@ -65,12 +65,12 @@
 # function is:
 #
 #            conversion  conversion/8x block
-# Core 2        410          0.37
-# Nehalem       310          0.35
-# Atom          570          0.26
+# Core 2        240          0.22
+# Nehalem       180          0.20
+# Atom          430          0.19
 #
 # The ratio values mean that 128-byte blocks will be processed
-# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
+# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
 # etc. Then keep in mind that input sizes not divisible by 128 are
 # *effectively* slower, especially shortest ones, e.g. consecutive
 # 144-byte blocks are processed 44% slower than one would expect,
@@ -85,6 +85,7 @@
 #
 # Core 2     11.0
 # Nehalem     9.16
+# Atom       20.9
 #
 # November 2011.
 #
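[The slowdown percentages above follow from simple amortization: if r is the conversion/8x-block ratio from the first table and an input covers n full 128-byte blocks, the one-off key conversion adds roughly r/(n + r) to the total time. The small C program below reproduces the quoted figures; the formula is my reading of the comment, not something stated in the source.

#include <stdio.h>

int main(void)
{
    /* conversion/8x-block ratios quoted in the table above */
    const char  *cpu[] = { "Core 2", "Nehalem", "Atom" };
    const double r[]   = { 0.22, 0.20, 0.19 };

    for (int i = 0; i < 3; i++)
        for (int n = 1; n <= 3; n++)   /* 128-, 256-, 384-byte inputs */
            printf("%-8s %3d bytes: %4.1f%% slower\n",
                   cpu[i], 128 * n, 100.0 * r[i] / (n + r[i]));
    return 0;
}

For Core 2, for example, this prints 18.0%, 9.9% and 6.8%, matching the 16-18%, 9-10% and 6-7% ranges quoted across the three CPUs.]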
@@ -754,7 +755,7 @@ _bsaes_encrypt8:
 	movdqa	($key), @XMM[9]		# round 0 key
 	lea	0x10($key), $key
-	movdqa	0x60($const), @XMM[8]	# .LM0SR
+	movdqa	0x50($const), @XMM[8]	# .LM0SR
 	pxor	@XMM[9], @XMM[0]	# xor with round0 key
 	pxor	@XMM[9], @XMM[1]
 	pshufb	@XMM[8], @XMM[0]
@@ -905,46 +906,82 @@ $code.=<<___;
 .type	_bsaes_key_convert,\@abi-omnipotent
 .align	16
 _bsaes_key_convert:
-	lea	.LBS1(%rip), $const
+	lea	.Lmasks(%rip), $const
 	movdqu	($inp), %xmm7		# load round 0 key
-	movdqa	-0x10($const), %xmm8	# .LBS0
-	movdqa	0x00($const), %xmm9	# .LBS1
-	movdqa	0x10($const), %xmm10	# .LBS2
-	movdqa	0x40($const), %xmm13	# .LM0
-	movdqa	0x60($const), %xmm14	# .LNOT
-	movdqu	0x10($inp), %xmm6	# load round 1 key
 	lea	0x10($inp), $inp
+	movdqa	0x00($const), %xmm0	# 0x01...
+	movdqa	0x10($const), %xmm1	# 0x02...
+	movdqa	0x20($const), %xmm2	# 0x04...
+	movdqa	0x30($const), %xmm3	# 0x08...
+	movdqa	0x40($const), %xmm4	# .LM0
+	pcmpeqd	%xmm5, %xmm5		# .LNOT
+	movdqu	($inp), %xmm6		# load round 1 key
 	movdqa	%xmm7, ($out)		# save round 0 key
 	lea	0x10($out), $out
 	dec	$rounds
 	jmp	.Lkey_loop
 .align	16
 .Lkey_loop:
-	pshufb	%xmm13, %xmm6		# .LM0
-	movdqa	%xmm6, %xmm7
-___
-	&bitslice_key	(map("%xmm$_",(0..7, 8..12)));
-$code.=<<___;
-	pxor	%xmm14, %xmm5		# "pnot"
-	pxor	%xmm14, %xmm6
-	pxor	%xmm14, %xmm0
-	pxor	%xmm14, %xmm1
-	lea	0x10($inp), $inp
-	movdqa	%xmm0, 0x00($out)	# write bit-sliced round key
-	movdqa	%xmm1, 0x10($out)
-	movdqa	%xmm2, 0x20($out)
-	movdqa	%xmm3, 0x30($out)
-	movdqa	%xmm4, 0x40($out)
-	movdqa	%xmm5, 0x50($out)
-	movdqa	%xmm6, 0x60($out)
-	movdqa	%xmm7, 0x70($out)
+	pshufb	%xmm4, %xmm6		# .LM0
+	movdqa	%xmm0, %xmm8
+	movdqa	%xmm1, %xmm9
+	pand	%xmm6, %xmm8
+	pand	%xmm6, %xmm9
+	movdqa	%xmm2, %xmm10
+	pcmpeqb	%xmm0, %xmm8
+	psllq	\$4, %xmm0		# 0x10...
+	movdqa	%xmm3, %xmm11
+	pcmpeqb	%xmm1, %xmm9
+	psllq	\$4, %xmm1		# 0x20...
+	pand	%xmm6, %xmm10
+	pand	%xmm6, %xmm11
+	movdqa	%xmm0, %xmm12
+	pcmpeqb	%xmm2, %xmm10
+	psllq	\$4, %xmm2		# 0x40...
+	movdqa	%xmm1, %xmm13
+	pcmpeqb	%xmm3, %xmm11
+	psllq	\$4, %xmm3		# 0x80...
+	movdqa	%xmm2, %xmm14
+	movdqa	%xmm3, %xmm15
+	pxor	%xmm5, %xmm8		# "pnot"
+	pxor	%xmm5, %xmm9
+	pand	%xmm6, %xmm12
+	pand	%xmm6, %xmm13
+	movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
+	pcmpeqb	%xmm0, %xmm12
+	psrlq	\$4, %xmm0		# 0x01...
+	movdqa	%xmm9, 0x10($out)
+	pcmpeqb	%xmm1, %xmm13
+	psrlq	\$4, %xmm1		# 0x02...
+	lea	0x10($inp), $inp
+	pand	%xmm6, %xmm14
+	pand	%xmm6, %xmm15
+	movdqa	%xmm10, 0x20($out)
+	pcmpeqb	%xmm2, %xmm14
+	psrlq	\$4, %xmm2		# 0x04...
+	movdqa	%xmm11, 0x30($out)
+	pcmpeqb	%xmm3, %xmm15
+	psrlq	\$4, %xmm3		# 0x08...
+	movdqu	($inp), %xmm6		# load next round key
+	pxor	%xmm5, %xmm13		# "pnot"
+	pxor	%xmm5, %xmm14
+	movdqa	%xmm12, 0x40($out)
+	movdqa	%xmm13, 0x50($out)
+	movdqa	%xmm14, 0x60($out)
+	movdqa	%xmm15, 0x70($out)
 	lea	0x80($out),$out
-	movdqu	($inp), %xmm6		# load next round key
 	dec	$rounds
 	jnz	.Lkey_loop
-	movdqa	0x70($const), %xmm7	# .L63
+	movdqa	0x50($const), %xmm7	# .L63
 	#movdqa	%xmm6, ($out)		# don't save last round key
 	ret
 .size	_bsaes_key_convert,.-_bsaes_key_convert
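[In plain terms, the rewrite replaces the generic &bitslice_key transpose with direct extraction of the eight bit-planes of each round key: for bit b, pand with the repeated 0x01<<b byte mask followed by pcmpeqb against that same mask broadcasts bit b of every key byte to a full 0x00/0xff byte; the psllq/psrlq \$4 pairs turn the four stored low-nibble masks into the 0x10...0x80 masks and back, so only four mask constants are needed; and four of the planes are complemented via pxor with all-ones (the "pnot" lines). The C function below is a rough behavioural model of one .Lkey_loop iteration under that reading, with a hypothetical name and with the pshufb/.LM0 byte permutation left out; it is not the generated code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical model of one .Lkey_loop iteration; the pshufb/.LM0
 * byte reordering is omitted. */
static void key_convert_round(const uint8_t key[16], uint8_t out[8][16])
{
    for (int b = 0; b < 8; b++) {
        uint8_t mask = (uint8_t)(1u << b);        /* .Lmasks, shifted */
        for (int i = 0; i < 16; i++) {
            /* pand + pcmpeqb: 0xff where bit b of key[i] is set */
            uint8_t v = ((key[i] & mask) == mask) ? 0xff : 0x00;
            /* pxor with all-ones ("pnot"): planes 0, 1, 5 and 6 are
             * stored complemented, matching the stores above */
            if (b == 0 || b == 1 || b == 5 || b == 6)
                v = (uint8_t)~v;
            out[b][i] = v;
        }
    }
}

int main(void)
{
    uint8_t key[16] = { 0x2b, 0x7e, 0x15, 0x16 };  /* arbitrary bytes */
    uint8_t planes[8][16];

    key_convert_round(key, planes);
    for (int b = 0; b < 8; b++)
        printf("plane %d byte 0: %02x\n", b, planes[b][0]);
    return 0;
}]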
@@ -2800,14 +2837,8 @@ _bsaes_const:
 	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
 .LSRM0:
 	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
-.LM0:
-	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
 .LM0SR:
 	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
-.LNOT:				# magic constants
-	.quad	0xffffffffffffffff, 0xffffffffffffffff
-.L63:
-	.quad	0x6363636363636363, 0x6363636363636363
 .LSWPUP:			# byte-swap upper dword
 	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
 .LSWPUPM0SR:
@@ -2830,6 +2861,15 @@ _bsaes_const:
 	.quad	0x0000000000000000, 0x0000000800000000
 .Lxts_magic:
 	.long	0x87,0,1,0
+.Lmasks:
+	.quad	0x0101010101010101, 0x0101010101010101
+	.quad	0x0202020202020202, 0x0202020202020202
+	.quad	0x0404040404040404, 0x0404040404040404
+	.quad	0x0808080808080808, 0x0808080808080808
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+	.quad	0x6363636363636363, 0x6363636363636363
 .asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
 .align	64
 .size	_bsaes_const,.-_bsaes_const
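[A note on the relocated constants: with $const pointing at .Lmasks, .LM0 lands at 0x40($const) and .L63 at 0x50($const), which is exactly what the movdqa offsets in the key-conversion hunk reference. The table stores only the four low-nibble masks because the loop derives the high-nibble ones in place with psllq $4; the small stand-alone check below (plain C, not part of the module) illustrates that relationship.

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    const uint64_t masks[4] = {
        0x0101010101010101ULL, 0x0202020202020202ULL,
        0x0404040404040404ULL, 0x0808080808080808ULL
    };

    for (int i = 0; i < 4; i++)      /* psllq $4: 0x01 -> 0x10, ... */
        printf("%016" PRIx64 " << 4 = %016" PRIx64 "\n",
               masks[i], masks[i] << 4);
    return 0;
}]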