e_padlock-x86[_64].pl: better understanding of prefetch errata and proper
workaround.
This commit is contained in:
parent
884c580e05
commit
ed998634cd
2 changed files with 203 additions and 77 deletions
|
@ -37,7 +37,7 @@ require "x86asm.pl";
|
|||
|
||||
&asm_init($ARGV[0],$0);
|
||||
|
||||
%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
|
||||
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata
|
||||
$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
|
||||
|
||||
$ctx="edx";
|
||||
|
@ -188,10 +188,6 @@ my ($mode,$opcode) = @_;
|
|||
&movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
|
||||
} else {
|
||||
&xor ("ebx","ebx");
|
||||
if ($PADLOCK_MARGIN{$mode}) {
|
||||
&cmp ($len,$PADLOCK_MARGIN{$mode});
|
||||
&jbe (&label("${mode}_short"));
|
||||
}
|
||||
&test (&DWP(0,$ctx),1<<5); # align bit in control word
|
||||
&jnz (&label("${mode}_aligned"));
|
||||
&test ($out,0x0f);
|
||||
|
@ -212,7 +208,27 @@ my ($mode,$opcode) = @_;
|
|||
&neg ("eax");
|
||||
&and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK
|
||||
&lea ("esp",&DWP(0,"eax","ebp")); # alloca
|
||||
&mov ("eax",$PADLOCK_CHUNK);
|
||||
&cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK
|
||||
&mov ("eax","ebp");
|
||||
&and ("ebp",-16);
|
||||
&and ("esp",-16);
|
||||
&mov (&DWP(16,"ebp"),"eax");
|
||||
if ($PADLOCK_PREFETCH{$mode}) {
|
||||
&cmp ($len,$chunk);
|
||||
&ja (&label("${mode}_loop"));
|
||||
&mov ("eax",$inp); # check if prefetch crosses page
|
||||
&cmp ("ebp","esp");
|
||||
&cmove ("eax",$out);
|
||||
&add ("eax",$len);
|
||||
&neg ("eax");
|
||||
&and ("eax",0xfff); # distance to page boundary
|
||||
&cmp ("eax",$PADLOCK_PREFETCH{$mode});
|
||||
&mov ("eax",-$PADLOCK_PREFETCH{$mode});
|
||||
&cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1
|
||||
&and ($chunk,"eax");
|
||||
&jz (&label("${mode}_unaligned_tail"));
|
||||
}
|
||||
&jmp (&label("${mode}_loop"));
|
||||
|
||||
&set_label("${mode}_loop",16);
|
||||
|
@ -276,8 +292,8 @@ my ($mode,$opcode) = @_;
|
|||
&test ($out,0x0f);
|
||||
&jz (&label("${mode}_out_aligned"));
|
||||
&mov ($len,$chunk);
|
||||
&shr ($len,2);
|
||||
&lea ($inp,&DWP(0,"esp"));
|
||||
&shr ($len,2);
|
||||
&data_byte(0xf3,0xa5); # rep movsl
|
||||
&sub ($out,$chunk);
|
||||
&set_label("${mode}_out_aligned");
|
||||
|
@ -288,7 +304,30 @@ my ($mode,$opcode) = @_;
|
|||
&add ($inp,$chunk);
|
||||
&sub ($len,$chunk);
|
||||
&mov ($chunk,$PADLOCK_CHUNK);
|
||||
if (!$PADLOCK_PREFETCH{$mode}) {
|
||||
&jnz (&label("${mode}_loop"));
|
||||
} else {
|
||||
&jz (&label("${mode}_break"));
|
||||
&cmp ($len,$chunk);
|
||||
&jae (&label("${mode}_loop"));
|
||||
|
||||
&set_label("${mode}_unaligned_tail");
|
||||
&xor ("eax","eax");
|
||||
&cmp ("esp","ebp");
|
||||
&cmove ("eax",$len);
|
||||
&sub ("esp","eax"); # alloca
|
||||
&mov ("eax", $out); # save parameters
|
||||
&mov ($chunk,$len);
|
||||
&shr ($len,2);
|
||||
&lea ($out,&DWP(0,"esp"));
|
||||
&data_byte(0xf3,0xa5); # rep movsl
|
||||
&mov ($inp,"esp");
|
||||
&mov ($out,"eax"); # restore parameters
|
||||
&mov ($len,$chunk);
|
||||
&jmp (&label("${mode}_loop"));
|
||||
|
||||
&set_label("${mode}_break",16);
|
||||
}
|
||||
if ($mode ne "ctr32") {
|
||||
&cmp ("esp","ebp");
|
||||
&je (&label("${mode}_done"));
|
||||
|
@ -302,28 +341,24 @@ my ($mode,$opcode) = @_;
|
|||
&ja (&label("${mode}_bzero"));
|
||||
|
||||
&set_label("${mode}_done");
|
||||
&mov ("ebp",&DWP(16,"ebp"));
|
||||
&lea ("esp",&DWP(24,"ebp"));
|
||||
if ($mode ne "ctr32") {
|
||||
&jmp (&label("${mode}_exit"));
|
||||
|
||||
&set_label("${mode}_short",16);
|
||||
&xor ("eax","eax");
|
||||
&lea ("ebp",&DWP(-24,"esp"));
|
||||
&sub ("eax",$len);
|
||||
&lea ("esp",&DWP(0,"eax","ebp"));
|
||||
&and ("esp",-16);
|
||||
&xor ($chunk,$chunk);
|
||||
&set_label("${mode}_short_copy");
|
||||
&movups ("xmm0",&QWP(0,$inp,$chunk));
|
||||
&lea ($chunk,&DWP(16,$chunk));
|
||||
&cmp ($len,$chunk);
|
||||
&movaps (&QWP(-16,"esp",$chunk),"xmm0");
|
||||
&ja (&label("${mode}_short_copy"));
|
||||
&mov ($inp,"esp");
|
||||
&mov ($chunk,$len);
|
||||
&jmp (&label("${mode}_loop"));
|
||||
|
||||
&set_label("${mode}_aligned",16);
|
||||
if ($PADLOCK_PREFETCH{$mode}) {
|
||||
&lea ("ebp",&DWP(0,$inp,$len));
|
||||
&neg ("ebp");
|
||||
&and ("ebp",0xfff); # distance to page boundary
|
||||
&xor ("eax","eax");
|
||||
&cmp ("ebp",$PADLOCK_PREFETCH{$mode});
|
||||
&mov ("ebp",$PADLOCK_PREFETCH{$mode}-1);
|
||||
&cmovae ("ebp","eax");
|
||||
&and ("ebp",$len); # remainder
|
||||
&sub ($len,"ebp");
|
||||
&jz (&label("${mode}_aligned_tail"));
|
||||
}
|
||||
&lea ("eax",&DWP(-16,$ctx)); # ivp
|
||||
&lea ("ebx",&DWP(16,$ctx)); # key
|
||||
&shr ($len,4); # len/=AES_BLOCK_SIZE
|
||||
|
@ -332,6 +367,29 @@ my ($mode,$opcode) = @_;
|
|||
&movaps ("xmm0",&QWP(0,"eax"));
|
||||
&movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv
|
||||
}
|
||||
if ($PADLOCK_PREFETCH{$mode}) {
|
||||
&test ("ebp","ebp");
|
||||
&jz (&label("${mode}_exit"));
|
||||
|
||||
&set_label("${mode}_aligned_tail");
|
||||
&mov ($len,"ebp");
|
||||
&lea ("ebp",&DWP(-24,"esp"));
|
||||
&mov ("esp","ebp");
|
||||
&mov ("eax","ebp");
|
||||
&sub ("esp",$len);
|
||||
&and ("ebp",-16);
|
||||
&and ("esp",-16);
|
||||
&mov (&DWP(16,"ebp"),"eax");
|
||||
&mov ("eax", $out); # save parameters
|
||||
&mov ($chunk,$len);
|
||||
&shr ($len,2);
|
||||
&lea ($out,&DWP(0,"esp"));
|
||||
&data_byte(0xf3,0xa5); # rep movsl
|
||||
&mov ($inp,"esp");
|
||||
&mov ($out,"eax"); # restore parameters
|
||||
&mov ($len,$chunk);
|
||||
&jmp (&label("${mode}_loop"));
|
||||
}
|
||||
&set_label("${mode}_exit"); }
|
||||
&mov ("eax",1);
|
||||
&lea ("esp",&DWP(4,"esp")); # popf
|
||||
|
|
|
@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";
|
|||
|
||||
$code=".text\n";
|
||||
|
||||
%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata
|
||||
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
|
||||
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
|
||||
|
||||
$ctx="%rdx";
|
||||
|
@ -285,17 +285,6 @@ padlock_${mode}_encrypt:
|
|||
lea 16($ctx),$ctx # control word
|
||||
xor %eax,%eax
|
||||
xor %ebx,%ebx
|
||||
___
|
||||
# Formally speaking, the correct condition is $len<=$margin and $inp+$margin
|
||||
# crosses page boundary [and next page is unreadable]. But $inp can
|
||||
# be unaligned in which case data can be copied to $out if latter is
|
||||
# aligned, in which case $out+$margin has to be checked. Covering all
|
||||
# cases appears more complicated than just copying short input...
|
||||
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
|
||||
cmp \$$PADLOCK_MARGIN{$mode},$len
|
||||
jbe .L${mode}_short
|
||||
___
|
||||
$code.=<<___;
|
||||
testl \$`1<<5`,($ctx) # align bit in control word
|
||||
jnz .L${mode}_aligned
|
||||
test \$0x0f,$out
|
||||
|
@ -315,6 +304,8 @@ $code.=<<___;
|
|||
neg %rax
|
||||
and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
|
||||
lea (%rax,%rbp),%rsp
|
||||
mov \$$PADLOCK_CHUNK,%rax
|
||||
cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
|
||||
___
|
||||
$code.=<<___ if ($mode eq "ctr32");
|
||||
.L${mode}_reenter:
|
||||
|
@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32");
|
|||
bswap %eax
|
||||
neg %eax
|
||||
and \$`$PADLOCK_CHUNK/16-1`,%eax
|
||||
jz .L${mode}_loop
|
||||
mov \$$PADLOCK_CHUNK,$chunk
|
||||
shl \$4,%eax
|
||||
cmovz $chunk,%rax
|
||||
cmp %rax,$len
|
||||
cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
|
||||
cmovbe $len,$chunk
|
||||
___
|
||||
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
|
||||
cmp $chunk,$len
|
||||
ja .L${mode}_loop
|
||||
mov $inp,%rax # check if prefetch crosses page
|
||||
cmp %rsp,%rbp
|
||||
cmove $out,%rax
|
||||
add $len,%rax
|
||||
neg %rax
|
||||
and \$0xfff,%rax # distance to page boundary
|
||||
cmp \$$PADLOCK_PREFETCH{$mode},%rax
|
||||
mov \$-$PADLOCK_PREFETCH{$mode},%rax
|
||||
cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
|
||||
and %rax,$chunk
|
||||
jz .L${mode}_unaligned_tail
|
||||
___
|
||||
$code.=<<___;
|
||||
jmp .L${mode}_loop
|
||||
|
@ -360,12 +368,12 @@ ___
|
|||
$code.=<<___ if ($mode eq "ctr32");
|
||||
mov -4($ctx),%eax # pull 32-bit counter
|
||||
test \$0xffff0000,%eax
|
||||
jnz .L${mode}_no_corr
|
||||
jnz .L${mode}_no_carry
|
||||
bswap %eax
|
||||
add \$0x10000,%eax
|
||||
bswap %eax
|
||||
mov %eax,-4($ctx)
|
||||
.L${mode}_no_corr:
|
||||
.L${mode}_no_carry:
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %r8,$out # restore parameters
|
||||
|
@ -373,8 +381,8 @@ $code.=<<___;
|
|||
test \$0x0f,$out
|
||||
jz .L${mode}_out_aligned
|
||||
mov $chunk,$len
|
||||
shr \$3,$len
|
||||
lea (%rsp),$inp
|
||||
shr \$3,$len
|
||||
.byte 0xf3,0x48,0xa5 # rep movsq
|
||||
sub $chunk,$out
|
||||
.L${mode}_out_aligned:
|
||||
|
@ -384,9 +392,52 @@ $code.=<<___;
|
|||
add $chunk,$inp
|
||||
sub $chunk,$len
|
||||
mov \$$PADLOCK_CHUNK,$chunk
|
||||
___
|
||||
if (!$PADLOCK_PREFETCH{$mode}) {
|
||||
$code.=<<___;
|
||||
jnz .L${mode}_loop
|
||||
|
||||
___
|
||||
} else {
|
||||
$code.=<<___;
|
||||
jz .L${mode}_break
|
||||
cmp $chunk,$len
|
||||
jae .L${mode}_loop
|
||||
___
|
||||
$code.=<<___ if ($mode eq "ctr32");
|
||||
mov $len,$chunk
|
||||
mov $inp,%rax # check if prefetch crosses page
|
||||
cmp %rsp,%rbp
|
||||
cmove $out,%rax
|
||||
add $len,%rax
|
||||
neg %rax
|
||||
and \$0xfff,%rax # distance to page boundary
|
||||
cmp \$$PADLOCK_PREFETCH{$mode},%rax
|
||||
mov \$-$PADLOCK_PREFETCH{$mode},%rax
|
||||
cmovae $chunk,%rax
|
||||
and %rax,$chunk
|
||||
jnz .L${mode}_loop
|
||||
___
|
||||
$code.=<<___;
|
||||
.L${mode}_unaligned_tail:
|
||||
xor %eax,%eax
|
||||
cmp %rsp,%rbp
|
||||
cmove $len,%rax
|
||||
mov $out,%r8 # save parameters
|
||||
mov $len,$chunk
|
||||
sub %rax,%rsp # alloca
|
||||
shr \$3,$len
|
||||
lea (%rsp),$out
|
||||
.byte 0xf3,0x48,0xa5 # rep movsq
|
||||
mov %rsp,$inp
|
||||
mov %r8, $out # restore parameters
|
||||
mov $chunk,$len
|
||||
jmp .L${mode}_loop
|
||||
.align 16
|
||||
.L${mode}_break:
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
cmp %rbp,%rsp
|
||||
je .L${mode}_done
|
||||
|
||||
pxor %xmm0,%xmm0
|
||||
|
@ -400,47 +451,59 @@ $code.=<<___;
|
|||
.L${mode}_done:
|
||||
lea (%rbp),%rsp
|
||||
jmp .L${mode}_exit
|
||||
___
|
||||
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
|
||||
.align 16
|
||||
.L${mode}_short:
|
||||
mov %rsp,%rbp
|
||||
sub $len,%rsp
|
||||
xor $chunk,$chunk
|
||||
.L${mode}_short_copy:
|
||||
movups ($inp,$chunk),%xmm0
|
||||
lea 16($chunk),$chunk
|
||||
cmp $chunk,$len
|
||||
movaps %xmm0,-16(%rsp,$chunk)
|
||||
ja .L${mode}_short_copy
|
||||
mov %rsp,$inp
|
||||
mov $len,$chunk
|
||||
jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
|
||||
___
|
||||
$code.=<<___;
|
||||
|
||||
.align 16
|
||||
.L${mode}_aligned:
|
||||
___
|
||||
$code.=<<___ if ($mode eq "ctr32");
|
||||
mov -4($ctx),%eax # pull 32-bit counter
|
||||
mov \$`16*0x10000`,$chunk
|
||||
bswap %eax
|
||||
cmp $len,$chunk
|
||||
cmova $len,$chunk
|
||||
neg %eax
|
||||
and \$0xffff,%eax
|
||||
jz .L${mode}_aligned_loop
|
||||
mov \$`16*0x10000`,$chunk
|
||||
shl \$4,%eax
|
||||
cmovz $chunk,%rax
|
||||
cmp %rax,$len
|
||||
cmova %rax,$chunk # don't let counter cross 2^16
|
||||
jmp .L${mode}_aligned_loop
|
||||
.align 16
|
||||
cmovbe $len,$chunk
|
||||
jbe .L${mode}_aligned_skip
|
||||
|
||||
.L${mode}_aligned_loop:
|
||||
cmp $len,$chunk
|
||||
cmova $len,$chunk
|
||||
mov $len,%r10 # save parameters
|
||||
mov $chunk,$len
|
||||
mov $chunk,%r11
|
||||
|
||||
lea -16($ctx),%rax # ivp
|
||||
lea 16($ctx),%rbx # key
|
||||
shr \$4,$len # len/=AES_BLOCK_SIZE
|
||||
.byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
|
||||
|
||||
mov -4($ctx),%eax # pull 32-bit counter
|
||||
bswap %eax
|
||||
add \$0x10000,%eax
|
||||
bswap %eax
|
||||
mov %eax,-4($ctx)
|
||||
|
||||
mov %r10,$len # restore parameters
|
||||
sub %r11,$len
|
||||
mov \$`16*0x10000`,$chunk
|
||||
jz .L${mode}_exit
|
||||
cmp $chunk,$len
|
||||
jae .L${mode}_aligned_loop
|
||||
|
||||
.L${mode}_aligned_skip:
|
||||
___
|
||||
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
|
||||
lea ($inp,$len),%rbp
|
||||
neg %rbp
|
||||
and \$0xfff,%rbp # distance to page boundary
|
||||
xor %eax,%eax
|
||||
cmp \$$PADLOCK_PREFETCH{$mode},%rbp
|
||||
mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
|
||||
cmovae %rax,%rbp
|
||||
and $len,%rbp # remainder
|
||||
sub %rbp,$len
|
||||
jz .L${mode}_aligned_tail
|
||||
___
|
||||
$code.=<<___;
|
||||
lea -16($ctx),%rax # ivp
|
||||
|
@ -452,18 +515,23 @@ $code.=<<___ if ($mode !~ /ecb|ctr/);
|
|||
movdqa (%rax),%xmm0
|
||||
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
|
||||
___
|
||||
$code.=<<___ if ($mode eq "ctr32");
|
||||
mov -4($ctx),%eax # pull 32-bit counter
|
||||
bswap %eax
|
||||
add \$0x10000,%eax
|
||||
bswap %eax
|
||||
mov %eax,-4($ctx)
|
||||
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
|
||||
test %rbp,%rbp # check remainder
|
||||
jz .L${mode}_exit
|
||||
|
||||
mov %r11,$chunk # restore parameters
|
||||
mov %r10,$len
|
||||
sub $chunk,$len
|
||||
mov \$`16*0x10000`,$chunk
|
||||
jnz .L${mode}_aligned_loop
|
||||
.L${mode}_aligned_tail:
|
||||
mov $out,%r8
|
||||
mov %rbp,$chunk
|
||||
mov %rbp,$len
|
||||
lea (%rsp),%rbp
|
||||
sub $len,%rsp
|
||||
shr \$3,$len
|
||||
lea (%rsp),$out
|
||||
.byte 0xf3,0x48,0xa5 # rep movsq
|
||||
lea (%r8),$out
|
||||
lea (%rsp),$inp
|
||||
mov $chunk,$len
|
||||
jmp .L${mode}_loop
|
||||
___
|
||||
$code.=<<___;
|
||||
.L${mode}_exit:
|
||||
|
|
Loading…
Reference in a new issue