e_padlock-x86[_64].pl: better understanding of prefetch errata and proper workaround.

Andy Polyakov 2012-03-19 20:23:32 +00:00
parent 884c580e05
commit ed998634cd
2 changed files with 203 additions and 77 deletions
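
The patch drops the old %PADLOCK_MARGIN "copy short inputs to the stack" workaround and instead checks, before each xcrypt call, whether the PadLock unit's prefetch could run past the end of the data into the next (possibly unmapped) page. The following is a reader's sketch in plain Perl of the test that the new "neg; and 0xfff; cmp" sequences perform; the %PADLOCK_PREFETCH values and the 4K page mask come from the patch, while the helper name and the example addresses are illustrative assumptions only.

# Reader's sketch, not part of the commit.
my %PADLOCK_PREFETCH = (ecb=>128, cbc=>64, ctr32=>32);  # bytes xcrypt may prefetch past the data
my $PAGE_MASK = 0xfff;                                  # 4K pages assumed

sub prefetch_may_fault {
    my ($addr, $len, $mode) = @_;
    my $dist = (-($addr + $len)) & $PAGE_MASK;   # distance from end of data to next page boundary
    return $dist < $PADLOCK_PREFETCH{$mode};     # prefetch could touch an unmapped page
}

# 48 bytes of CBC input ending 48 bytes short of a page boundary: the CBC
# margin is 64 and 64 > 48, so the tail must go through a stack buffer.
my $verdict = prefetch_may_fault(0xffa0, 48, "cbc") ? "bounce tail" : "process in place";
print "$verdict\n";                              # prints "bounce tail"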

engines/asm/e_padlock-x86.pl

@@ -37,7 +37,7 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
%PADLOCK_MARGIN=(ecb=>128, cbc=>64); # prefetch errata
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
$ctx="edx";
@@ -188,10 +188,6 @@ my ($mode,$opcode) = @_;
&movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
} else {
&xor ("ebx","ebx");
if ($PADLOCK_MARGIN{$mode}) {
&cmp ($len,$PADLOCK_MARGIN{$mode});
&jbe (&label("${mode}_short"));
}
&test (&DWP(0,$ctx),1<<5); # align bit in control word
&jnz (&label("${mode}_aligned"));
&test ($out,0x0f);
@@ -212,7 +208,27 @@ my ($mode,$opcode) = @_;
&neg ("eax");
&and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK
&lea ("esp",&DWP(0,"eax","ebp")); # alloca
&mov ("eax",$PADLOCK_CHUNK);
&cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK
&mov ("eax","ebp");
&and ("ebp",-16);
&and ("esp",-16);
&mov (&DWP(16,"ebp"),"eax");
if ($PADLOCK_PREFETCH{$mode}) {
&cmp ($len,$chunk);
&ja (&label("${mode}_loop"));
&mov ("eax",$inp); # check if prefetch crosses page
&cmp ("ebp","esp");
&cmove ("eax",$out);
&add ("eax",$len);
&neg ("eax");
&and ("eax",0xfff); # distance to page boundary
&cmp ("eax",$PADLOCK_PREFETCH{$mode});
&mov ("eax",-$PADLOCK_PREFETCH{$mode});
&cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1
&and ($chunk,"eax");
&jz (&label("${mode}_unaligned_tail"));
}
&jmp (&label("${mode}_loop"));
&set_label("${mode}_loop",16);
@@ -276,8 +292,8 @@ my ($mode,$opcode) = @_;
&test ($out,0x0f);
&jz (&label("${mode}_out_aligned"));
&mov ($len,$chunk);
&shr ($len,2);
&lea ($inp,&DWP(0,"esp"));
&shr ($len,2);
&data_byte(0xf3,0xa5); # rep movsl
&sub ($out,$chunk);
&set_label("${mode}_out_aligned");
@@ -288,7 +304,30 @@ my ($mode,$opcode) = @_;
&add ($inp,$chunk);
&sub ($len,$chunk);
&mov ($chunk,$PADLOCK_CHUNK);
if (!$PADLOCK_PREFETCH{$mode}) {
&jnz (&label("${mode}_loop"));
} else {
&jz (&label("${mode}_break"));
&cmp ($len,$chunk);
&jae (&label("${mode}_loop"));
&set_label("${mode}_unaligned_tail");
&xor ("eax","eax");
&cmp ("esp","ebp");
&cmove ("eax",$len);
&sub ("esp","eax"); # alloca
&mov ("eax", $out); # save parameters
&mov ($chunk,$len);
&shr ($len,2);
&lea ($out,&DWP(0,"esp"));
&data_byte(0xf3,0xa5); # rep movsl
&mov ($inp,"esp");
&mov ($out,"eax"); # restore parameters
&mov ($len,$chunk);
&jmp (&label("${mode}_loop"));
&set_label("${mode}_break",16);
}
if ($mode ne "ctr32") {
&cmp ("esp","ebp");
&je (&label("${mode}_done"));
@@ -302,28 +341,24 @@ my ($mode,$opcode) = @_;
&ja (&label("${mode}_bzero"));
&set_label("${mode}_done");
&mov ("ebp",&DWP(16,"ebp"));
&lea ("esp",&DWP(24,"ebp"));
if ($mode ne "ctr32") {
&jmp (&label("${mode}_exit"));
&set_label("${mode}_short",16);
&xor ("eax","eax");
&lea ("ebp",&DWP(-24,"esp"));
&sub ("eax",$len);
&lea ("esp",&DWP(0,"eax","ebp"));
&and ("esp",-16);
&xor ($chunk,$chunk);
&set_label("${mode}_short_copy");
&movups ("xmm0",&QWP(0,$inp,$chunk));
&lea ($chunk,&DWP(16,$chunk));
&cmp ($len,$chunk);
&movaps (&QWP(-16,"esp",$chunk),"xmm0");
&ja (&label("${mode}_short_copy"));
&mov ($inp,"esp");
&mov ($chunk,$len);
&jmp (&label("${mode}_loop"));
&set_label("${mode}_aligned",16);
if ($PADLOCK_PREFETCH{$mode}) {
&lea ("ebp",&DWP(0,$inp,$len));
&neg ("ebp");
&and ("ebp",0xfff); # distance to page boundary
&xor ("eax","eax");
&cmp ("ebp",$PADLOCK_PREFETCH{$mode});
&mov ("ebp",$PADLOCK_PREFETCH{$mode}-1);
&cmovae ("ebp","eax");
&and ("ebp",$len); # remainder
&sub ($len,"ebp");
&jz (&label("${mode}_aligned_tail"));
}
&lea ("eax",&DWP(-16,$ctx)); # ivp
&lea ("ebx",&DWP(16,$ctx)); # key
&shr ($len,4); # len/=AES_BLOCK_SIZE
@@ -332,6 +367,29 @@ my ($mode,$opcode) = @_;
&movaps ("xmm0",&QWP(0,"eax"));
&movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv
}
if ($PADLOCK_PREFETCH{$mode}) {
&test ("ebp","ebp");
&jz (&label("${mode}_exit"));
&set_label("${mode}_aligned_tail");
&mov ($len,"ebp");
&lea ("ebp",&DWP(-24,"esp"));
&mov ("esp","ebp");
&mov ("eax","ebp");
&sub ("esp",$len);
&and ("ebp",-16);
&and ("esp",-16);
&mov (&DWP(16,"ebp"),"eax");
&mov ("eax", $out); # save parameters
&mov ($chunk,$len);
&shr ($len,2);
&lea ($out,&DWP(0,"esp"));
&data_byte(0xf3,0xa5); # rep movsl
&mov ($inp,"esp");
&mov ($out,"eax"); # restore parameters
&mov ($len,$chunk);
&jmp (&label("${mode}_loop"));
}
&set_label("${mode}_exit"); }
&mov ("eax",1);
&lea ("esp",&DWP(4,"esp")); # popf
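
Both the 32-bit code above and the 64-bit code below treat the in-place path the same way: when what is left of the input ends within the prefetch margin of a page boundary, the chunk processed in place is rounded down to a multiple of that margin (the "and $chunk,-PREFETCH" mask) and the remainder is encrypted out of a stack buffer via the new ${mode}_unaligned_tail path. Below is a reader's simplification of that split in plain Perl; the helper name and the flat ($len, $dist, $prefetch) interface are assumptions made for illustration.

# Reader's simplification, not from the patch.
sub split_final_chunk {
    my ($len, $dist, $prefetch) = @_;        # $dist: distance to the next page boundary
    return ($len, 0) if $dist >= $prefetch;  # prefetch stays within mapped memory
    my $in_place = $len - ($len % $prefetch);   # same effect as the asm's "and $chunk,-PREFETCH"
    return ($in_place, $len - $in_place);       # remainder goes through the stack bounce buffer
}

# 200 bytes of ECB data ending 32 bytes before a page boundary:
my ($in_place, $tail) = split_final_chunk(200, 32, 128);
print "$in_place bytes in place, $tail bytes via the bounce buffer\n";   # 128 / 72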

engines/asm/e_padlock-x86_64.pl

@@ -27,7 +27,7 @@ open STDOUT,"| $^X $xlate $flavour $output";
$code=".text\n";
%PADLOCK_MARGIN=(ecb=>128, cbc=>64, ctr32=>64); # prefetch errata
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
$ctx="%rdx";
@@ -285,17 +285,6 @@ padlock_${mode}_encrypt:
lea 16($ctx),$ctx # control word
xor %eax,%eax
xor %ebx,%ebx
___
# Formally speaking correct condition is $len<=$margin and $inp+$margin
# crosses page boundary [and next page is unreadable]. But $inp can
# be unaligned in which case data can be copied to $out if latter is
# aligned, in which case $out+$margin has to be checked. Covering all
# cases appears more complicated than just copying short input...
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
cmp \$$PADLOCK_MARGIN{$mode},$len
jbe .L${mode}_short
___
$code.=<<___;
testl \$`1<<5`,($ctx) # align bit in control word
jnz .L${mode}_aligned
test \$0x0f,$out
@@ -315,6 +304,8 @@ $code.=<<___;
neg %rax
and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
lea (%rax,%rbp),%rsp
mov \$$PADLOCK_CHUNK,%rax
cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
@@ -322,10 +313,27 @@ $code.=<<___ if ($mode eq "ctr32");
bswap %eax
neg %eax
and \$`$PADLOCK_CHUNK/16-1`,%eax
jz .L${mode}_loop
mov \$$PADLOCK_CHUNK,$chunk
shl \$4,%eax
cmovz $chunk,%rax
cmp %rax,$len
cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
cmovbe $len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
cmp $chunk,$len
ja .L${mode}_loop
mov $inp,%rax # check if prefetch crosses page
cmp %rsp,%rbp
cmove $out,%rax
add $len,%rax
neg %rax
and \$0xfff,%rax # distance to page boundary
cmp \$$PADLOCK_PREFETCH{$mode},%rax
mov \$-$PADLOCK_PREFETCH{$mode},%rax
cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
and %rax,$chunk
jz .L${mode}_unaligned_tail
___
$code.=<<___;
jmp .L${mode}_loop
@@ -360,12 +368,12 @@ ___
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
test \$0xffff0000,%eax
jnz .L${mode}_no_corr
jnz .L${mode}_no_carry
bswap %eax
add \$0x10000,%eax
bswap %eax
mov %eax,-4($ctx)
.L${mode}_no_corr:
.L${mode}_no_carry:
___
$code.=<<___;
mov %r8,$out # restore parameters
@@ -373,8 +381,8 @@ $code.=<<___;
test \$0x0f,$out
jz .L${mode}_out_aligned
mov $chunk,$len
shr \$3,$len
lea (%rsp),$inp
shr \$3,$len
.byte 0xf3,0x48,0xa5 # rep movsq
sub $chunk,$out
.L${mode}_out_aligned:
@@ -384,9 +392,52 @@ $code.=<<___;
add $chunk,$inp
sub $chunk,$len
mov \$$PADLOCK_CHUNK,$chunk
___
if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
jnz .L${mode}_loop
___
} else {
$code.=<<___;
jz .L${mode}_break
cmp $chunk,$len
jae .L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
mov $len,$chunk
mov $inp,%rax # check if prefetch crosses page
cmp %rsp,%rbp
cmove $out,%rax
add $len,%rax
neg %rax
and \$0xfff,%rax # distance to page boundary
cmp \$$PADLOCK_PREFETCH{$mode},%rax
mov \$-$PADLOCK_PREFETCH{$mode},%rax
cmovae $chunk,%rax
and %rax,$chunk
jnz .L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
xor %eax,%eax
cmp %rsp,%rbp
cmove $len,%rax
mov $out,%r8 # save parameters
mov $len,$chunk
sub %rax,%rsp # alloca
shr \$3,$len
lea (%rsp),$out
.byte 0xf3,0x48,0xa5 # rep movsq
mov %rsp,$inp
mov %r8, $out # restore parameters
mov $chunk,$len
jmp .L${mode}_loop
.align 16
.L${mode}_break:
___
}
$code.=<<___;
cmp %rbp,%rsp
je .L${mode}_done
pxor %xmm0,%xmm0
@@ -400,47 +451,59 @@ $code.=<<___;
.L${mode}_done:
lea (%rbp),%rsp
jmp .L${mode}_exit
___
$code.=<<___ if ($PADLOCK_MARGIN{$mode});
.align 16
.L${mode}_short:
mov %rsp,%rbp
sub $len,%rsp
xor $chunk,$chunk
.L${mode}_short_copy:
movups ($inp,$chunk),%xmm0
lea 16($chunk),$chunk
cmp $chunk,$len
movaps %xmm0,-16(%rsp,$chunk)
ja .L${mode}_short_copy
mov %rsp,$inp
mov $len,$chunk
jmp .L${mode}_`${mode} eq "ctr32"?"reenter":"loop"`
___
$code.=<<___;
.align 16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
mov \$`16*0x10000`,$chunk
bswap %eax
cmp $len,$chunk
cmova $len,$chunk
neg %eax
and \$0xffff,%eax
jz .L${mode}_aligned_loop
mov \$`16*0x10000`,$chunk
shl \$4,%eax
cmovz $chunk,%rax
cmp %rax,$len
cmova %rax,$chunk # don't let counter cross 2^16
jmp .L${mode}_aligned_loop
.align 16
cmovbe $len,$chunk
jbe .L${mode}_aligned_skip
.L${mode}_aligned_loop:
cmp $len,$chunk
cmova $len,$chunk
mov $len,%r10 # save parameters
mov $chunk,$len
mov $chunk,%r11
lea -16($ctx),%rax # ivp
lea 16($ctx),%rbx # key
shr \$4,$len # len/=AES_BLOCK_SIZE
.byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
mov -4($ctx),%eax # pull 32-bit counter
bswap %eax
add \$0x10000,%eax
bswap %eax
mov %eax,-4($ctx)
mov %r10,$len # restore parameters
sub %r11,$len
mov \$`16*0x10000`,$chunk
jz .L${mode}_exit
cmp $chunk,$len
jae .L${mode}_aligned_loop
.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
lea ($inp,$len),%rbp
neg %rbp
and \$0xfff,%rbp # distance to page boundary
xor %eax,%eax
cmp \$$PADLOCK_PREFETCH{$mode},%rbp
mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
cmovae %rax,%rbp
and $len,%rbp # remainder
sub %rbp,$len
jz .L${mode}_aligned_tail
___
$code.=<<___;
lea -16($ctx),%rax # ivp
@@ -452,18 +515,23 @@ $code.=<<___ if ($mode !~ /ecb|ctr/);
movdqa (%rax),%xmm0
movdqa %xmm0,-16($ctx) # copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
mov -4($ctx),%eax # pull 32-bit counter
bswap %eax
add \$0x10000,%eax
bswap %eax
mov %eax,-4($ctx)
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
test %rbp,%rbp # check remainder
jz .L${mode}_exit
mov %r11,$chunk # restore parameters
mov %r10,$len
sub $chunk,$len
mov \$`16*0x10000`,$chunk
jnz .L${mode}_aligned_loop
.L${mode}_aligned_tail:
mov $out,%r8
mov %rbp,$chunk
mov %rbp,$len
lea (%rsp),%rbp
sub $len,%rsp
shr \$3,$len
lea (%rsp),$out
.byte 0xf3,0x48,0xa5 # rep movsq
lea (%r8),$out
lea (%rsp),$inp
mov $chunk,$len
jmp .L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
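
For ctr32 the 64-bit code additionally caps each rep-xcrypt call per the "don't let counter cross 2^16" / "don't let counter cross PADLOCK_CHUNK" comments, handling the carry into the counter's upper half in software (the add \$0x10000 sequences; note the _no_corr to _no_carry label rename). Below is a reader's sketch in Perl of the cap computed by the aligned loop's "bswap; neg; and \$0xffff; shl \$4; cmovz" sequence; the helper name is an assumption, and $ctr stands for the 32-bit counter value after the bswap.

# Reader's sketch, not from the patch.
sub ctr32_chunk_cap {
    my ($ctr, $len) = @_;               # counter (after bswap), bytes remaining
    my $blocks = (-$ctr) & 0xffff;      # AES blocks before the low 16 bits wrap to zero
    $blocks ||= 0x10000;                # zero means a full 2^16 blocks are available
    my $cap = 16 * $blocks;             # bytes one rep-xcrypt call may cover
    return $len < $cap ? $len : $cap;
}

# counter 0x0001fffe: only 2 blocks (32 bytes) fit before the low half wraps,
# so a 1 KB request is split and the carry applied between calls.
my $cap = ctr32_chunk_cap(0x0001fffe, 1024);
print "$cap\n";                         # prints 32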