aes-586.pl: Atom-specific optimization, +44/29%, minor improvement on others.
vpaes-x86.pl: minor performance squeeze.
This commit is contained in:
parent
f717abd7c1
commit
89f1eb8213
2 changed files with 195 additions and 189 deletions
|
@ -103,11 +103,12 @@
|
|||
# byte for 128-bit key.
|
||||
#
|
||||
# ECB encrypt ECB decrypt CBC large chunk
|
||||
# P4 56[60] 84[100] 23
|
||||
# AMD K8 48[44] 70[79] 18
|
||||
# PIII 41[50] 61[91] 24
|
||||
# Core 2 32[38] 45[70] 18.5
|
||||
# Pentium 120 160 77
|
||||
# P4 52[54] 83[95] 23
|
||||
# AMD K8 46[41] 66[70] 18
|
||||
# PIII 41[50] 60[77] 24
|
||||
# Core 2 31[36] 45[64] 18.5
|
||||
# Atom 76[100] 96[138] 60
|
||||
# Pentium 115 150 77
|
||||
#
|
||||
# Version 4.1 switches to compact S-box even in key schedule setup.
|
||||
#
|
||||
|
@ -476,24 +477,25 @@ sub enctransform()
|
|||
my $tmp = $tbl;
|
||||
my $r2 = $key ;
|
||||
|
||||
&mov ($acc,$s[$i]);
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&shr ($tmp,7);
|
||||
&and ($tmp,$s[$i]);
|
||||
&lea ($r2,&DWP(0,$s[$i],$s[$i]));
|
||||
&sub ($acc,$tmp);
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&and ($r2,0xfefefefe);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&sub ($acc,$tmp);
|
||||
&mov ($tmp,$s[$i]);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&rotr ($tmp,16);
|
||||
&xor ($acc,$r2); # r2
|
||||
&mov ($r2,$s[$i]);
|
||||
|
||||
&xor ($s[$i],$acc); # r0 ^ r2
|
||||
&rotr ($r2,16+8);
|
||||
&xor ($acc,$tmp);
|
||||
&rotl ($s[$i],24);
|
||||
&xor ($acc,$r2);
|
||||
&mov ($tmp,0x80808080) if ($i!=1);
|
||||
&xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
|
||||
&rotr ($tmp,16);
|
||||
&xor ($s[$i],$tmp);
|
||||
&rotr ($tmp,8);
|
||||
&xor ($s[$i],$tmp);
|
||||
}
|
||||
|
||||
&function_begin_B("_x86_AES_encrypt_compact");
|
||||
|
@ -526,6 +528,7 @@ sub enctransform()
|
|||
&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
|
||||
&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
|
||||
&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
|
||||
&mov ($tbl,0x80808080);
|
||||
&enctransform(2);
|
||||
&enctransform(3);
|
||||
&enctransform(0);
|
||||
|
@ -607,82 +610,84 @@ sub sse_enccompact()
|
|||
&pshufw ("mm5","mm4",0x0d); # 15,14,11,10
|
||||
&movd ("eax","mm1"); # 5, 4, 1, 0
|
||||
&movd ("ebx","mm5"); # 15,14,11,10
|
||||
&mov ($__key,$key);
|
||||
|
||||
&movz ($acc,&LB("eax")); # 0
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
||||
&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
|
||||
&movz ("edx",&HB("eax")); # 1
|
||||
&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
||||
&movz ($key,&LB("ebx")); # 10
|
||||
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
|
||||
&shl ("edx",8); # 1
|
||||
&shr ("eax",16); # 5, 4
|
||||
&shl ("edx",8); # 1
|
||||
|
||||
&movz ($acc,&LB("ebx")); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 10
|
||||
&movz ($key,&HB("ebx")); # 11
|
||||
&shl ($acc,16); # 10
|
||||
&or ("ecx",$acc); # 10
|
||||
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
|
||||
&movz ($acc,&HB("ebx")); # 11
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
|
||||
&or ("ecx",$acc); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 11
|
||||
&movz ($key,&HB("eax")); # 5
|
||||
&shl ($acc,24); # 11
|
||||
&or ("edx",$acc); # 11
|
||||
&shr ("ebx",16); # 15,14
|
||||
&or ("edx",$acc); # 11
|
||||
|
||||
&movz ($acc,&HB("eax")); # 5
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 5
|
||||
&movz ($key,&HB("ebx")); # 15
|
||||
&shl ($acc,8); # 5
|
||||
&or ("ecx",$acc); # 5
|
||||
&movz ($acc,&HB("ebx")); # 15
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 15
|
||||
&movz ($key,&LB("eax")); # 4
|
||||
&shl ($acc,24); # 15
|
||||
&or ("ecx",$acc); # 15
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
|
||||
&movz ($acc,&LB("eax")); # 4
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 4
|
||||
&movz ($key,&LB("ebx")); # 14
|
||||
&movd ("eax","mm2"); # 7, 6, 3, 2
|
||||
&movz ($acc,&LB("ebx")); # 14
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
|
||||
&shl ($acc,16); # 14
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
&movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
|
||||
&movz ($key,&HB("eax")); # 3
|
||||
&shl ("ecx",16); # 14
|
||||
&movd ("ebx","mm6"); # 13,12, 9, 8
|
||||
&or ("ecx",$acc); # 14
|
||||
|
||||
&movd ("ebx","mm6"); # 13,12, 9, 8
|
||||
&movz ($acc,&HB("eax")); # 3
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 3
|
||||
&movz ($key,&HB("ebx")); # 9
|
||||
&shl ($acc,24); # 3
|
||||
&or ("ecx",$acc); # 3
|
||||
&movz ($acc,&HB("ebx")); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 9
|
||||
&movz ($key,&LB("ebx")); # 8
|
||||
&shl ($acc,8); # 9
|
||||
&or ("ecx",$acc); # 9
|
||||
&movd ("mm1","ecx"); # t[1] collected
|
||||
|
||||
&movz ($acc,&LB("ebx")); # 8
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
|
||||
&shr ("ebx",16); # 13,12
|
||||
&movz ($acc,&LB("eax")); # 2
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
|
||||
&shl ($acc,16); # 2
|
||||
&or ("ecx",$acc); # 2
|
||||
&or ("ecx",$acc); # 9
|
||||
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 8
|
||||
&movz ($key,&LB("eax")); # 2
|
||||
&shr ("eax",16); # 7, 6
|
||||
&movd ("mm1","ecx"); # t[1] collected
|
||||
&movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
|
||||
&movz ($key,&HB("eax")); # 7
|
||||
&shl ("ecx",16); # 2
|
||||
&and ("eax",0xff); # 6
|
||||
&or ("ecx",$acc); # 2
|
||||
|
||||
&punpckldq ("mm0","mm1"); # t[0,1] collected
|
||||
|
||||
&movz ($acc,&HB("eax")); # 7
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 7
|
||||
&movz ($key,&HB("ebx")); # 13
|
||||
&shl ($acc,24); # 7
|
||||
&or ("ecx",$acc); # 7
|
||||
&and ("eax",0xff); # 6
|
||||
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
|
||||
&shl ("eax",16); # 6
|
||||
&or ("edx","eax"); # 6
|
||||
&movz ($acc,&HB("ebx")); # 13
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
|
||||
&shl ($acc,8); # 13
|
||||
&or ("ecx",$acc); # 13
|
||||
&movd ("mm4","ecx"); # t[2] collected
|
||||
&and ("ebx",0xff); # 12
|
||||
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
|
||||
&or ("ecx",$acc); # 7
|
||||
&shl ("eax",16); # 6
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 13
|
||||
&or ("edx","eax"); # 6
|
||||
&shl ($acc,8); # 13
|
||||
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
|
||||
&or ("ecx",$acc); # 13
|
||||
&or ("edx","ebx"); # 12
|
||||
&mov ($key,$__key);
|
||||
&movd ("mm4","ecx"); # t[2] collected
|
||||
&movd ("mm5","edx"); # t[3] collected
|
||||
|
||||
&punpckldq ("mm4","mm5"); # t[2,3] collected
|
||||
|
@ -1270,30 +1275,30 @@ sub dectransform()
|
|||
my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
|
||||
my $tp8 = $tbl;
|
||||
|
||||
&mov ($acc,$s[$i]);
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&mov ($tmp,0x80808080);
|
||||
&and ($tmp,$s[$i]);
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&lea ($tp2,&DWP(0,$s[$i],$s[$i]));
|
||||
&sub ($acc,$tmp);
|
||||
&and ($tp2,0xfefefefe);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&xor ($acc,$tp2);
|
||||
&mov ($tp2,$acc);
|
||||
&xor ($tp2,$acc);
|
||||
&mov ($tmp,0x80808080);
|
||||
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&and ($tmp,$tp2);
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&lea ($tp4,&DWP(0,$tp2,$tp2));
|
||||
&sub ($acc,$tmp);
|
||||
&and ($tp4,0xfefefefe);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&xor ($tp2,$s[$i]); # tp2^tp1
|
||||
&xor ($acc,$tp4);
|
||||
&mov ($tp4,$acc);
|
||||
&xor ($tp4,$acc);
|
||||
&mov ($tmp,0x80808080);
|
||||
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&and ($tmp,$tp4);
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&lea ($tp8,&DWP(0,$tp4,$tp4));
|
||||
&sub ($acc,$tmp);
|
||||
|
@ -1305,13 +1310,13 @@ sub dectransform()
|
|||
|
||||
&xor ($s[$i],$tp2);
|
||||
&xor ($tp2,$tp8);
|
||||
&rotl ($tp2,24);
|
||||
&xor ($s[$i],$tp4);
|
||||
&xor ($tp4,$tp8);
|
||||
&rotl ($tp4,16);
|
||||
&rotl ($tp2,24);
|
||||
&xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
|
||||
&rotl ($tp8,8);
|
||||
&rotl ($tp4,16);
|
||||
&xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
|
||||
&rotl ($tp8,8);
|
||||
&xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
|
||||
&mov ($s[0],$__s0) if($i==2); #prefetch $s0
|
||||
&mov ($s[1],$__s1) if($i==3); #prefetch $s1
|
||||
|
@ -1389,85 +1394,87 @@ sub dectransform()
|
|||
sub sse_deccompact()
|
||||
{
|
||||
&pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
|
||||
&movd ("eax","mm1"); # 7, 6, 1, 0
|
||||
|
||||
&pshufw ("mm5","mm4",0x09); # 13,12,11,10
|
||||
&movz ($acc,&LB("eax")); # 0
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
||||
&movd ("eax","mm1"); # 7, 6, 1, 0
|
||||
&movd ("ebx","mm5"); # 13,12,11,10
|
||||
&mov ($__key,$key);
|
||||
|
||||
&movz ($acc,&LB("eax")); # 0
|
||||
&movz ("edx",&HB("eax")); # 1
|
||||
&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
||||
&movz ($key,&LB("ebx")); # 10
|
||||
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
|
||||
&shr ("eax",16); # 7, 6
|
||||
&shl ("edx",8); # 1
|
||||
|
||||
&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
|
||||
&movz ($acc,&LB("ebx")); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 10
|
||||
&movz ($key,&HB("ebx")); # 11
|
||||
&shl ($acc,16); # 10
|
||||
&or ("ecx",$acc); # 10
|
||||
&shr ("eax",16); # 7, 6
|
||||
&movz ($acc,&HB("ebx")); # 11
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
|
||||
&shl ($acc,24); # 11
|
||||
&or ("edx",$acc); # 11
|
||||
&shr ("ebx",16); # 13,12
|
||||
|
||||
&pshufw ("mm6","mm4",0x03); # 9, 8,15,14
|
||||
&movz ($acc,&HB("eax")); # 7
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
|
||||
&or ("ecx",$acc); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 11
|
||||
&movz ($key,&HB("eax")); # 7
|
||||
&shl ($acc,24); # 11
|
||||
&shr ("ebx",16); # 13,12
|
||||
&or ("edx",$acc); # 11
|
||||
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 7
|
||||
&movz ($key,&HB("ebx")); # 13
|
||||
&shl ($acc,24); # 7
|
||||
&or ("ecx",$acc); # 7
|
||||
&movz ($acc,&HB("ebx")); # 13
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 13
|
||||
&movz ($key,&LB("eax")); # 6
|
||||
&shl ($acc,8); # 13
|
||||
&or ("ecx",$acc); # 13
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
|
||||
&movz ($acc,&LB("eax")); # 6
|
||||
&movd ("eax","mm2"); # 3, 2, 5, 4
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
|
||||
&shl ("ecx",16); # 6
|
||||
&movz ($acc,&LB("ebx")); # 12
|
||||
&or ("ecx",$acc); # 13
|
||||
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 6
|
||||
&movz ($key,&LB("ebx")); # 12
|
||||
&shl ($acc,16); # 6
|
||||
&movd ("ebx","mm6"); # 9, 8,15,14
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
&movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
|
||||
&movz ($key,&LB("eax")); # 4
|
||||
&or ("ecx",$acc); # 12
|
||||
|
||||
&movz ($acc,&LB("eax")); # 4
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 4
|
||||
&movz ($key,&LB("ebx")); # 14
|
||||
&or ("edx",$acc); # 4
|
||||
&movz ($acc,&LB("ebx")); # 14
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 14
|
||||
&movz ($key,&HB("eax")); # 5
|
||||
&shl ($acc,16); # 14
|
||||
&or ("edx",$acc); # 14
|
||||
&movd ("mm1","edx"); # t[1] collected
|
||||
|
||||
&movz ($acc,&HB("eax")); # 5
|
||||
&movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
|
||||
&shl ("edx",8); # 5
|
||||
&movz ($acc,&HB("ebx")); # 15
|
||||
&shr ("eax",16); # 3, 2
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
|
||||
&shl ($acc,24); # 15
|
||||
&or ("edx",$acc); # 15
|
||||
&or ("edx",$acc); # 14
|
||||
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 5
|
||||
&movz ($key,&HB("ebx")); # 15
|
||||
&shr ("ebx",16); # 9, 8
|
||||
&shl ($acc,8); # 5
|
||||
&movd ("mm1","edx"); # t[1] collected
|
||||
&movz ("edx",&BP(-128,$tbl,$key,1)); # 15
|
||||
&movz ($key,&HB("ebx")); # 9
|
||||
&shl ("edx",24); # 15
|
||||
&and ("ebx",0xff); # 8
|
||||
&or ("edx",$acc); # 15
|
||||
|
||||
&punpckldq ("mm0","mm1"); # t[0,1] collected
|
||||
|
||||
&movz ($acc,&HB("ebx")); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 9
|
||||
&movz ($key,&LB("eax")); # 2
|
||||
&shl ($acc,8); # 9
|
||||
&or ("ecx",$acc); # 9
|
||||
&and ("ebx",0xff); # 8
|
||||
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
|
||||
&or ("edx","ebx"); # 8
|
||||
&movz ($acc,&LB("eax")); # 2
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
|
||||
&shl ($acc,16); # 2
|
||||
&or ("edx",$acc); # 2
|
||||
&movd ("mm4","edx"); # t[2] collected
|
||||
&movz ("eax",&HB("eax")); # 3
|
||||
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
|
||||
&or ("ecx",$acc); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$key,1)); # 2
|
||||
&or ("edx","ebx"); # 8
|
||||
&shl ($acc,16); # 2
|
||||
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
|
||||
&or ("edx",$acc); # 2
|
||||
&shl ("eax",24); # 3
|
||||
&or ("ecx","eax"); # 3
|
||||
&mov ($key,$__key);
|
||||
&movd ("mm4","edx"); # t[2] collected
|
||||
&movd ("mm5","ecx"); # t[3] collected
|
||||
|
||||
&punpckldq ("mm4","mm5"); # t[2,3] collected
|
||||
|
@ -2865,32 +2872,32 @@ sub deckey()
|
|||
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
|
||||
my $tmp = $tbl;
|
||||
|
||||
&mov ($acc,$tp1);
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&shr ($tmp,7);
|
||||
&mov ($tmp,0x80808080);
|
||||
&and ($tmp,$tp1);
|
||||
&lea ($tp2,&DWP(0,$tp1,$tp1));
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&sub ($acc,$tmp);
|
||||
&and ($tp2,0xfefefefe);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&xor ($acc,$tp2);
|
||||
&mov ($tp2,$acc);
|
||||
&xor ($tp2,$acc);
|
||||
&mov ($tmp,0x80808080);
|
||||
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&shr ($tmp,7);
|
||||
&and ($tmp,$tp2);
|
||||
&lea ($tp4,&DWP(0,$tp2,$tp2));
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&sub ($acc,$tmp);
|
||||
&and ($tp4,0xfefefefe);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&xor ($tp2,$tp1); # tp2^tp1
|
||||
&xor ($acc,$tp4);
|
||||
&mov ($tp4,$acc);
|
||||
&xor ($tp4,$acc);
|
||||
&mov ($tmp,0x80808080);
|
||||
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&shr ($tmp,7);
|
||||
&and ($tmp,$tp4);
|
||||
&lea ($tp8,&DWP(0,$tp4,$tp4));
|
||||
&mov ($acc,$tmp);
|
||||
&shr ($tmp,7);
|
||||
&xor ($tp4,$tp1); # tp4^tp1
|
||||
&sub ($acc,$tmp);
|
||||
&and ($tp8,0xfefefefe);
|
||||
|
|
|
@ -27,9 +27,9 @@
|
|||
#
|
||||
# aes-586.pl vpaes-x86.pl
|
||||
#
|
||||
# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
|
||||
# Nehalem 27.9/40.4/18.1 10.3/12.0
|
||||
# Atom 102./119./60.1 64.5/85.3(***)
|
||||
# Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
|
||||
# Nehalem 27.9/40.4/18.1 10.2/11.9
|
||||
# Atom 70.7/92.1/60.1 61.1/81.0(***)
|
||||
#
|
||||
# (*) "Hyper-threading" in the context refers rather to cache shared
|
||||
# among multiple cores, than to specifically Intel HTT. As vast
|
||||
|
@ -40,8 +40,8 @@
|
|||
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
|
||||
#
|
||||
# (***) Less impressive improvement on Core 2 and Atom is due to slow
|
||||
# pshufb, yet it's respectable +32%/65% improvement on Core 2
|
||||
# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
|
||||
# pshufb, yet it's respectable +28%/64% improvement on Core 2
|
||||
# and +15% on Atom (as implied, over "hyper-threading-safe"
|
||||
# code path).
|
||||
#
|
||||
# <appro@openssl.org>
|
||||
|
@ -183,35 +183,35 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
|||
&movdqa ("xmm1","xmm6")
|
||||
&movdqa ("xmm2",&QWP($k_ipt,$const));
|
||||
&pandn ("xmm1","xmm0");
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&psrld ("xmm1",4);
|
||||
&pand ("xmm0","xmm6");
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&pshufb ("xmm2","xmm0");
|
||||
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&pxor ("xmm2","xmm5");
|
||||
&pxor ("xmm0","xmm2");
|
||||
&psrld ("xmm1",4);
|
||||
&add ($key,16);
|
||||
&pshufb ("xmm0","xmm1");
|
||||
&lea ($base,&DWP($k_mc_backward,$const));
|
||||
&pxor ("xmm0","xmm2");
|
||||
&jmp (&label("enc_entry"));
|
||||
|
||||
|
||||
&set_label("enc_loop",16);
|
||||
# middle of middle round
|
||||
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sb1u
|
||||
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
|
||||
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sb1u
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb1t
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
|
||||
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
|
||||
&pshufb ("xmm5","xmm2"); # 4 = sb2u
|
||||
&pxor ("xmm0","xmm4"); # 0 = A
|
||||
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
|
||||
&pshufb ("xmm5","xmm2"); # 4 = sb2u
|
||||
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
|
||||
&pshufb ("xmm2","xmm3"); # 2 = sb2t
|
||||
&pxor ("xmm2","xmm5"); # 2 = 2A
|
||||
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
|
||||
&pshufb ("xmm2","xmm3"); # 2 = sb2t
|
||||
&movdqa ("xmm3","xmm0"); # 3 = A
|
||||
&pxor ("xmm2","xmm5"); # 2 = 2A
|
||||
&pshufb ("xmm0","xmm1"); # 0 = B
|
||||
&add ($key,16); # next key
|
||||
&pxor ("xmm0","xmm2"); # 0 = 2A+B
|
||||
|
@ -220,30 +220,30 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
|||
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
|
||||
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
|
||||
&and ($magic,0x30); # ... mod 4
|
||||
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
|
||||
&sub ($round,1); # nr--
|
||||
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
|
||||
|
||||
&set_label("enc_entry");
|
||||
# top of round
|
||||
&movdqa ("xmm1","xmm6"); # 1 : i
|
||||
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pandn ("xmm1","xmm0"); # 1 = i<<4
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm6"); # 0 = k
|
||||
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pshufb ("xmm5","xmm0"); # 2 = a/k
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/i
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
|
||||
&movdqa ("xmm4","xmm7"); # 4 : 1/j
|
||||
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
|
||||
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
|
||||
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&movdqu ("xmm5",&QWP(0,$key));
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&jnz (&label("enc_loop"));
|
||||
|
||||
|
@ -265,8 +265,8 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
|||
## Same API as encryption core.
|
||||
##
|
||||
&function_begin_B("_vpaes_decrypt_core");
|
||||
&mov ($round,&DWP(240,$key));
|
||||
&lea ($base,&DWP($k_dsbd,$const));
|
||||
&mov ($round,&DWP(240,$key));
|
||||
&movdqa ("xmm1","xmm6");
|
||||
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
|
||||
&pandn ("xmm1","xmm0");
|
||||
|
@ -292,62 +292,61 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
|||
## Inverse mix columns
|
||||
##
|
||||
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
|
||||
&movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sb9u
|
||||
&pshufb ("xmm1","xmm3"); # 0 = sb9t
|
||||
&pxor ("xmm4","xmm0");
|
||||
&movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sb9t
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
&add ($key,16); # next round key
|
||||
&pxor ("xmm1","xmm4"); # 0 = ch
|
||||
|
||||
&pshufb ("xmm0","xmm5"); # MC ch
|
||||
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
|
||||
&pshufb ("xmm1","xmm5"); # MC ch
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbdu
|
||||
&pxor ("xmm4","xmm0"); # 4 = ch
|
||||
&movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
|
||||
&pxor ("xmm4","xmm1"); # 4 = ch
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbdt
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
&sub ($round,1); # nr--
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
|
||||
&pshufb ("xmm0","xmm5"); # MC ch
|
||||
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbbu
|
||||
&pxor ("xmm4","xmm0"); # 4 = ch
|
||||
&movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbbt
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
|
||||
&pshufb ("xmm0","xmm5"); # MC ch
|
||||
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbeu
|
||||
&movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbbu
|
||||
&pshufb ("xmm1","xmm3"); # 0 = sbbt
|
||||
&pxor ("xmm4","xmm0"); # 4 = ch
|
||||
&movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbet
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
&pxor ("xmm1","xmm4"); # 0 = ch
|
||||
|
||||
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
|
||||
&pshufb ("xmm1","xmm5"); # MC ch
|
||||
&movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
|
||||
&pshufb ("xmm4","xmm2"); # 4 = sbeu
|
||||
&pshufb ("xmm0","xmm3"); # 0 = sbet
|
||||
&palignr("xmm5","xmm5",12);
|
||||
&pxor ("xmm4","xmm1"); # 4 = ch
|
||||
&pxor ("xmm0","xmm4"); # 0 = ch
|
||||
|
||||
&set_label("dec_entry");
|
||||
# top of round
|
||||
&movdqa ("xmm1","xmm6"); # 1 : i
|
||||
&pandn ("xmm1","xmm0"); # 1 = i<<4
|
||||
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&psrld ("xmm1",4); # 1 = i
|
||||
&pand ("xmm0","xmm6"); # 0 = k
|
||||
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
|
||||
&pshufb ("xmm2","xmm0"); # 2 = a/k
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/i
|
||||
&pxor ("xmm0","xmm1"); # 0 = j
|
||||
&pshufb ("xmm3","xmm1"); # 3 = 1/i
|
||||
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
|
||||
&movdqa ("xmm4","xmm7"); # 4 : 1/j
|
||||
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
|
||||
&pshufb ("xmm4","xmm0"); # 4 = 1/j
|
||||
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
|
||||
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
|
||||
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
|
||||
&pxor ("xmm2","xmm0"); # 2 = io
|
||||
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&movdqu ("xmm0",&QWP(0,$key));
|
||||
&pxor ("xmm3","xmm1"); # 3 = jo
|
||||
&jnz (&label("dec_loop"));
|
||||
|
||||
# middle of last round
|
||||
|
@ -542,12 +541,12 @@ $k_dsbo=0x2c0; # decryption sbox final output
|
|||
## %xmm0: b+c+d b+c b a
|
||||
##
|
||||
&function_begin_B("_vpaes_schedule_192_smear");
|
||||
&pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
|
||||
&pxor ("xmm6","xmm0"); # -> c+d c 0 0
|
||||
&pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
|
||||
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
|
||||
&pxor ("xmm6","xmm1"); # -> c+d c 0 0
|
||||
&pxor ("xmm1","xmm1");
|
||||
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
|
||||
&movdqa ("xmm0","xmm6");
|
||||
&pxor ("xmm1","xmm1");
|
||||
&movhlps("xmm6","xmm1"); # clobber low side with zeros
|
||||
&ret ();
|
||||
&function_end_B("_vpaes_schedule_192_smear");
|
||||
|
|
Loading…
Reference in a new issue