Minor sha[256|512]-586 performance tweaks.

This commit is contained in:
Andy Polyakov 2007-09-16 18:47:24 +00:00
parent cc3d7bd0fc
commit 8dc899dee4
2 changed files with 36 additions and 28 deletions

View file

@ -45,13 +45,17 @@ $Xoff=&DWP(32,"esp");
$K256="ebp";
sub BODY_00_15() {
my $in_16_64=shift;
&mov ("ecx",$E);
&add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_64); # T += X[-7]
&ror ("ecx",6);
&mov ("edi",$E);
&ror ("edi",11);
&mov ("esi",$Foff);
&xor ("ecx","edi");
&ror ("edi",25-11);
&mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_64); # save X[0]
&xor ("ecx","edi"); # Sigma1(e)
&mov ("edi",$Goff);
&add ($T,"ecx"); # T += Sigma1(e)
@ -88,6 +92,7 @@ sub BODY_00_15() {
&add ($K256,4);
&add ($A,$T); # h += T
&mov ($T,&DWP(4*(8+15+16-1),"esp")) if ($in_16_64); # preload T
&add ($E,"esi"); # d += K256[i]
&add ($A,"esi"); # h += K256[i]
}
@ -159,10 +164,10 @@ sub BODY_00_15() {
&cmp ("esi",0xc19bf174);
&jne (&label("00_15"));
&mov ($T,&DWP(4*(8+15+16-1),"esp")); # preloaded in BODY_00_15(1)
&set_label("16_63",16);
&mov ($T,&DWP(4*(8+15+16-1),"esp"));
&mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
&mov ("esi",$T);
&mov ("ecx",&DWP(4*(8+15+16-14),"esp"));
&shr ($T,3);
&ror ("esi",7);
&xor ($T,"esi");
@ -176,13 +181,13 @@ sub BODY_00_15() {
&xor ("ecx","edi");
&ror ("edi",19-17);
&add ($T,"esi"); # T += X[-16]
&xor ("ecx","edi") # sigma1(X[-2])
&xor ("edi","ecx") # sigma1(X[-2])
&add ($T,"ecx"); # T += sigma1(X[-2])
&add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7]
&mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
&add ($T,"edi"); # T += sigma1(X[-2])
# &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1)
# &mov (&DWP(4*(8+15),"esp"),$T); # save X[0]
&BODY_00_15();
&BODY_00_15(1);
&cmp ("esi",0xc67178f2);
&jne (&label("16_63"));

View file

@ -68,6 +68,8 @@ $E="mm4"; # F-H are commonly loaded to respectively mm1-mm3 and
# mm5-mm7, but it's done on an on-demand basis...
sub BODY_00_15_sse2 {
my $prefetch=shift;
&movq ("mm5",$Fsse2); # load f
&movq ("mm6",$Gsse2); # load g
&movq ("mm7",$Hsse2); # load h
@ -96,7 +98,7 @@ sub BODY_00_15_sse2 {
&pxor ("mm5","mm6"); # f^=g
&movq ($E,$Dsse2); # e = load d
&paddq ("mm3","mm5"); # T1+=Ch(e,f,g)
&movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
&paddq ("mm3","mm7"); # T1+=h
&movq ("mm5",$A); # %mm5 is sliding right
@ -114,15 +116,16 @@ sub BODY_00_15_sse2 {
&pxor ("mm7","mm6");
&psllq ("mm6",6);
&pxor ("mm7","mm5");
&movq (&QWP(0,"esp"),$A); # modulo-scheduled save a
&sub ("esp",8);
&pxor ("mm7","mm6"); # T2=Sigma0_512(a)
&movq ("mm5",$A); # %mm5=a
&por ($A,"mm2"); # a=a|c
&movq ("mm6",&QWP(8*(9+16-14),"esp")) if ($prefetch);
&pand ("mm5","mm2"); # %mm5=a&c
&pand ($A,"mm1"); # a=(a|c)&b
&movq ("mm2",&QWP(8*(9+16-1),"esp")) if ($prefetch);
&por ("mm5",$A); # %mm5=(a&c)|((a|c)&b)
&sub ("esp",8);
&paddq ("mm7","mm5"); # T2+=Maj(a,b,c)
&movq ($A,"mm3"); # a=T1
@ -327,48 +330,48 @@ if ($sse2) {
&cmp (&LB("edx"),0x35);
&jne (&label("00_14_sse2"));
&BODY_00_15_sse2();
&BODY_00_15_sse2(1);
&set_label("16_79_sse2",16);
&movq ("mm3",&QWP(8*(9+16-1),"esp"));
&movq ("mm6",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm3");
#&movq ("mm2",&QWP(8*(9+16-1),"esp")); #prefetched in BODY_00_15
#&movq ("mm6",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm2");
&psrlq ("mm3",1);
&psrlq ("mm2",1);
&movq ("mm7","mm6");
&psrlq ("mm6",6);
&movq ("mm2","mm3");
&movq ("mm3","mm2");
&psrlq ("mm3",7-1);
&psrlq ("mm2",7-1);
&movq ("mm5","mm6");
&psrlq ("mm6",19-6);
&pxor ("mm2","mm3");
&pxor ("mm3","mm2");
&psrlq ("mm3",8-7);
&psrlq ("mm2",8-7);
&pxor ("mm5","mm6");
&psrlq ("mm6",61-19);
&pxor ("mm2","mm3");
&pxor ("mm3","mm2");
&movq ("mm3",&QWP(8*(9+16),"esp"));
&movq ("mm2",&QWP(8*(9+16),"esp"));
&psllq ("mm1",56);
&pxor ("mm5","mm6");
&psllq ("mm7",3);
&pxor ("mm2","mm1");
&pxor ("mm3","mm1");
&paddq ("mm3",&QWP(8*(9+16-9),"esp"));
&paddq ("mm2",&QWP(8*(9+16-9),"esp"));
&psllq ("mm1",63-56);
&pxor ("mm5","mm7");
&psllq ("mm7",45-3);
&pxor ("mm2","mm1");
&pxor ("mm3","mm1");
&pxor ("mm5","mm7");
&paddq ("mm2","mm5");
&paddq ("mm2","mm3");
&movq (&QWP(8*9,"esp"),"mm2");
&paddq ("mm3","mm5");
&paddq ("mm3","mm2");
&movq (&QWP(8*9,"esp"),"mm3");
&BODY_00_15_sse2();
&BODY_00_15_sse2(1);
&cmp (&LB("edx"),0x17);
&jne (&label("16_79_sse2"));