x86_64 assembly pack: add Goldmont performance results.

Reviewed-by: Richard Levitte <levitte@openssl.org>
2016-10-14 13:25:06 +02:00 · 2016-10-14 13:25:06 +02:00 · ace05265d2
commit ace05265d2
parent c3086f4630
11 changed files with 14 additions and 1 deletions
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@ -179,6 +179,7 @@
 # Haswell	4.44/0.63	0.63	0.73	0.63	0.70
 # Skylake	2.62/0.63	0.63	0.63	0.63
 # Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
+# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
 # Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
 #
 # (*)	Atom Silvermont ECB result is suboptimal because of penalties
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@ -48,6 +48,7 @@
 # Nehalem(**) 	7.63		6.88		+11%
 # Atom	    	17.1		16.4		+4%
 # Silvermont	-		12.9
+# Goldmont	-		8.85
 #
 # (*)	Comparison is not completely fair, because "this" is ECB,
 #	i.e. no extra processing such as counter values calculation
@ -87,6 +88,7 @@
 # Nehalem	7.80
 # Atom		17.9
 # Silvermont	14.0
+# Goldmont	10.2
 #
 # November 2011.
 #
--- a/crypto/aes/asm/vpaes-x86_64.pl
+++ b/crypto/aes/asm/vpaes-x86_64.pl
@ -38,6 +38,7 @@
 # Nehalem	29.6/40.3/14.6		10.0/11.8
 # Atom		57.3/74.2/32.1		60.9/77.2(***)
 # Silvermont	52.7/64.0/19.5		48.8/60.8(***)
+# Goldmont	38.9/49.0/17.8		10.6/12.6
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
--- a/crypto/chacha/asm/chacha-x86.pl
+++ b/crypto/chacha/asm/chacha-x86.pl
@ -29,6 +29,7 @@
 # Sandy Bridge	10.5/+47%	3.20
 # Haswell	8.15/+50%	2.83
 # Silvermont	17.4/+36%	8.35
+# Goldmont	13.4/+40%	4.36
 # Sledgehammer	10.2/+54%
 # Bulldozer	13.4/+50%	4.38(*)
 #
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@ -29,6 +29,7 @@
 # Ivy Bridge	6.71/+46%	5.40/6.49	2.41
 # Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
 # Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
+# Goldmont	10.6/+17%	5.10/-		3.28
 # Sledgehammer	7.28/+52%	-/14.2(ii)	-
 # Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
 # VIA Nano	10.5/+46%	6.72/8.60	6.05
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@ -74,6 +74,7 @@
 # Skylake	0.44(+110%)(if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
 # Silvermont	2.88(+13%)
+# Goldmont	1.08(+24%)

 # March 2013
 #
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@ -30,6 +30,7 @@
 # Sandy Bridge	3.90/+100%	1.36
 # Haswell	3.88/+70%	1.18		0.72
 # Silvermont	11.0/+40%	4.80
+# Goldmont	4.10/+200%	2.10
 # VIA Nano	6.71/+90%	2.47
 # Sledgehammer	3.51/+180%	4.27
 # Bulldozer	4.53/+140%	1.31
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@ -29,6 +29,7 @@
 # Haswell	1.14/+175%	1.11		0.65
 # Skylake	1.13/+120%	0.96		0.51
 # Silvermont	2.83/+95%	-
+# Goldmont	1.70/+180%	-
 # VIA Nano	1.82/+150%	-
 # Sledgehammer	1.38/+160%	-
 # Bulldozer	2.30/+130%	0.97
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@ -85,9 +85,11 @@
 # VIA Nano	9.32		7.15/+30%
 # Atom		10.3		9.17/+12%
 # Silvermont	13.1(*)		9.37/+40%
+# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
 #
 # (*)	obviously suboptimal result, nothing was done about it,
 #	because SSSE3 code is compiled unconditionally;
+# (**)	SHAEXT result

 $flavour = shift;
 $output  = shift;
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@ -36,6 +36,7 @@
 # VIA Nano	91	-	52	33	14.7
 # Atom		126	-	68	48(***)	14.7
 # Silvermont	97	-	58	42(***)	17.5
+# Goldmont	80	-	48	19.5	12.0
 #
 # (*)	whichever best applicable.
 # (**)	x86_64 assembler performance is presented for reference
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@ -98,8 +98,9 @@
 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
 # Atom		23.0	18.9(+22%)  -		    14.7    -
 # Silvermont	27.4	20.6(+33%)  -               17.5    -
+# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
 #
-# (*)	whichever best applicable;
+# (*)	whichever best applicable, including SHAEXT;
 # (**)	switch from ror to shrd stands for fair share of improvement;
 # (***)	execution time is fully determined by remaining integer-only
 #	part, body_00_15; reducing the amount of SIMD instructions