s390x assembler pack: adapt for -m31 build, see commentary in Configure

for more details.
2010-11-29 20:52:43 +00:00 · 2010-11-29 20:52:43 +00:00 · e822c756b6
commit e822c756b6
parent 300b1d76fe
9 changed files with 329 additions and 140 deletions
--- a/18
+++ b/18
@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void";
 my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::";
 my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::";
-my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:void";
+my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
 my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
 my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
 my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
@ -356,7 +356,21 @@ my %table=(
 "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-x86_64",	"gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
-"linux-s390x",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+"linux64-s390x",	"gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+#### So called "highgprs" target for z/Architecture CPUs
+# "Highgprs" is kernel feature first implemented in Linux 2.6.32, see
+# /proc/cpuinfo. The idea is to preserve most significant bits of
+# general purpose registers not only upon 32-bit process context
+# switch, but even on asynchronous signal delivery to such process.
+# This makes it possible to deploy 64-bit instructions even in legacy
+# application context and achieve better [or should we say adequate]
+# performance. The build is binary compatible with linux-generic32,
+# and the idea is to be able to install the resulting libcrypto.so
+# alongside generic one, e.g. as /lib/highgprs/libcrypto.so.x.y, for
+# ldconfig and run-time linker to autodiscover. Unfortunately it
+# doesn't work just yet, because of couple of bugs in glibc
+# sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1...
+"linux32-s390x",	"gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs",
 #### SPARC Linux setups
 # Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently
 # assisted with debugging of following two configs.
--- a/13
+++ b/13
@ -629,7 +629,18 @@ case "$GUESSOS" in
  sh*-*-linux2)  OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
  m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
  s390-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
-  s390x-*-linux2) OUT="linux-s390x" ;;
+  s390x-*-linux2)
+	# To be uncommented when glibc bug is fixed, see Configure...
+	#if egrep -e '^features.* highgprs' /proc/cpuinfo >/dev/null ; then
+	#  echo "WARNING! If you wish to build \"highgprs\" 32-bit library, then you"
+	#  echo "         have to invoke './Configure linux32-s390x' *manually*."
+	#  if [ "$TEST" = "false" -a -t -1 ]; then
+	#    echo "         You have about 5 seconds to press Ctrl-C to abort."
+	#    (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
+	#  fi
+	#fi
+	OUT="linux64-s390x"
+	;;
  x86_64-*-linux?) OUT="linux-x86_64" ;;
  *86-*-linux2) OUT="linux-elf"
 	if [ "$GCCVER" -gt 28 ]; then
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@ -60,6 +60,26 @@
 # maximum, but *on average* it would be as much as ~98%. Meaning that
 # worst case is unlike, it's like hitting ravine on plateau.

+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

@ -82,6 +102,8 @@ $rounds="%r13";
 $ra="%r14";
 $sp="%r15";

+$stdframe=16*$SIZE_T+4*8;
+
 sub _data_word()
 { my $i;
    while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly);
 .Lesoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)

 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@ -233,20 +255,20 @@ $code.=<<___;
 	larl	$tbl,AES_Te
 	bras	$ra,_s390x_AES_encrypt

-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)

-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_encrypt,.-AES_encrypt

 .type   _s390x_AES_encrypt,\@function
 .align	16
 _s390x_AES_encrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,`$stdframe-$SIZE_T`($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@ -410,7 +432,7 @@ _s390x_AES_encrypt:
 	or	$s2,$i3
 	or	$s3,$t3

-	lg	$ra,152($sp)
+	l${g}	$ra,`$stdframe-$SIZE_T`($sp)
 	xr	$s0,$t0
 	xr	$s1,$t2
 	x	$s2,24($key)
@ -549,7 +571,7 @@ $code.=<<___ if (!$softonly);
 .Ldsoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)

 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@ -559,20 +581,20 @@ $code.=<<___;
 	larl	$tbl,AES_Td
 	bras	$ra,_s390x_AES_decrypt

-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)

-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_decrypt,.-AES_decrypt

 .type   _s390x_AES_decrypt,\@function
 .align	16
 _s390x_AES_decrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,`$stdframe-$SIZE_T`($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@ -716,7 +738,7 @@ _s390x_AES_decrypt:
 	nr	$i1,$mask
 	nr	$i2,$mask

-	lg	$ra,152($sp)
+	l${g}	$ra,`$stdframe-$SIZE_T`($sp)
 	or	$s1,$t1
 	l	$t0,16($key)
 	l	$t1,20($key)
@ -750,9 +772,9 @@ $code.=<<___;
 .align	16
 AES_set_encrypt_key:
 	lghi	$t0,0
-	clgr	$inp,$t0
+	cl${g}r	$inp,$t0
 	je	.Lminus1
-	clgr	$key,$t0
+	cl${g}r	$key,$t0
 	je	.Lminus1

 	lghi	$t0,128
@ -810,7 +832,7 @@ ___
 $code.=<<___;
 .align	16
 .Lekey_internal:
-	stmg	%r6,%r13,48($sp)	# all non-volatile regs
+	stm${g}	%r6,%r13,6*$SIZE_T($sp)	# all non-volatile regs

 	larl	$tbl,AES_Te+2048

@ -871,7 +893,7 @@ $code.=<<___;
 	la	$t3,4($t3)		# i++
 	brct	$rounds,.L128_loop
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)
 	br	$ra

 .align	16
@ -919,7 +941,7 @@ $code.=<<___;
 	st	$s3,36($key)
 	brct	$rounds,.L192_continue
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)
 	br	$ra

 .align	16
@ -981,7 +1003,7 @@ $code.=<<___;
 	st	$s3,44($key)
 	brct	$rounds,.L256_continue
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)
 	br	$ra

 .align	16
@ -1032,11 +1054,11 @@ $code.=<<___;
 .type	AES_set_decrypt_key,\@function
 .align	16
 AES_set_decrypt_key:
-	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
-	stg	$ra,112($sp)		# save non-volatile registers!
+	st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
+	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers!
 	bras	$ra,AES_set_encrypt_key
-	lg	$key,32($sp)
-	lg	$ra,112($sp)
+	l${g}	$key,4*$SIZE_T($sp)
+	l${g}	$ra,14*$SIZE_T($sp)
 	ltgr	%r2,%r2
 	bnzr	$ra
 ___
@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly);

 .align	16
 .Ldkey_internal:
-	stg	$key,32($sp)
-	stg	$ra,40($sp)
+	st${g}	$key,4*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
 	bras	$ra,.Lekey_internal
-	lg	$key,32($sp)
-	lg	$ra,40($sp)
+	l${g}	$key,4*$SIZE_T($sp)
+	l${g}	$ra,14*$SIZE_T($sp)
 ___
 $code.=<<___;

@ -1136,7 +1158,7 @@ $code.=<<___;
 	la	$key,4($key)
 	brct	$rounds,.Lmix

-	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
 	lghi	%r2,0
 	br	$ra
 .size	AES_set_decrypt_key,.-AES_set_decrypt_key
@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly);
 	l	%r0,240($key)	# load kmc code
 	lghi	$key,15		# res=len%16, len-=res;
 	ngr	$key,$len
-	slgr	$len,$key
+	sl${g}r	$len,$key
 	la	%r1,16($sp)	# parameter block - ivec || key
 	jz	.Lkmc_truncated
 	.long	0xb92f0042	# kmc %r4,%r2
@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly);
 	tmll	%r0,0x80
 	jnz	.Lkmc_truncated_dec
 	lghi	%r1,0
-	stg	%r1,128($sp)
-	stg	%r1,136($sp)
+	stg	%r1,16*$SIZE_T($sp)
+	stg	%r1,16*$SIZE_T+8($sp)
 	bras	%r1,1f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 1:	ex	$key,0(%r1)
 	la	%r1,16($sp)	# restore parameter block
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
 	j	.Lkmc_done
 .align	16
 .Lkmc_truncated_dec:
-	stg	$out,64($sp)
-	la	$out,128($sp)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$out,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
-	lg	$out,64($sp)
+	l${g}	$out,4*$SIZE_T($sp)
 	bras	%r1,2f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 2:	ex	$key,0(%r1)
 	j	.Lkmc_done
 .align	16
 .Lcbc_software:
 ___
 $code.=<<___;
-	stmg	$key,$ra,40($sp)
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
 	lhi	%r0,0
-	cl	%r0,164($sp)
+	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
 	je	.Lcbc_decrypt

 	larl	$tbl,AES_Te
@ -1232,10 +1254,10 @@ $code.=<<___;
 	llgf	$s3,12($ivp)

 	lghi	$t0,16
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 .Lcbc_enc_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	x	$s0,0($inp)
 	x	$s1,4($inp)
 	x	$s2,8($inp)
@ -1244,7 +1266,7 @@ $code.=<<___;

 	bras	$ra,_s390x_AES_encrypt

-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
@ -1253,33 +1275,33 @@ $code.=<<___;
 	la	$inp,16($inp)
 	la	$out,16($out)
 	lghi	$t0,16
-	ltgr	$len,$len
+	lt${g}r	$len,$len
 	jz	.Lcbc_enc_done
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 	j	.Lcbc_enc_loop
 .align	16
 .Lcbc_enc_done:
-	lg	$ivp,48($sp)
+	l${g}	$ivp,6*$SIZE_T($sp)
 	st	$s0,0($ivp)
 	st	$s1,4($ivp)	
 	st	$s2,8($ivp)
 	st	$s3,12($ivp)

-	lmg	%r7,$ra,56($sp)
+	lm${g}	%r7,$ra,7*$SIZE_T($sp)
 	br	$ra

 .align	16
 .Lcbc_enc_tail:
 	aghi	$len,15
 	lghi	$t0,0
-	stg	$t0,128($sp)
-	stg	$t0,136($sp)
+	stg	$t0,16*$SIZE_T($sp)
+	stg	$t0,16*$SIZE_T+8($sp)
 	bras	$t1,3f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 3:	ex	$len,0($t1)
 	lghi	$len,0
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	j	.Lcbc_enc_loop

 .align	16
@ -1288,10 +1310,10 @@ $code.=<<___;

 	lg	$t0,0($ivp)
 	lg	$t1,8($ivp)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)

 .Lcbc_dec_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
 	llgf	$s2,8($inp)
@ -1300,7 +1322,7 @@ $code.=<<___;

 	bras	$ra,_s390x_AES_decrypt

-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	sllg	$s0,$s0,32
 	sllg	$s2,$s2,32
 	lr	$s0,$s1
@ -1308,15 +1330,15 @@ $code.=<<___;

 	lg	$t0,0($inp)
 	lg	$t1,8($inp)
-	xg	$s0,128($sp)
-	xg	$s2,136($sp)
+	xg	$s0,16*$SIZE_T($sp)
+	xg	$s2,16*$SIZE_T+8($sp)
 	lghi	$s1,16
-	slgr	$len,$s1
+	sl${g}r	$len,$s1
 	brc	4,.Lcbc_dec_tail	# if borrow
 	brc	2,.Lcbc_dec_done	# if zero
 	stg	$s0,0($out)
 	stg	$s2,8($out)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)

 	la	$inp,16($inp)
 	la	$out,16($out)
@ -1326,7 +1348,7 @@ $code.=<<___;
 	stg	$s0,0($out)
 	stg	$s2,8($out)
 .Lcbc_dec_exit:
-	lmg	$ivp,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	stmg	$t0,$t1,0($ivp)

 	br	$ra
@ -1334,10 +1356,10 @@ $code.=<<___;
 .align	16
 .Lcbc_dec_tail:
 	aghi	$len,15
-	stg	$s0,128($sp)
-	stg	$s2,136($sp)
+	stg	$s0,16*$SIZE_T($sp)
+	stg	$s2,16*$SIZE_T+8($sp)
 	bras	$s1,4f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 4:	ex	$len,0($s1)
 	j	.Lcbc_dec_exit
 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
@ -1359,6 +1381,7 @@ $code.=<<___;
 .type	AES_ctr32_encrypt,\@function
 .align	16
 AES_ctr32_encrypt:
+	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
 ___
 $code.=<<___ if (!$softonly);
 	l	%r0,240($key)
@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly);
 	clr	%r0,%r1
 	jl	.Lctr32_software

-	stmg	%r6,$s3,48($sp)
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)

 	slgr	$out,$inp
 	la	%r1,0($key)	# %r1 is permanent copy of $key
@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly);

 	la	$sp,1024($s0)	# alloca
 	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
-	stg	$s2,0($sp)	# back-chain
-	stg	$fp,8($sp)
+	st${g}	$s2,0($sp)	# back-chain
+	st${g}	$fp,$SIZE_T($sp)

 	slgr	$len,$fp
 	brc	1,.Lctr32_hw_loop	# not zero, no borrow
 	algr	$fp,$len	# input is shorter than allocated buffer
 	lghi	$len,0
-	stg	$fp,8($sp)
+	st${g}	$fp,$SIZE_T($sp)

 .Lctr32_hw_loop:
 	la	$s2,16($sp)
@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly);
 	lghi	$len,0
 	brc	4+1,.Lctr32_hw_loop	# not zero

-	lg	$s0,0($sp)
-	lg	$s1,8($sp)
+	l${g}	$s0,0($sp)
+	l${g}	$s1,$SIZE_T($sp)
 	la	$s2,16($sp)
 .Lctr32_hw_zap:
 	stg	$s0,0($s2)
@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly);
 	brct	$s1,.Lctr32_hw_zap

 	la	$sp,0($s0)
-	lmg	%r6,$s3,48($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
 	br	$ra
 .align	16
 .Lctr32_software:
 ___
 $code.=<<___;
-	stmg	$key,$ra,40($sp)
-	slgr	$out,$inp
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
+	sl${g}r	$out,$inp
 	larl	$tbl,AES_Te
 	llgf	$t1,12($ivp)

 .Lctr32_loop:
-	stmg	$inp,$len,16($sp)
+	stm${g}	$inp,$len,2*$SIZE_T($sp)
 	llgf	$s0,0($ivp)
 	llgf	$s1,4($ivp)
 	llgf	$s2,8($ivp)
 	lgr	$s3,$t1
-	st	$t1,128($sp)
+	st	$t1,16*$SIZE_T($sp)
 	lgr	%r4,$key

 	bras	$ra,_s390x_AES_encrypt

-	lmg	$inp,$ivp,16($sp)
-	llgf	$t1,128($sp)
+	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
+	llgf	$t1,16*$SIZE_T($sp)
 	x	$s0,0($inp)
 	x	$s1,4($inp)
 	x	$s2,8($inp)
@ -1479,7 +1502,7 @@ $code.=<<___;
 	ahi	$t1,1		# 32-bit increment
 	brct	$len,.Lctr32_loop

-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
 ___
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl
@ -32,9 +32,33 @@
 # Reschedule to minimize/avoid Address Generation Interlock hazard,
 # make inner loops counter-based.

+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
+# is achieved by swapping words after 64-bit loads, follow _dswap-s.
+# On z990 it was measured to perform 2.6-2.2 times better, less for
+# longer keys...
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

+$stdframe=16*$SIZE_T+4*8;
+
 $mn0="%r0";
 $num="%r1";

@ -63,34 +87,44 @@ $code.=<<___;
 .globl	bn_mul_mont
 .type	bn_mul_mont,\@function
 bn_mul_mont:
-	lgf	$num,164($sp)	# pull $num
-	sla	$num,3		# $num to enumerate bytes
+	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
+	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
 	la	$bp,0($num,$bp)

-	stg	%r2,16($sp)
+	st${g}	%r2,2*$SIZE_T($sp)

 	cghi	$num,16		#
 	lghi	%r2,0		#
 	blr	%r14		# if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	tmll	$num,4
+	bnzr	%r14		# if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
 	cghi	$num,128	#
 	bhr	%r14		# if($num>128) return 0;
+___
+$code.=<<___;
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)

-	stmg	%r3,%r15,24($sp)
-
-	lghi	$rp,-160-8	# leave room for carry bit
+	lghi	$rp,-$stdframe-8	# leave room for carry bit
 	lcgr	$j,$num		# -$num
 	lgr	%r0,$sp
 	la	$rp,0($rp,$sp)
 	la	$sp,0($j,$rp)	# alloca
-	stg	%r0,0($sp)	# back chain
+	st${g}	%r0,0($sp)	# back chain

 	sra	$num,3		# restore $num
 	la	$bp,0($j,$bp)	# restore $bp
 	ahi	$num,-1		# adjust $num for inner loop
 	lg	$n0,0($n0)	# pull n0
+	_dswap	$n0

 	lg	$bi,0($bp)
+	_dswap	$bi
 	lg	$alo,0($ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[0]
 	lgr	$AHI,$ahi

@ -98,6 +132,7 @@ bn_mul_mont:
 	msgr	$mn0,$n0

 	lg	$nlo,0($np)	#
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
@ -109,12 +144,14 @@ bn_mul_mont:
 .align	16
 .L1st:
 	lg	$alo,0($j,$ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[0]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi

 	lg	$nlo,0($j,$np)
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
@ -122,22 +159,24 @@ bn_mul_mont:
 	algr	$nlo,$alo
 	alcgr	$NHI,$nhi

-	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.L1st

 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI	# upmost overflow bit
-	stg	$NHI,160-8($j,$sp)
-	stg	$AHI,160($j,$sp)
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
 	la	$bp,8($bp)	# bp++

 .Louter:
 	lg	$bi,0($bp)	# bp[i]
+	_dswap	$bi
 	lg	$alo,0($ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[i]
-	alg	$alo,160($sp)	# +=tp[0]
+	alg	$alo,$stdframe($sp)	# +=tp[0]
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi

@ -145,6 +184,7 @@ bn_mul_mont:
 	msgr	$mn0,$n0	# tp[0]*n0

 	lg	$nlo,0($np)	# np[0]
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
@ -156,14 +196,16 @@ bn_mul_mont:
 .align	16
 .Linner:
 	lg	$alo,0($j,$ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[i]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$ahi,$AHI
-	alg	$alo,160($j,$sp)# +=tp[j]
+	alg	$alo,$stdframe($j,$sp)# +=tp[j]
 	alcgr	$AHI,$ahi

 	lg	$nlo,0($j,$np)
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
@ -171,31 +213,33 @@ bn_mul_mont:
 	algr	$nlo,$alo	# +="tp[j]"
 	alcgr	$NHI,$nhi

-	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.Linner

 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI
-	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
+	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
 	lghi	$ahi,0
 	alcgr	$AHI,$ahi	# new upmost overflow bit
-	stg	$NHI,160-8($j,$sp)
-	stg	$AHI,160($j,$sp)
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)

 	la	$bp,8($bp)	# bp++
-	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
+	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
 	jne	.Louter

-	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
-	la	$ap,160($sp)
+	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
+	la	$ap,$stdframe($sp)
 	ahi	$num,1		# restore $num, incidentally clears "borrow"

 	la	$j,0(%r0)
 	lr	$count,$num
 .Lsub:	lg	$alo,0($j,$ap)
-	slbg	$alo,0($j,$np)
+	lg	$nlo,0($j,$np)
+	_dswap	$nlo
+	slbgr	$alo,$nlo
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lsub
@ -210,19 +254,24 @@ bn_mul_mont:

 	la	$j,0(%r0)
 	lgr	$count,$num
-.Lcopy:	lg	$alo,0($j,$ap)	# copy or in-place refresh
-	stg	$j,160($j,$sp)	# zap tp
+.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
+	_dswap	$alo
+	stg	$j,$stdframe($j,$sp)	# zap tp
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lcopy

-	la	%r1,160+8+48($j,$sp)
-	lmg	%r6,%r15,0(%r1)
+	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+	lm${g}	%r6,%r15,0(%r1)
 	lghi	%r2,1		# signal "processed"
 	br	%r14
 .size	bn_mul_mont,.-bn_mul_mont
 .string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 ___

-print $code;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+	print $_,"\n";
+}
 close STDOUT;
--- a/crypto/modes/asm/ghash-s390x.pl
+++ b/crypto/modes/asm/ghash-s390x.pl
@ -18,6 +18,26 @@
 # and the result should be close to 12. In the lack of instruction-
 # level profiling data it's impossible to tell why...

+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2.8x better than 32-bit code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

@ -74,7 +94,7 @@ $code.=<<___ if(!$softonly);
 .Lsoft_gmult:
 ___
 $code.=<<___;
-	stmg	%r6,%r14,48($sp)
+	stm${g}	%r6,%r14,6*$SIZE_T($sp)

 	aghi	$Xi,-1
 	lghi	$len,1
@ -109,8 +129,11 @@ $code.=<<___ if(!$softonly);
 .align	32
 .Lsoft_ghash:
 ___
+$cdoe.=<<___ if ($flavour =~ /3[12]/);
+	llgfr	$len,$len
+___
 $code.=<<___;
-	stmg	%r6,%r14,48($sp)
+	stm${g}	%r6,%r14,6*$SIZE_T($sp)

 	aghi	$Xi,-1
 	srlg	$len,$len,4
@ -209,7 +232,7 @@ $code.=<<___;
 	xgr	$Zhi,$tmp
 	stg	$Zlo,8+1($Xi)
 	stg	$Zhi,0+1($Xi)
-	lmg	%r6,%r14,48($sp)
+	lm${g}	%r6,%r14,6*$SIZE_T($sp)
 	br	%r14
 .type	gcm_ghash_4bit,\@function
 .size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
--- a/crypto/rc4/asm/rc4-s390x.pl
+++ b/crypto/rc4/asm/rc4-s390x.pl
@ -13,6 +13,26 @@
 # "cluster" Address Generation Interlocks, so that one pipeline stall
 # resolves several dependencies.

+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

@ -42,7 +62,12 @@ $code.=<<___;
 .type	RC4,\@function
 .align	64
 RC4:
-	stmg	%r6,%r11,48($sp)
+	stm${g}	%r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	llgfr	$len,$len
+___
+$code.=<<___;
 	llgc	$XX[0],0($key)
 	llgc	$YY,1($key)
 	la	$XX[0],1($XX[0])
@ -93,7 +118,7 @@ $code.=<<___;
 	xgr	$acc,$TX[1]
 	stg	$acc,0($out)
 	la	$out,8($out)
-	brct	$cnt,.Loop8
+	brctg	$cnt,.Loop8

 .Lshort:
 	lghi	$acc,7
@ -125,7 +150,7 @@ $code.=<<___;
 	ahi	$XX[0],-1
 	stc	$XX[0],0($key)
 	stc	$YY,1($key)
-	lmg	%r6,%r11,48($sp)
+	lm${g}	%r6,%r11,6*$SIZE_T($sp)
 	br	$rp
 .size	RC4,.-RC4
 .string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@ -150,7 +175,7 @@ $code.=<<___;
 .type	RC4_set_key,\@function
 .align	64
 RC4_set_key:
-	stmg	%r6,%r8,48($sp)
+	stm${g}	%r6,%r8,6*$SIZE_T($sp)
 	lhi	$cnt,256
 	la	$idx,0(%r0)
 	sth	$idx,0($key)
@ -183,7 +208,7 @@ RC4_set_key:
 	la	$iinp,0(%r0)
 	j	.L2ndloop
 .Ldone:
-	lmg	%r6,%r8,48($sp)
+	lm${g}	%r6,%r8,6*$SIZE_T($sp)
 	br	$rp
 .size	RC4_set_key,.-RC4_set_key

--- a/crypto/s390xcpuid.S
+++ b/crypto/s390xcpuid.S
@ -62,6 +62,9 @@ OPENSSL_wipe_cpu:
 .type	OPENSSL_cleanse,@function
 .align	16
 OPENSSL_cleanse:
+#if !defined(__s390x__) && !defined(__s390x)
+	llgfr	%r3,%r3
+#endif
 	lghi	%r4,15
 	lghi	%r0,0
 	clgr	%r3,%r4
--- a/crypto/sha/asm/sha1-s390x.pl
+++ b/crypto/sha/asm/sha1-s390x.pl
@ -21,8 +21,27 @@
 # instructions to favour dual-issue z10 pipeline. On z10 hardware is
 # "only" ~2.3x faster than software.

+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific.
+
 $kimdfunc=1;	# magic function code for kimd instruction

+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";

@ -42,13 +61,14 @@ $t1="%r11";
@X=("%r12","%r13","%r14");
 $sp="%r15";

-$frame=160+16*4;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*4;

 sub Xupdate {
 my $i=shift;

 $code.=<<___ if ($i==15);
-	lg	$prefetch,160($sp)	### Xupdate(16) warm-up
+	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
 	lr	$X[0],$X[2]
 ___
 return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
@ -58,8 +78,8 @@ $code.=<<___ if ($i<16);
 ___
 $code.=<<___ if ($i>=16);
 	xgr	$X[0],$prefetch		### Xupdate($i)
-	lg	$prefetch,`160+4*(($i+2)%16)`($sp)
-	xg	$X[0],`160+4*(($i+8)%16)`($sp)
+	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
+	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
 	xgr	$X[0],$prefetch
 	rll	$X[0],$X[0],1
 	rllg	$X[1],$X[0],32
@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16);
 	lr	$X[2],$X[1]		# feedback
 ___
 $code.=<<___ if ($i<=70);
-	stg	$X[0],`160+4*($i%16)`($sp)
+	stg	$X[0],`$stdframe+4*($i%16)`($sp)
 ___
 unshift(@X,pop(@X));
 }
@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc);
 	tmhl	%r0,0x4000	# check for message-security assist
 	jz	.Lsoftware
 	lghi	%r0,0
-	la	%r1,16($sp)
+	la	%r1,`2*$SIZE_T`($sp)
 	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	lg	%r0,`2*$SIZE_T`($sp)
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc);
 ___
 $code.=<<___;
 	lghi	%r1,-$frame
-	stg	$ctx,16($sp)
-	stmg	%r6,%r15,48($sp)
+	st${g}	$ctx,`2*$SIZE_T`($sp)
+	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
 	lgr	%r0,$sp
 	la	$sp,0(%r1,$sp)
-	stg	%r0,0($sp)
+	st${g}	%r0,0($sp)

 	larl	$t0,Ktable
 	llgf	$A,0($ctx)
@ -199,7 +219,7 @@ ___
 for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;

-	lg	$ctx,`$frame+16`($sp)
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
 	la	$inp,64($inp)
 	al	$A,0($ctx)
 	al	$B,4($ctx)
@ -211,9 +231,9 @@ $code.=<<___;
 	st	$C,8($ctx)
 	st	$D,12($ctx)
 	st	$E,16($ctx)
-	brct	$len,.Lloop
+	brct${g} $len,.Lloop

-	lmg	%r6,%r15,`$frame+48`($sp)
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
 	br	%r14
 .size	sha1_block_data_order,.-sha1_block_data_order
 .string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
--- a/crypto/sha/asm/sha512-s390x.pl
+++ b/crypto/sha/asm/sha512-s390x.pl
@ -26,6 +26,26 @@
 # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
 # than software.

+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z900 SHA256 was measured to
+# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 $t0="%r0";
 $t1="%r1";
 $ctx="%r2";	$t2="%r2";
@ -78,7 +98,8 @@ if ($output =~ /512/) {
 }
 $Func="sha${label}_block_data_order";
 $Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;

 sub BODY_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@ -93,9 +114,9 @@ $code.=<<___;
 	xgr	$t0,$t1
 	$ROT	$t1,$t1,`$Sigma1[2]-$Sigma1[1]`
 	 xgr	$t2,$g
-	$ST	$T1,`160+$SZ*($i%16)`($sp)
+	$ST	$T1,`$stdframe+$SZ*($i%16)`($sp)
 	xgr	$t0,$t1			# Sigma1(e)
-	la	$T1,0($T1,$h)		# T1+=h
+	algr	$T1,$h			# T1+=h
 	 ngr	$t2,$e
 	 lgr	$t1,$a
 	algr	$T1,$t0			# T1+=Sigma1(e)
@ -113,7 +134,7 @@ $code.=<<___;
 	 ngr	$t2,$b
 	algr	$h,$T1			# h+=T1
 	 ogr	$t2,$t1			# Maj(a,b,c)
-	la	$d,0($d,$T1)		# d+=T1
+	algr	$d,$T1			# d+=T1
 	algr	$h,$t2			# h+=Maj(a,b,c)
 ___
 }
@ -122,19 +143,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

 $code.=<<___;
-	$LD	$T1,`160+$SZ*(($i+1)%16)`($sp)	### $i
-	$LD	$t1,`160+$SZ*(($i+14)%16)`($sp)
+	$LD	$T1,`$stdframe+$SZ*(($i+1)%16)`($sp)	### $i
+	$LD	$t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
 	$ROT	$t0,$T1,$sigma0[0]
 	$SHR	$T1,$sigma0[2]
 	$ROT	$t2,$t0,`$sigma0[1]-$sigma0[0]`
 	xgr	$T1,$t0
 	$ROT	$t0,$t1,$sigma1[0]
-	xgr	$T1,$t2				# sigma0(X[i+1])
+	xgr	$T1,$t2					# sigma0(X[i+1])
 	$SHR	$t1,$sigma1[2]
-	$ADD	$T1,`160+$SZ*($i%16)`($sp)	# +=X[i]
+	$ADD	$T1,`$stdframe+$SZ*($i%16)`($sp)	# +=X[i]
 	xgr	$t1,$t0
 	$ROT	$t0,$t0,`$sigma1[1]-$sigma1[0]`
-	$ADD	$T1,`160+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
+	$ADD	$T1,`$stdframe+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
 	xgr	$t1,$t0				# sigma1(X[i+14])
 	algr	$T1,$t1				# +=sigma1(X[i+14])
 ___
@ -212,6 +233,7 @@ $code.=<<___;
 .globl	$Func
 .type	$Func,\@function
 $Func:
+	sllg	$len,$len,`log(16*$SZ)/log(2)`
 ___
 $code.=<<___ if ($kimdfunc);
 	larl	%r1,OPENSSL_s390xcap_P
@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc);
 	tmhl	%r0,0x4000	# check for message-security assist
 	jz	.Lsoftware
 	lghi	%r0,0
-	la	%r1,16($sp)
+	la	%r1,`2*$SIZE_T`($sp)
 	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	lg	%r0,`2*$SIZE_T`($sp)
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
 	lgr	%r1,$ctx
 	lgr	%r2,$inp
-	sllg	%r3,$len,`log(16*$SZ)/log(2)`
+	lgr	%r3,$len
 	.long	0xb93e0002	# kimd %r0,%r2
 	brc	1,.-4		# pay attention to "partial completion"
 	br	%r14
@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc);
 .Lsoftware:
 ___
 $code.=<<___;
-	sllg	$len,$len,`log(16*$SZ)/log(2)`
 	lghi	%r1,-$frame
-	agr	$len,$inp
-	stmg	$ctx,%r15,16($sp)
+	la	$len,0($len,$inp)
+	stm${g}	$ctx,%r15,`2*$SIZE_T`($sp)
 	lgr	%r0,$sp
 	la	$sp,0(%r1,$sp)
-	stg	%r0,0($sp)
+	st${g}	%r0,0($sp)

 	larl	$tbl,$Table
 	$LD	$A,`0*$SZ`($ctx)
@ -265,7 +286,7 @@ $code.=<<___;
 	clgr	$len,$t0
 	jne	.Lrounds_16_xx

-	lg	$ctx,`$frame+16`($sp)
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
 	la	$inp,`16*$SZ`($inp)
 	$ADD	$A,`0*$SZ`($ctx)
 	$ADD	$B,`1*$SZ`($ctx)
@ -283,10 +304,10 @@ $code.=<<___;
 	$ST	$F,`5*$SZ`($ctx)
 	$ST	$G,`6*$SZ`($ctx)
 	$ST	$H,`7*$SZ`($ctx)
-	clg	$inp,`$frame+32`($sp)
+	cl${g}	$inp,`$frame+4*$SIZE_T`($sp)
 	jne	.Lloop

-	lmg	%r6,%r15,`$frame+48`($sp)	
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)	
 	br	%r14
 .size	$Func,.-$Func
 .string	"SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"