bc4e831ccd
Extend the s390x capability vector to store the longer facility list available from z13 onwards. The bits indicating the vector extensions are set to zero, if the kernel does not enable the vector facility. Also add capability bits returned by the crypto instructions' query functions. Signed-off-by: Patrick Steuer <patrick.steuer@de.ibm.com> Reviewed-by: Andy Polyakov <appro@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> (Merged from https://github.com/openssl/openssl/pull/4542)
2228 lines
52 KiB
Perl
2228 lines
52 KiB
Perl
#! /usr/bin/env perl
|
|
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the OpenSSL license (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
|
|
# AES for s390x.
|
|
|
|
# April 2007.
|
|
#
|
|
# Software performance improvement over gcc-generated code is ~70% and
|
|
# in absolute terms is ~73 cycles per byte processed with 128-bit key.
|
|
# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
|
|
# *strictly* in-order execution and issued instruction [in this case
|
|
# load value from memory is critical] has to complete before execution
|
|
# flow proceeds. S-boxes are compressed to 2KB[+256B].
|
|
#
|
|
# As for hardware acceleration support. It's basically a "teaser," as
|
|
# it can and should be improved in several ways. Most notably support
|
|
# for CBC is not utilized, nor multiple blocks are ever processed.
|
|
# Then software key schedule can be postponed till hardware support
|
|
# detection... Performance improvement over assembler is reportedly
|
|
# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
|
|
# support is implemented.
|
|
|
|
# May 2007.
|
|
#
|
|
# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
|
|
# for 128-bit keys, if hardware support is detected.
|
|
|
|
# January 2009.
|
|
#
|
|
# Add support for hardware AES192/256 and reschedule instructions to
|
|
# minimize/avoid Address Generation Interlock hazard and to favour
|
|
# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
|
|
# almost 50% on z9. The gain is smaller on z10, because being dual-
|
|
# issue z10 makes it impossible to eliminate the interlock condition:
|
|
# critical path is not long enough. Yet it spends ~24 cycles per byte
|
|
# processed with 128-bit key.
|
|
#
|
|
# Unlike previous version hardware support detection takes place only
|
|
# at the moment of key schedule setup, which is denoted in key->rounds.
|
|
# This is done, because deferred key setup can't be made MT-safe, not
|
|
# for keys longer than 128 bits.
|
|
#
|
|
# Add AES_cbc_encrypt, which gives incredible performance improvement,
|
|
# it was measured to be ~6.6x. It's less than previously mentioned 8x,
|
|
# because software implementation was optimized.
|
|
|
|
# May 2010.
|
|
#
|
|
# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
|
|
# performance improvement over "generic" counter mode routine relying
|
|
# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
|
|
# to the fact that exact throughput value depends on current stack
|
|
# frame alignment within 4KB page. In worst case you get ~75% of the
|
|
# maximum, but *on average* it would be as much as ~98%. Meaning that
|
|
# worst case is unlikely, it's like hitting ravine on plateau.
|
|
|
|
# November 2010.
|
|
#
|
|
# Adapt for -m31 build. If kernel supports what's called "highgprs"
|
|
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
|
|
# instructions and achieve "64-bit" performance even in 31-bit legacy
|
|
# application context. The feature is not specific to any particular
|
|
# processor, as long as it's "z-CPU". Latter implies that the code
|
|
# remains z/Architecture specific. On z990 it was measured to perform
|
|
# 2x better than code generated by gcc 4.3.
|
|
|
|
# December 2010.
|
|
#
|
|
# Add support for z196 "cipher message with counter" instruction.
|
|
# Note however that it's disengaged, because it was measured to
|
|
# perform ~12% worse than vanilla km-based code...
|
|
|
|
# February 2011.
|
|
#
|
|
# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
|
|
# instructions, which deliver ~70% improvement at 8KB block size over
|
|
# vanilla km-based code, 37% - at most like 512-bytes block size.
|
|
|
|
# Command line handling.
#
# The first argument is the build "flavour". A flavour containing "31" or
# "32" selects the 31-bit ABI: 4-byte stack slots and the plain (non-"g")
# load/store mnemonics. Anything else selects the 64-bit ABI: 8-byte slots
# and the "g"-suffixed mnemonics. $SIZE_T and $g are interpolated into the
# assembly templates below.
$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T = 4;
	$g = "";
} else {
	$SIZE_T = 8;
	$g = "g";
}

# Skip the remaining arguments until one looks like a file name with an
# extension; that one becomes the output file.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually given. A failed
# open on STDOUT leaves the handle closed, which would silently discard all
# generated code, so leave STDOUT alone when there is no file name and fail
# loudly when the file cannot be created.
if (defined($output) && length($output)) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}
|
|
|
|
# 0 keeps the hardware-assisted code paths (the templates guarded by
# "if (!$softonly)"); a true value leaves them out of the generated code.
$softonly = 0;

# Symbolic names for the general purpose registers used by the templates.
# Several names share one register: $mask is $t0, $inp is $t2, and both
# $out and $bits are $t3, so the scratch names clobber those arguments.
($t0, $t1, $t2, $t3) = ("%r0", "%r1", "%r2", "%r3");
$mask = $t0;
$inp = $t2;
$out = $bits = $t3;
$key = "%r4";
($i1, $i2, $i3) = ("%r5", "%r6", "%r7");
($s0, $s1, $s2, $s3) = ("%r8", "%r9", "%r10", "%r11");
($tbl, $rounds, $ra, $sp) = ("%r12", "%r13", "%r14", "%r15");

# Frame size constant: sixteen $SIZE_T-sized slots plus 4*8 bytes.
# NOTE(review): not referenced in the part of the file visible here.
$stdframe = 16*$SIZE_T + 4*8;
|
|
|
|
# Append table data to the global $code buffer.
#
# Every argument is emitted as one ".long" directive that carries the same
# 32-bit value twice, so each table entry occupies 8 bytes. The round
# functions load 4 bytes at byte offsets 0..3 inside such an entry to obtain
# the differently rotated table variants from a single table.
#
# Arguments: a list of 32-bit integers. Processing stops at the first
# undefined argument.
# Returns: nothing useful; the result is the text appended to $code.
#
# The sub deliberately has no prototype: it takes a list, and an empty "()"
# prototype would only be satisfiable through the "&" call form.
sub _data_word
{
	foreach my $word (@_) {
		last unless defined($word);
		$code .= sprintf(".long\t0x%08x,0x%08x\n", $word, $word);
	}
	return;
}
|
|
|
|
# Start the generated assembly: include s390x_arch.h (presumably the source
# of the S390X_KM/S390X_KMC offsets used by the key setup code further down
# -- TODO confirm), switch to the text section and open the 256-byte aligned
# AES_Te object. The table words are appended by the _data_word() call that
# follows this statement.
$code=<<___;
|
|
#include "s390x_arch.h"
|
|
|
|
.text
|
|
|
|
.type AES_Te,\@object
|
|
.align 256
|
|
AES_Te:
|
|
___
|
|
&_data_word(
|
|
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
|
|
0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
|
|
0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
|
|
0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
|
|
0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
|
|
0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
|
|
0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
|
|
0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
|
|
0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
|
|
0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
|
|
0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
|
|
0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
|
|
0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
|
|
0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
|
|
0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
|
|
0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
|
|
0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
|
|
0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
|
|
0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
|
|
0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
|
|
0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
|
|
0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
|
|
0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
|
|
0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
|
|
0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
|
|
0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
|
|
0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
|
|
0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
|
|
0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
|
|
0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
|
|
0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
|
|
0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
|
|
0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
|
|
0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
|
|
0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
|
|
0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
|
|
0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
|
|
0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
|
|
0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
|
|
0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
|
|
0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
|
|
0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
|
|
0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
|
|
0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
|
|
0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
|
|
0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
|
|
0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
|
|
0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
|
|
0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
|
|
0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
|
|
0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
|
|
0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
|
|
0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
|
|
0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
|
|
0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
|
|
0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
|
|
0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
|
|
0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
|
|
0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
|
|
0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
|
|
0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
|
|
0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
|
|
0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
|
|
0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
|
|
# Finish the AES_Te object and open the public AES_encrypt entry point.
#
# Appended after the 256 eight-byte entries written by _data_word(), i.e. at
# byte offset 2048 of AES_Te (the key schedule code addresses it as
# AES_Te+2048):
#   - Te4: 256 single-byte entries;
#   - rcon: ten round constants followed by six zero words (the key schedule
#     reads it at offset 256 from Te4);
#   - padding to the next 256-byte boundary and the .size directive.
# The body of AES_encrypt is emitted by the statements that follow.
$code.=<<___;
|
|
# Te4[256]
|
|
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
|
|
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
|
|
.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
|
|
.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
|
|
.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
|
|
.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
|
|
.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
|
|
.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
|
|
.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
|
|
.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
|
|
.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
|
|
.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
|
|
.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
|
|
.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
|
|
.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
|
|
.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
|
|
.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
|
|
.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
|
|
.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
|
|
.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
|
|
.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
|
|
.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
|
|
.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
|
|
.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
|
|
.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
|
|
.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
|
|
.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
|
|
.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
|
|
.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
|
|
.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
|
|
.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
|
|
.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
|
|
# rcon[]
|
|
.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
|
|
.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
|
|
.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
|
|
.align 256
|
|
.size AES_Te,.-AES_Te
|
|
|
|
# void AES_encrypt(const unsigned char *inp, unsigned char *out,
|
|
# const AES_KEY *key) {
|
|
.globl AES_encrypt
|
|
.type AES_encrypt,\@function
|
|
AES_encrypt:
|
|
___
|
|
# Hardware fast path of AES_encrypt; emitted only when $softonly is false.
#
# The word at offset 240 of the key structure holds either a software round
# count (10/12/14) or a km(c) function code (18/19/20) stored by the key
# setup code. Values below 16 (unsigned compare) branch to the software
# path at .Lesoft. Otherwise %r1 is pointed at the key, %r4 at the output,
# %r3 is set to the 16-byte block length (%r2 already holds the input
# pointer) and the km instruction is issued as a raw opcode word. The
# "brc 1,.-4" re-issues km while the condition selected by mask 1 is set
# (presumably partial completion -- TODO confirm), then the routine returns.
$code.=<<___ if (!$softonly);
|
|
l %r0,240($key)
|
|
lhi %r1,16
|
|
clr %r0,%r1
|
|
jl .Lesoft
|
|
|
|
la %r1,0($key)
|
|
#la %r2,0($inp)
|
|
la %r4,0($out)
|
|
lghi %r3,16 # single block length
|
|
.long 0xb92e0042 # km %r4,%r2
|
|
brc 1,.-4 # can this happen?
|
|
br %r14
|
|
.align 64
|
|
.Lesoft:
|
|
___
|
|
# Software path of AES_encrypt followed by the internal block function
# _s390x_AES_encrypt.
#
# AES_encrypt (software part): stores %r3..$ra into the stack area at
# 3*$SIZE_T($sp), loads the four input words into $s0..$s3, points $tbl at
# AES_Te and calls _s390x_AES_encrypt. $out shares %r3 with the scratch
# register $t3, so it is reloaded from the stack before the four result
# words are stored; finally %r6..$ra are restored.
#
# _s390x_AES_encrypt: expects the state in $s0..$s3, the key schedule in
# $key and the table base in $tbl, and returns the result in $s0..$s3.
#   - $ra is used as a scratch index register, so it is saved at
#     15*$SIZE_T($sp) on entry and reloaded before returning.
#   - The state is XORed with the first round key, then the loop runs
#     (rounds-1) times, with rounds read from 240($key). Each state byte is
#     shifted into an entry index pre-multiplied by 8 (mask 0xff<<3), and
#     loads at byte offsets 0..3 of an entry pick the Te0/Te3/Te2/Te1
#     variants named in the emitted comments.
#   - $key advances by 16 per iteration; the last round key is therefore
#     read at 16..28($key).
#   - The final round uses single byte loads at offset 2 of the same
#     entries (labelled Te4 in the emitted comments).
# The backtick expressions are presumably evaluated by a later substitution
# pass that is not visible in this part of the file.
$code.=<<___;
|
|
stm${g} %r3,$ra,3*$SIZE_T($sp)
|
|
|
|
llgf $s0,0($inp)
|
|
llgf $s1,4($inp)
|
|
llgf $s2,8($inp)
|
|
llgf $s3,12($inp)
|
|
|
|
larl $tbl,AES_Te
|
|
bras $ra,_s390x_AES_encrypt
|
|
|
|
l${g} $out,3*$SIZE_T($sp)
|
|
st $s0,0($out)
|
|
st $s1,4($out)
|
|
st $s2,8($out)
|
|
st $s3,12($out)
|
|
|
|
lm${g} %r6,$ra,6*$SIZE_T($sp)
|
|
br $ra
|
|
.size AES_encrypt,.-AES_encrypt
|
|
|
|
.type _s390x_AES_encrypt,\@function
|
|
.align 16
|
|
_s390x_AES_encrypt:
|
|
st${g} $ra,15*$SIZE_T($sp)
|
|
x $s0,0($key)
|
|
x $s1,4($key)
|
|
x $s2,8($key)
|
|
x $s3,12($key)
|
|
l $rounds,240($key)
|
|
llill $mask,`0xff<<3`
|
|
aghi $rounds,-1
|
|
j .Lenc_loop
|
|
.align 16
|
|
.Lenc_loop:
|
|
sllg $t1,$s0,`0+3`
|
|
srlg $t2,$s0,`8-3`
|
|
srlg $t3,$s0,`16-3`
|
|
srl $s0,`24-3`
|
|
nr $s0,$mask
|
|
ngr $t1,$mask
|
|
nr $t2,$mask
|
|
nr $t3,$mask
|
|
|
|
srlg $i1,$s1,`16-3` # i0
|
|
sllg $i2,$s1,`0+3`
|
|
srlg $i3,$s1,`8-3`
|
|
srl $s1,`24-3`
|
|
nr $i1,$mask
|
|
nr $s1,$mask
|
|
ngr $i2,$mask
|
|
nr $i3,$mask
|
|
|
|
l $s0,0($s0,$tbl) # Te0[s0>>24]
|
|
l $t1,1($t1,$tbl) # Te3[s0>>0]
|
|
l $t2,2($t2,$tbl) # Te2[s0>>8]
|
|
l $t3,3($t3,$tbl) # Te1[s0>>16]
|
|
|
|
x $s0,3($i1,$tbl) # Te1[s1>>16]
|
|
l $s1,0($s1,$tbl) # Te0[s1>>24]
|
|
x $t2,1($i2,$tbl) # Te3[s1>>0]
|
|
x $t3,2($i3,$tbl) # Te2[s1>>8]
|
|
|
|
srlg $i1,$s2,`8-3` # i0
|
|
srlg $i2,$s2,`16-3` # i1
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
sllg $i3,$s2,`0+3`
|
|
srl $s2,`24-3`
|
|
nr $s2,$mask
|
|
ngr $i3,$mask
|
|
|
|
xr $s1,$t1
|
|
srlg $ra,$s3,`8-3` # i1
|
|
sllg $t1,$s3,`0+3` # i0
|
|
nr $ra,$mask
|
|
la $key,16($key)
|
|
ngr $t1,$mask
|
|
|
|
x $s0,2($i1,$tbl) # Te2[s2>>8]
|
|
x $s1,3($i2,$tbl) # Te1[s2>>16]
|
|
l $s2,0($s2,$tbl) # Te0[s2>>24]
|
|
x $t3,1($i3,$tbl) # Te3[s2>>0]
|
|
|
|
srlg $i3,$s3,`16-3` # i2
|
|
xr $s2,$t2
|
|
srl $s3,`24-3`
|
|
nr $i3,$mask
|
|
nr $s3,$mask
|
|
|
|
x $s0,0($key)
|
|
x $s1,4($key)
|
|
x $s2,8($key)
|
|
x $t3,12($key)
|
|
|
|
x $s0,1($t1,$tbl) # Te3[s3>>0]
|
|
x $s1,2($ra,$tbl) # Te2[s3>>8]
|
|
x $s2,3($i3,$tbl) # Te1[s3>>16]
|
|
l $s3,0($s3,$tbl) # Te0[s3>>24]
|
|
xr $s3,$t3
|
|
|
|
brct $rounds,.Lenc_loop
|
|
.align 16
|
|
|
|
sllg $t1,$s0,`0+3`
|
|
srlg $t2,$s0,`8-3`
|
|
ngr $t1,$mask
|
|
srlg $t3,$s0,`16-3`
|
|
srl $s0,`24-3`
|
|
nr $s0,$mask
|
|
nr $t2,$mask
|
|
nr $t3,$mask
|
|
|
|
srlg $i1,$s1,`16-3` # i0
|
|
sllg $i2,$s1,`0+3`
|
|
ngr $i2,$mask
|
|
srlg $i3,$s1,`8-3`
|
|
srl $s1,`24-3`
|
|
nr $i1,$mask
|
|
nr $s1,$mask
|
|
nr $i3,$mask
|
|
|
|
llgc $s0,2($s0,$tbl) # Te4[s0>>24]
|
|
llgc $t1,2($t1,$tbl) # Te4[s0>>0]
|
|
sll $s0,24
|
|
llgc $t2,2($t2,$tbl) # Te4[s0>>8]
|
|
llgc $t3,2($t3,$tbl) # Te4[s0>>16]
|
|
sll $t2,8
|
|
sll $t3,16
|
|
|
|
llgc $i1,2($i1,$tbl) # Te4[s1>>16]
|
|
llgc $s1,2($s1,$tbl) # Te4[s1>>24]
|
|
llgc $i2,2($i2,$tbl) # Te4[s1>>0]
|
|
llgc $i3,2($i3,$tbl) # Te4[s1>>8]
|
|
sll $i1,16
|
|
sll $s1,24
|
|
sll $i3,8
|
|
or $s0,$i1
|
|
or $s1,$t1
|
|
or $t2,$i2
|
|
or $t3,$i3
|
|
|
|
srlg $i1,$s2,`8-3` # i0
|
|
srlg $i2,$s2,`16-3` # i1
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
sllg $i3,$s2,`0+3`
|
|
srl $s2,`24-3`
|
|
ngr $i3,$mask
|
|
nr $s2,$mask
|
|
|
|
sllg $t1,$s3,`0+3` # i0
|
|
srlg $ra,$s3,`8-3` # i1
|
|
ngr $t1,$mask
|
|
|
|
llgc $i1,2($i1,$tbl) # Te4[s2>>8]
|
|
llgc $i2,2($i2,$tbl) # Te4[s2>>16]
|
|
sll $i1,8
|
|
llgc $s2,2($s2,$tbl) # Te4[s2>>24]
|
|
llgc $i3,2($i3,$tbl) # Te4[s2>>0]
|
|
sll $i2,16
|
|
nr $ra,$mask
|
|
sll $s2,24
|
|
or $s0,$i1
|
|
or $s1,$i2
|
|
or $s2,$t2
|
|
or $t3,$i3
|
|
|
|
srlg $i3,$s3,`16-3` # i2
|
|
srl $s3,`24-3`
|
|
nr $i3,$mask
|
|
nr $s3,$mask
|
|
|
|
l $t0,16($key)
|
|
l $t2,20($key)
|
|
|
|
llgc $i1,2($t1,$tbl) # Te4[s3>>0]
|
|
llgc $i2,2($ra,$tbl) # Te4[s3>>8]
|
|
llgc $i3,2($i3,$tbl) # Te4[s3>>16]
|
|
llgc $s3,2($s3,$tbl) # Te4[s3>>24]
|
|
sll $i2,8
|
|
sll $i3,16
|
|
sll $s3,24
|
|
or $s0,$i1
|
|
or $s1,$i2
|
|
or $s2,$i3
|
|
or $s3,$t3
|
|
|
|
l${g} $ra,15*$SIZE_T($sp)
|
|
xr $s0,$t0
|
|
xr $s1,$t2
|
|
x $s2,24($key)
|
|
x $s3,28($key)
|
|
|
|
br $ra
|
|
.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
|
|
___
|
|
|
|
# Open the 256-byte aligned AES_Td object used by the decryption routines.
# The table words are appended by the _data_word() call that follows this
# statement.
$code.=<<___;
|
|
.type AES_Td,\@object
|
|
.align 256
|
|
AES_Td:
|
|
___
|
|
&_data_word(
|
|
0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
|
|
0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
|
|
0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
|
|
0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
|
|
0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
|
|
0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
|
|
0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
|
|
0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
|
|
0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
|
|
0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
|
|
0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
|
|
0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
|
|
0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
|
|
0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
|
|
0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
|
|
0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
|
|
0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
|
|
0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
|
|
0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
|
|
0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
|
|
0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
|
|
0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
|
|
0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
|
|
0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
|
|
0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
|
|
0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
|
|
0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
|
|
0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
|
|
0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
|
|
0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
|
|
0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
|
|
0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
|
|
0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
|
|
0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
|
|
0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
|
|
0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
|
|
0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
|
|
0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
|
|
0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
|
|
0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
|
|
0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
|
|
0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
|
|
0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
|
|
0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
|
|
0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
|
|
0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
|
|
0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
|
|
0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
|
|
0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
|
|
0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
|
|
0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
|
|
0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
|
|
0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
|
|
0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
|
|
0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
|
|
0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
|
|
0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
|
|
0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
|
|
0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
|
|
0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
|
|
0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
|
|
0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
|
|
0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
|
|
0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
|
|
# Finish the AES_Td object and open the public AES_decrypt entry point.
#
# The 256 single-byte Td4 entries are appended after the 256 eight-byte
# entries written by _data_word(), i.e. at byte offset 2048 of AES_Td; the
# final decryption round addresses them as 2048(index,$tbl). The body of
# AES_decrypt is emitted by the statements that follow.
$code.=<<___;
|
|
# Td4[256]
|
|
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
|
|
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
|
|
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
|
|
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
|
|
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
|
|
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
|
|
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
|
|
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
|
|
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
|
|
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
|
|
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
|
|
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
|
|
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
|
|
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
|
|
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
|
|
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
|
|
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
|
|
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
|
|
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
|
|
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
|
|
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
|
|
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
|
|
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
|
|
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
|
|
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
|
|
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
|
|
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
|
|
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
|
|
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
|
|
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
|
|
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
|
|
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
|
|
.size AES_Td,.-AES_Td
|
|
|
|
# void AES_decrypt(const unsigned char *inp, unsigned char *out,
|
|
# const AES_KEY *key) {
|
|
.globl AES_decrypt
|
|
.type AES_decrypt,\@function
|
|
AES_decrypt:
|
|
___
|
|
# Hardware fast path of AES_decrypt; emitted only when $softonly is false.
#
# Same dispatch as in AES_encrypt: the word at 240($key) is compared
# (unsigned) against 16 and smaller values branch to the software path at
# .Ldsoft. Otherwise %r1 = key, %r4 = output, %r3 = 16 (one block), %r2
# still holds the input pointer, and km is issued as a raw opcode word with
# the function code taken from %r0. For decryption keys the key setup code
# has set bit 0x80 in that stored code. "brc 1,.-4" re-issues km while the
# condition selected by mask 1 is set (presumably partial completion --
# TODO confirm).
$code.=<<___ if (!$softonly);
|
|
l %r0,240($key)
|
|
lhi %r1,16
|
|
clr %r0,%r1
|
|
jl .Ldsoft
|
|
|
|
la %r1,0($key)
|
|
#la %r2,0($inp)
|
|
la %r4,0($out)
|
|
lghi %r3,16 # single block length
|
|
.long 0xb92e0042 # km %r4,%r2
|
|
brc 1,.-4 # can this happen?
|
|
br %r14
|
|
.align 64
|
|
.Ldsoft:
|
|
___
|
|
# Software path of AES_decrypt followed by the internal block function
# _s390x_AES_decrypt.
#
# AES_decrypt (software part): stores %r3..$ra into the stack area at
# 3*$SIZE_T($sp), loads the four input words into $s0..$s3, points $tbl at
# AES_Td and calls _s390x_AES_decrypt. $out shares %r3 with the scratch
# register $t3, so it is reloaded from the stack before the four result
# words are stored; finally %r6..$ra are restored.
#
# _s390x_AES_decrypt: expects the state in $s0..$s3, the key schedule in
# $key and the table base in $tbl, and returns the result in $s0..$s3.
#   - $ra is used as a scratch index register, so it is saved at
#     15*$SIZE_T($sp) on entry and reloaded before returning.
#   - The state is XORed with the first round key, then the loop runs
#     (rounds-1) times, with rounds read from 240($key). Entry indices are
#     pre-multiplied by 8 (mask 0xff<<3), and loads at byte offsets 0..3 of
#     an entry pick the Td0/Td3/Td2/Td1 variants named in the emitted
#     comments.
#   - $key advances by 16 per iteration; the last round key is read at
#     16..28($key).
#   - Before the final round four words of Td4 are loaded at 64-byte
#     intervals (the emitted comment calls this a prefetch; the loaded
#     values are overwritten right after). $mask is then reset to 0xff and
#     the final round uses byte loads from Td4 at offset 2048 from $tbl.
# The backtick expressions are presumably evaluated by a later substitution
# pass that is not visible in this part of the file.
$code.=<<___;
|
|
stm${g} %r3,$ra,3*$SIZE_T($sp)
|
|
|
|
llgf $s0,0($inp)
|
|
llgf $s1,4($inp)
|
|
llgf $s2,8($inp)
|
|
llgf $s3,12($inp)
|
|
|
|
larl $tbl,AES_Td
|
|
bras $ra,_s390x_AES_decrypt
|
|
|
|
l${g} $out,3*$SIZE_T($sp)
|
|
st $s0,0($out)
|
|
st $s1,4($out)
|
|
st $s2,8($out)
|
|
st $s3,12($out)
|
|
|
|
lm${g} %r6,$ra,6*$SIZE_T($sp)
|
|
br $ra
|
|
.size AES_decrypt,.-AES_decrypt
|
|
|
|
.type _s390x_AES_decrypt,\@function
|
|
.align 16
|
|
_s390x_AES_decrypt:
|
|
st${g} $ra,15*$SIZE_T($sp)
|
|
x $s0,0($key)
|
|
x $s1,4($key)
|
|
x $s2,8($key)
|
|
x $s3,12($key)
|
|
l $rounds,240($key)
|
|
llill $mask,`0xff<<3`
|
|
aghi $rounds,-1
|
|
j .Ldec_loop
|
|
.align 16
|
|
.Ldec_loop:
|
|
srlg $t1,$s0,`16-3`
|
|
srlg $t2,$s0,`8-3`
|
|
sllg $t3,$s0,`0+3`
|
|
srl $s0,`24-3`
|
|
nr $s0,$mask
|
|
nr $t1,$mask
|
|
nr $t2,$mask
|
|
ngr $t3,$mask
|
|
|
|
sllg $i1,$s1,`0+3` # i0
|
|
srlg $i2,$s1,`16-3`
|
|
srlg $i3,$s1,`8-3`
|
|
srl $s1,`24-3`
|
|
ngr $i1,$mask
|
|
nr $s1,$mask
|
|
nr $i2,$mask
|
|
nr $i3,$mask
|
|
|
|
l $s0,0($s0,$tbl) # Td0[s0>>24]
|
|
l $t1,3($t1,$tbl) # Td1[s0>>16]
|
|
l $t2,2($t2,$tbl) # Td2[s0>>8]
|
|
l $t3,1($t3,$tbl) # Td3[s0>>0]
|
|
|
|
x $s0,1($i1,$tbl) # Td3[s1>>0]
|
|
l $s1,0($s1,$tbl) # Td0[s1>>24]
|
|
x $t2,3($i2,$tbl) # Td1[s1>>16]
|
|
x $t3,2($i3,$tbl) # Td2[s1>>8]
|
|
|
|
srlg $i1,$s2,`8-3` # i0
|
|
sllg $i2,$s2,`0+3` # i1
|
|
srlg $i3,$s2,`16-3`
|
|
srl $s2,`24-3`
|
|
nr $i1,$mask
|
|
ngr $i2,$mask
|
|
nr $s2,$mask
|
|
nr $i3,$mask
|
|
|
|
xr $s1,$t1
|
|
srlg $ra,$s3,`8-3` # i1
|
|
srlg $t1,$s3,`16-3` # i0
|
|
nr $ra,$mask
|
|
la $key,16($key)
|
|
nr $t1,$mask
|
|
|
|
x $s0,2($i1,$tbl) # Td2[s2>>8]
|
|
x $s1,1($i2,$tbl) # Td3[s2>>0]
|
|
l $s2,0($s2,$tbl) # Td0[s2>>24]
|
|
x $t3,3($i3,$tbl) # Td1[s2>>16]
|
|
|
|
sllg $i3,$s3,`0+3` # i2
|
|
srl $s3,`24-3`
|
|
ngr $i3,$mask
|
|
nr $s3,$mask
|
|
|
|
xr $s2,$t2
|
|
x $s0,0($key)
|
|
x $s1,4($key)
|
|
x $s2,8($key)
|
|
x $t3,12($key)
|
|
|
|
x $s0,3($t1,$tbl) # Td1[s3>>16]
|
|
x $s1,2($ra,$tbl) # Td2[s3>>8]
|
|
x $s2,1($i3,$tbl) # Td3[s3>>0]
|
|
l $s3,0($s3,$tbl) # Td0[s3>>24]
|
|
xr $s3,$t3
|
|
|
|
brct $rounds,.Ldec_loop
|
|
.align 16
|
|
|
|
l $t1,`2048+0`($tbl) # prefetch Td4
|
|
l $t2,`2048+64`($tbl)
|
|
l $t3,`2048+128`($tbl)
|
|
l $i1,`2048+192`($tbl)
|
|
llill $mask,0xff
|
|
|
|
srlg $i3,$s0,24 # i0
|
|
srlg $t1,$s0,16
|
|
srlg $t2,$s0,8
|
|
nr $s0,$mask # i3
|
|
nr $t1,$mask
|
|
|
|
srlg $i1,$s1,24
|
|
nr $t2,$mask
|
|
srlg $i2,$s1,16
|
|
srlg $ra,$s1,8
|
|
nr $s1,$mask # i0
|
|
nr $i2,$mask
|
|
nr $ra,$mask
|
|
|
|
llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
|
|
llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
|
|
llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
|
|
sll $t1,16
|
|
llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
|
|
sllg $s0,$i3,24
|
|
sll $t2,8
|
|
|
|
llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
|
|
llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
|
|
llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
|
|
sll $i1,24
|
|
llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
|
|
sll $i2,16
|
|
sll $i3,8
|
|
or $s0,$s1
|
|
or $t1,$i1
|
|
or $t2,$i2
|
|
or $t3,$i3
|
|
|
|
srlg $i1,$s2,8 # i0
|
|
srlg $i2,$s2,24
|
|
srlg $i3,$s2,16
|
|
nr $s2,$mask # i1
|
|
nr $i1,$mask
|
|
nr $i3,$mask
|
|
llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
|
|
llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
|
|
llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
|
|
llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
|
|
sll $i1,8
|
|
sll $i2,24
|
|
or $s0,$i1
|
|
sll $i3,16
|
|
or $t2,$i2
|
|
or $t3,$i3
|
|
|
|
srlg $i1,$s3,16 # i0
|
|
srlg $i2,$s3,8 # i1
|
|
srlg $i3,$s3,24
|
|
nr $s3,$mask # i2
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
|
|
l${g} $ra,15*$SIZE_T($sp)
|
|
or $s1,$t1
|
|
l $t0,16($key)
|
|
l $t1,20($key)
|
|
|
|
llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
|
|
llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
|
|
sll $i1,16
|
|
llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
|
|
llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
|
|
sll $i2,8
|
|
sll $s3,24
|
|
or $s0,$i1
|
|
or $s1,$i2
|
|
or $s2,$t2
|
|
or $s3,$t3
|
|
|
|
xr $s0,$t0
|
|
xr $s1,$t1
|
|
x $s2,24($key)
|
|
x $s3,28($key)
|
|
|
|
br $ra
|
|
.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
|
|
___
|
|
|
|
# Entry of AES_set_encrypt_key, with _s390x_AES_set_encrypt_key as a second
# label on the same address for the internal call from AES_set_decrypt_key.
#
# Argument validation as emitted here:
#   - a zero $inp or $key pointer branches to .Lminus1 (defined further
#     down), which returns -1 in %r2;
#   - $bits must equal 128, 192 or 256, otherwise -2 is returned in %r2;
#   - valid requests continue at .Lproceed.
# NOTE(review): the emitted prototype comment says "void", but the code
# returns a status value in %r2.
$code.=<<___;
|
|
# void AES_set_encrypt_key(const unsigned char *in, int bits,
|
|
# AES_KEY *key) {
|
|
.globl AES_set_encrypt_key
|
|
.type AES_set_encrypt_key,\@function
|
|
.align 16
|
|
AES_set_encrypt_key:
|
|
_s390x_AES_set_encrypt_key:
|
|
lghi $t0,0
|
|
cl${g}r $inp,$t0
|
|
je .Lminus1
|
|
cl${g}r $key,$t0
|
|
je .Lminus1
|
|
|
|
lghi $t0,128
|
|
clr $bits,$t0
|
|
je .Lproceed
|
|
lghi $t0,192
|
|
clr $bits,$t0
|
|
je .Lproceed
|
|
lghi $t0,256
|
|
clr $bits,$t0
|
|
je .Lproceed
|
|
lghi %r2,-2
|
|
br %r14
|
|
|
|
.align 16
|
|
.Lproceed:
|
|
___
|
|
# Hardware key setup; emitted only when $softonly is false.
#
# %r5 = ((bits-128)>>6)+18, i.e. function code 18, 19 or 20 for 128-, 192-
# and 256-bit keys. A single-bit mask (0x8000000000000000 shifted right by
# that code) is ANDed with the doublewords at offsets S390X_KM and S390X_KMC
# of OPENSSL_s390xcap_P. If either capability bit is missing, control goes
# to the software key schedule at .Lekey_internal.
#
# Otherwise the raw key is copied unchanged into the key structure: 16
# bytes always, 8 more for 192 bits and another 8 for 256 bits. The loads
# and stores in between do not appear to disturb the condition code set by
# "cr", which the later "je 1f" still tests -- TODO confirm. $bits is
# stored at offset 236 and the function code at offset 240. The code is
# also left in $t0, which AES_set_decrypt_key reads after calling this
# routine. Returns 0 in %r2.
$code.=<<___ if (!$softonly);
|
|
# convert bits to km(c) code, [128,192,256]->[18,19,20]
|
|
lhi %r5,-128
|
|
lhi %r0,18
|
|
ar %r5,$bits
|
|
srl %r5,6
|
|
ar %r5,%r0
|
|
|
|
larl %r1,OPENSSL_s390xcap_P
|
|
llihh %r0,0x8000
|
|
srlg %r0,%r0,0(%r5)
|
|
ng %r0,S390X_KM(%r1) # check availability of both km...
|
|
ng %r0,S390X_KMC(%r1) # ...and kmc support for given key length
|
|
jz .Lekey_internal
|
|
|
|
lmg %r0,%r1,0($inp) # just copy 128 bits...
|
|
stmg %r0,%r1,0($key)
|
|
lhi %r0,192
|
|
cr $bits,%r0
|
|
jl 1f
|
|
lg %r1,16($inp)
|
|
stg %r1,16($key)
|
|
je 1f
|
|
lg %r1,24($inp)
|
|
stg %r1,24($key)
|
|
1: st $bits,236($key) # save bits [for debugging purposes]
|
|
lgr $t0,%r5
|
|
st %r5,240($key) # save km(c) code
|
|
lghi %r2,0
|
|
br %r14
|
|
___
|
|
# Software key schedule (.Lekey_internal), the shared .Lminus1 exit and the
# prologue of AES_set_decrypt_key.
#
# .Lekey_internal saves %r4..%r13 (which includes $key) at 4*$SIZE_T($sp),
# points $tbl at AES_Te+2048 (the Te4 byte table; rcon is read at offset 256
# from it) and copies the first four key words into the schedule.
#   - 128-bit keys: stores 10 at 240($key) and runs .L128_loop ten times,
#     producing four words per iteration.
#   - 192-bit keys (.Lnot128): copies two more key words, stores 12 at
#     240($key) and runs .L192_loop with a count of 8, advancing $key by
#     24 bytes per iteration.
#   - 256-bit keys (.Lnot192): copies two further key words, stores 14 at
#     240($key) and runs .L256_loop with a count of 7, advancing $key by
#     32 bytes per iteration.
# Every successful exit leaves the round count in $t0 (read by
# AES_set_decrypt_key), returns 0 in %r2 and restores %r4..%r13, which also
# brings $key back to the start of the schedule.
#
# .Lminus1 returns -1 in %r2 for the zero-pointer checks at function entry.
#
# AES_set_decrypt_key saves $ra at 14*$SIZE_T($sp), calls
# _s390x_AES_set_encrypt_key and returns immediately when %r2 is non-zero.
# NOTE(review): both emitted prototype comments say "void", but the code
# returns a status value in %r2.
$code.=<<___;
|
|
.align 16
|
|
.Lekey_internal:
|
|
stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
|
|
|
|
larl $tbl,AES_Te+2048
|
|
|
|
llgf $s0,0($inp)
|
|
llgf $s1,4($inp)
|
|
llgf $s2,8($inp)
|
|
llgf $s3,12($inp)
|
|
st $s0,0($key)
|
|
st $s1,4($key)
|
|
st $s2,8($key)
|
|
st $s3,12($key)
|
|
lghi $t0,128
|
|
cr $bits,$t0
|
|
jne .Lnot128
|
|
|
|
llill $mask,0xff
|
|
lghi $t3,0 # i=0
|
|
lghi $rounds,10
|
|
st $rounds,240($key)
|
|
|
|
llgfr $t2,$s3 # temp=rk[3]
|
|
srlg $i1,$s3,8
|
|
srlg $i2,$s3,16
|
|
srlg $i3,$s3,24
|
|
nr $t2,$mask
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
|
|
.align 16
|
|
.L128_loop:
|
|
la $t2,0($t2,$tbl)
|
|
la $i1,0($i1,$tbl)
|
|
la $i2,0($i2,$tbl)
|
|
la $i3,0($i3,$tbl)
|
|
icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
|
|
icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
|
|
icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
|
|
icm $t2,1,0($i3) # Te4[rk[3]>>24]
|
|
x $t2,256($t3,$tbl) # rcon[i]
|
|
xr $s0,$t2 # rk[4]=rk[0]^...
|
|
xr $s1,$s0 # rk[5]=rk[1]^rk[4]
|
|
xr $s2,$s1 # rk[6]=rk[2]^rk[5]
|
|
xr $s3,$s2 # rk[7]=rk[3]^rk[6]
|
|
|
|
llgfr $t2,$s3 # temp=rk[3]
|
|
srlg $i1,$s3,8
|
|
srlg $i2,$s3,16
|
|
nr $t2,$mask
|
|
nr $i1,$mask
|
|
srlg $i3,$s3,24
|
|
nr $i2,$mask
|
|
|
|
st $s0,16($key)
|
|
st $s1,20($key)
|
|
st $s2,24($key)
|
|
st $s3,28($key)
|
|
la $key,16($key) # key+=4
|
|
la $t3,4($t3) # i++
|
|
brct $rounds,.L128_loop
|
|
lghi $t0,10
|
|
lghi %r2,0
|
|
lm${g} %r4,%r13,4*$SIZE_T($sp)
|
|
br $ra
|
|
|
|
.align 16
|
|
.Lnot128:
|
|
llgf $t0,16($inp)
|
|
llgf $t1,20($inp)
|
|
st $t0,16($key)
|
|
st $t1,20($key)
|
|
lghi $t0,192
|
|
cr $bits,$t0
|
|
jne .Lnot192
|
|
|
|
llill $mask,0xff
|
|
lghi $t3,0 # i=0
|
|
lghi $rounds,12
|
|
st $rounds,240($key)
|
|
lghi $rounds,8
|
|
|
|
srlg $i1,$t1,8
|
|
srlg $i2,$t1,16
|
|
srlg $i3,$t1,24
|
|
nr $t1,$mask
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
|
|
.align 16
|
|
.L192_loop:
|
|
la $t1,0($t1,$tbl)
|
|
la $i1,0($i1,$tbl)
|
|
la $i2,0($i2,$tbl)
|
|
la $i3,0($i3,$tbl)
|
|
icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
|
|
icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
|
|
icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
|
|
icm $t1,1,0($i3) # Te4[rk[5]>>24]
|
|
x $t1,256($t3,$tbl) # rcon[i]
|
|
xr $s0,$t1 # rk[6]=rk[0]^...
|
|
xr $s1,$s0 # rk[7]=rk[1]^rk[6]
|
|
xr $s2,$s1 # rk[8]=rk[2]^rk[7]
|
|
xr $s3,$s2 # rk[9]=rk[3]^rk[8]
|
|
|
|
st $s0,24($key)
|
|
st $s1,28($key)
|
|
st $s2,32($key)
|
|
st $s3,36($key)
|
|
brct $rounds,.L192_continue
|
|
lghi $t0,12
|
|
lghi %r2,0
|
|
lm${g} %r4,%r13,4*$SIZE_T($sp)
|
|
br $ra
|
|
|
|
.align 16
|
|
.L192_continue:
|
|
lgr $t1,$s3
|
|
x $t1,16($key) # rk[10]=rk[4]^rk[9]
|
|
st $t1,40($key)
|
|
x $t1,20($key) # rk[11]=rk[5]^rk[10]
|
|
st $t1,44($key)
|
|
|
|
srlg $i1,$t1,8
|
|
srlg $i2,$t1,16
|
|
srlg $i3,$t1,24
|
|
nr $t1,$mask
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
|
|
la $key,24($key) # key+=6
|
|
la $t3,4($t3) # i++
|
|
j .L192_loop
|
|
|
|
.align 16
|
|
.Lnot192:
|
|
llgf $t0,24($inp)
|
|
llgf $t1,28($inp)
|
|
st $t0,24($key)
|
|
st $t1,28($key)
|
|
llill $mask,0xff
|
|
lghi $t3,0 # i=0
|
|
lghi $rounds,14
|
|
st $rounds,240($key)
|
|
lghi $rounds,7
|
|
|
|
srlg $i1,$t1,8
|
|
srlg $i2,$t1,16
|
|
srlg $i3,$t1,24
|
|
nr $t1,$mask
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
|
|
.align 16
|
|
.L256_loop:
|
|
la $t1,0($t1,$tbl)
|
|
la $i1,0($i1,$tbl)
|
|
la $i2,0($i2,$tbl)
|
|
la $i3,0($i3,$tbl)
|
|
icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
|
|
icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
|
|
icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
|
|
icm $t1,1,0($i3) # Te4[rk[7]>>24]
|
|
x $t1,256($t3,$tbl) # rcon[i]
|
|
xr $s0,$t1 # rk[8]=rk[0]^...
|
|
xr $s1,$s0 # rk[9]=rk[1]^rk[8]
|
|
xr $s2,$s1 # rk[10]=rk[2]^rk[9]
|
|
xr $s3,$s2 # rk[11]=rk[3]^rk[10]
|
|
st $s0,32($key)
|
|
st $s1,36($key)
|
|
st $s2,40($key)
|
|
st $s3,44($key)
|
|
brct $rounds,.L256_continue
|
|
lghi $t0,14
|
|
lghi %r2,0
|
|
lm${g} %r4,%r13,4*$SIZE_T($sp)
|
|
br $ra
|
|
|
|
.align 16
|
|
.L256_continue:
|
|
lgr $t1,$s3 # temp=rk[11]
|
|
srlg $i1,$s3,8
|
|
srlg $i2,$s3,16
|
|
srlg $i3,$s3,24
|
|
nr $t1,$mask
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
la $t1,0($t1,$tbl)
|
|
la $i1,0($i1,$tbl)
|
|
la $i2,0($i2,$tbl)
|
|
la $i3,0($i3,$tbl)
|
|
llgc $t1,0($t1) # Te4[rk[11]>>0]
|
|
icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
|
|
icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
|
|
icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
|
|
x $t1,16($key) # rk[12]=rk[4]^...
|
|
st $t1,48($key)
|
|
x $t1,20($key) # rk[13]=rk[5]^rk[12]
|
|
st $t1,52($key)
|
|
x $t1,24($key) # rk[14]=rk[6]^rk[13]
|
|
st $t1,56($key)
|
|
x $t1,28($key) # rk[15]=rk[7]^rk[14]
|
|
st $t1,60($key)
|
|
|
|
srlg $i1,$t1,8
|
|
srlg $i2,$t1,16
|
|
srlg $i3,$t1,24
|
|
nr $t1,$mask
|
|
nr $i1,$mask
|
|
nr $i2,$mask
|
|
|
|
la $key,32($key) # key+=8
|
|
la $t3,4($t3) # i++
|
|
j .L256_loop
|
|
|
|
.Lminus1:
|
|
lghi %r2,-1
|
|
br $ra
|
|
.size AES_set_encrypt_key,.-AES_set_encrypt_key
|
|
|
|
# void AES_set_decrypt_key(const unsigned char *in, int bits,
|
|
# AES_KEY *key) {
|
|
.globl AES_set_decrypt_key
|
|
.type AES_set_decrypt_key,\@function
|
|
.align 16
|
|
AES_set_decrypt_key:
|
|
#st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
|
|
st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
|
|
bras $ra,_s390x_AES_set_encrypt_key
|
|
#l${g} $key,4*$SIZE_T($sp)
|
|
l${g} $ra,14*$SIZE_T($sp)
|
|
ltgr %r2,%r2
|
|
bnzr $ra
|
|
___
|
|
# Hardware shortcut of AES_set_decrypt_key, emitted only when $softonly is
# false.  $t0 is compared with 16: a smaller value branches to .Lgo (the
# software path emitted next); otherwise 0x80 -- the "decrypt" bit -- is
# OR-ed into $t0, the result is stored at offset 240 of the key structure,
# and the function returns.
# NOTE(review): the load of $t0 from 240($key) is commented out, so $t0 is
# presumably left behind by _s390x_AES_set_encrypt_key -- confirm.
$code.=<<___ if (!$softonly);
|
|
#l $t0,240($key)
|
|
lhi $t1,16
|
|
cr $t0,$t1
|
|
jl .Lgo
|
|
oill $t0,0x80 # set "decrypt" bit
|
|
st $t0,240($key)
|
|
br $ra
|
|
___
|
|
# Software path, step 1: reverse the order of the 16-byte round keys in
# place.  $rounds is copied from $t0; $i1 starts at the first round key and
# $i2 at key + rounds*16.  Each .Linv iteration swaps the two 16-byte
# entries, then $i1 advances by 16 and $i2 retreats by 16 ($t1 holds -16).
# The loop counter is rounds/2 (srl by 1).
$code.=<<___;
|
|
.align 16
|
|
.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
|
|
la $i1,0($key)
|
|
sllg $i2,$rounds,4
|
|
la $i2,0($i2,$key)
|
|
srl $rounds,1
|
|
lghi $t1,-16
|
|
|
|
.align 16
|
|
.Linv: lmg $s0,$s1,0($i1)
|
|
lmg $s2,$s3,0($i2)
|
|
stmg $s0,$s1,0($i2)
|
|
stmg $s2,$s3,0($i1)
|
|
la $i1,16($i1)
|
|
la $i2,0($t1,$i2)
|
|
brct $rounds,.Linv
|
|
___
|
|
# The index registers are no longer needed as table indices, so they are
# reused under new names as holders of the three byte-replicated masks.
$mask80=$i1;
|
|
$mask1b=$i2;
|
|
$maskfe=$i3;
|
|
# Software path, step 2: transform the round-key words in place.
#  * The loop counter is (rounds-1)*4 and the first word processed is at
#    offset 16, so the first and the last round key are left untouched.
#  * llilh/oill build the 32-bit constants 0x80808080, 0x1b1b1b1b and
#    0xfefefefe.
#  * Each of tp2, tp4, tp8 is obtained from the previous value by a
#    per-byte doubling: the 0x80 bits are isolated, turned into a 0x1b
#    pattern for every byte that had its top bit set, and XOR-ed with the
#    value shifted left by one and masked with 0xfe.
#  * The tp1/tp2/tp4/tp8 combinations are then rotated and XOR-ed together
#    as spelled out in the inline comments (this is the inverse MixColumns
#    step of the AES decryption key schedule) and stored back.
# On exit %r6-%r13 are reloaded from the frame and 0 is returned in %r2.
$code.=<<___;
|
|
llgf $rounds,240($key)
|
|
aghi $rounds,-1
|
|
sll $rounds,2 # (rounds-1)*4
|
|
llilh $mask80,0x8080
|
|
llilh $mask1b,0x1b1b
|
|
llilh $maskfe,0xfefe
|
|
oill $mask80,0x8080
|
|
oill $mask1b,0x1b1b
|
|
oill $maskfe,0xfefe
|
|
|
|
.align 16
|
|
.Lmix: l $s0,16($key) # tp1
|
|
lr $s1,$s0
|
|
ngr $s1,$mask80
|
|
srlg $t1,$s1,7
|
|
slr $s1,$t1
|
|
nr $s1,$mask1b
|
|
sllg $t1,$s0,1
|
|
nr $t1,$maskfe
|
|
xr $s1,$t1 # tp2
|
|
|
|
lr $s2,$s1
|
|
ngr $s2,$mask80
|
|
srlg $t1,$s2,7
|
|
slr $s2,$t1
|
|
nr $s2,$mask1b
|
|
sllg $t1,$s1,1
|
|
nr $t1,$maskfe
|
|
xr $s2,$t1 # tp4
|
|
|
|
lr $s3,$s2
|
|
ngr $s3,$mask80
|
|
srlg $t1,$s3,7
|
|
slr $s3,$t1
|
|
nr $s3,$mask1b
|
|
sllg $t1,$s2,1
|
|
nr $t1,$maskfe
|
|
xr $s3,$t1 # tp8
|
|
|
|
xr $s1,$s0 # tp2^tp1
|
|
xr $s2,$s0 # tp4^tp1
|
|
rll $s0,$s0,24 # = ROTATE(tp1,8)
|
|
xr $s2,$s3 # ^=tp8
|
|
xr $s0,$s1 # ^=tp2^tp1
|
|
xr $s1,$s3 # tp2^tp1^tp8
|
|
xr $s0,$s2 # ^=tp4^tp1^tp8
|
|
rll $s1,$s1,8
|
|
rll $s2,$s2,16
|
|
xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
|
|
rll $s3,$s3,24
|
|
xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
|
|
xr $s0,$s3 # ^= ROTATE(tp8,8)
|
|
|
|
st $s0,16($key)
|
|
la $key,4($key)
|
|
brct $rounds,.Lmix
|
|
|
|
lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
|
|
lghi %r2,0
|
|
br $ra
|
|
.size AES_set_decrypt_key,.-AES_set_decrypt_key
|
|
___
|
|
|
|
########################################################################
|
|
# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
|
|
# size_t length, const AES_KEY *key,
|
|
# unsigned char *ivec, const int enc)
|
|
{
|
|
my $inp="%r2";
|
|
my $out="%r4"; # length and out are swapped
|
|
my $len="%r3";
|
|
my $key="%r5";
|
|
my $ivp="%r6";
|
|
|
|
# Entry point of AES_cbc_encrypt.  The three XORs exchange %r3 and %r4
# without a scratch register, so that from here on $len is %r3 and $out is
# %r4, matching the register aliases declared for this scope.
$code.=<<___;
|
|
.globl AES_cbc_encrypt
|
|
.type AES_cbc_encrypt,\@function
|
|
.align 16
|
|
AES_cbc_encrypt:
|
|
xgr %r3,%r4 # flip %r3 and %r4, out and len
|
|
xgr %r4,%r3
|
|
xgr %r3,%r4
|
|
___
|
|
$code.=<<___ if (!$softonly);
|
|
lhi %r0,16
|
|
cl %r0,240($key)
|
|
jh .Lcbc_software
|
|
|
|
lg %r0,0($ivp) # copy ivec
|
|
lg %r1,8($ivp)
|
|
stmg %r0,%r1,16($sp)
|
|
lmg %r0,%r1,0($key) # copy key, cover 256 bit
|
|
stmg %r0,%r1,32($sp)
|
|
lmg %r0,%r1,16($key)
|
|
stmg %r0,%r1,48($sp)
|
|
l %r0,240($key) # load kmc code
|
|
lghi $key,15 # res=len%16, len-=res;
|
|
ngr $key,$len
|
|
sl${g}r $len,$key
|
|
la %r1,16($sp) # parameter block - ivec || key
|
|
jz .Lkmc_truncated
|
|
.long 0xb92f0042 # kmc %r4,%r2
|
|
brc 1,.-4 # pay attention to "partial completion"
|
|
ltr $key,$key
|
|
jnz .Lkmc_truncated
|
|
.Lkmc_done:
|
|
lmg %r0,%r1,16($sp) # copy ivec to caller
|
|
stg %r0,0($ivp)
|
|
stg %r1,8($ivp)
|
|
br $ra
|
|
.align 16
|
|
.Lkmc_truncated:
|
|
ahi $key,-1 # it's the way it's encoded in mvc
|
|
tmll %r0,0x80
|
|
jnz .Lkmc_truncated_dec
|
|
lghi %r1,0
|
|
stg %r1,16*$SIZE_T($sp)
|
|
stg %r1,16*$SIZE_T+8($sp)
|
|
bras %r1,1f
|
|
mvc 16*$SIZE_T(1,$sp),0($inp)
|
|
1: ex $key,0(%r1)
|
|
la %r1,16($sp) # restore parameter block
|
|
la $inp,16*$SIZE_T($sp)
|
|
lghi $len,16
|
|
.long 0xb92f0042 # kmc %r4,%r2
|
|
j .Lkmc_done
|
|
.align 16
|
|
.Lkmc_truncated_dec:
|
|
st${g} $out,4*$SIZE_T($sp)
|
|
la $out,16*$SIZE_T($sp)
|
|
lghi $len,16
|
|
.long 0xb92f0042 # kmc %r4,%r2
|
|
l${g} $out,4*$SIZE_T($sp)
|
|
bras %r1,2f
|
|
mvc 0(1,$out),16*$SIZE_T($sp)
|
|
2: ex $key,0(%r1)
|
|
j .Lkmc_done
|
|
.align 16
|
|
.Lcbc_software:
|
|
___
|
|
$code.=<<___;
|
|
stm${g} $key,$ra,5*$SIZE_T($sp)
|
|
lhi %r0,0
|
|
cl %r0,`$stdframe+$SIZE_T-4`($sp)
|
|
je .Lcbc_decrypt
|
|
|
|
larl $tbl,AES_Te
|
|
|
|
llgf $s0,0($ivp)
|
|
llgf $s1,4($ivp)
|
|
llgf $s2,8($ivp)
|
|
llgf $s3,12($ivp)
|
|
|
|
lghi $t0,16
|
|
sl${g}r $len,$t0
|
|
brc 4,.Lcbc_enc_tail # if borrow
|
|
.Lcbc_enc_loop:
|
|
stm${g} $inp,$out,2*$SIZE_T($sp)
|
|
x $s0,0($inp)
|
|
x $s1,4($inp)
|
|
x $s2,8($inp)
|
|
x $s3,12($inp)
|
|
lgr %r4,$key
|
|
|
|
bras $ra,_s390x_AES_encrypt
|
|
|
|
lm${g} $inp,$key,2*$SIZE_T($sp)
|
|
st $s0,0($out)
|
|
st $s1,4($out)
|
|
st $s2,8($out)
|
|
st $s3,12($out)
|
|
|
|
la $inp,16($inp)
|
|
la $out,16($out)
|
|
lghi $t0,16
|
|
lt${g}r $len,$len
|
|
jz .Lcbc_enc_done
|
|
sl${g}r $len,$t0
|
|
brc 4,.Lcbc_enc_tail # if borrow
|
|
j .Lcbc_enc_loop
|
|
.align 16
|
|
.Lcbc_enc_done:
|
|
l${g} $ivp,6*$SIZE_T($sp)
|
|
st $s0,0($ivp)
|
|
st $s1,4($ivp)
|
|
st $s2,8($ivp)
|
|
st $s3,12($ivp)
|
|
|
|
lm${g} %r7,$ra,7*$SIZE_T($sp)
|
|
br $ra
|
|
|
|
.align 16
|
|
.Lcbc_enc_tail:
|
|
aghi $len,15
|
|
lghi $t0,0
|
|
stg $t0,16*$SIZE_T($sp)
|
|
stg $t0,16*$SIZE_T+8($sp)
|
|
bras $t1,3f
|
|
mvc 16*$SIZE_T(1,$sp),0($inp)
|
|
3: ex $len,0($t1)
|
|
lghi $len,0
|
|
la $inp,16*$SIZE_T($sp)
|
|
j .Lcbc_enc_loop
|
|
|
|
.align 16
|
|
.Lcbc_decrypt:
|
|
larl $tbl,AES_Td
|
|
|
|
lg $t0,0($ivp)
|
|
lg $t1,8($ivp)
|
|
stmg $t0,$t1,16*$SIZE_T($sp)
|
|
|
|
.Lcbc_dec_loop:
|
|
stm${g} $inp,$out,2*$SIZE_T($sp)
|
|
llgf $s0,0($inp)
|
|
llgf $s1,4($inp)
|
|
llgf $s2,8($inp)
|
|
llgf $s3,12($inp)
|
|
lgr %r4,$key
|
|
|
|
bras $ra,_s390x_AES_decrypt
|
|
|
|
lm${g} $inp,$key,2*$SIZE_T($sp)
|
|
sllg $s0,$s0,32
|
|
sllg $s2,$s2,32
|
|
lr $s0,$s1
|
|
lr $s2,$s3
|
|
|
|
lg $t0,0($inp)
|
|
lg $t1,8($inp)
|
|
xg $s0,16*$SIZE_T($sp)
|
|
xg $s2,16*$SIZE_T+8($sp)
|
|
lghi $s1,16
|
|
sl${g}r $len,$s1
|
|
brc 4,.Lcbc_dec_tail # if borrow
|
|
brc 2,.Lcbc_dec_done # if zero
|
|
stg $s0,0($out)
|
|
stg $s2,8($out)
|
|
stmg $t0,$t1,16*$SIZE_T($sp)
|
|
|
|
la $inp,16($inp)
|
|
la $out,16($out)
|
|
j .Lcbc_dec_loop
|
|
|
|
.Lcbc_dec_done:
|
|
stg $s0,0($out)
|
|
stg $s2,8($out)
|
|
.Lcbc_dec_exit:
|
|
lm${g} %r6,$ra,6*$SIZE_T($sp)
|
|
stmg $t0,$t1,0($ivp)
|
|
|
|
br $ra
|
|
|
|
.align 16
|
|
.Lcbc_dec_tail:
|
|
aghi $len,15
|
|
stg $s0,16*$SIZE_T($sp)
|
|
stg $s2,16*$SIZE_T+8($sp)
|
|
bras $s1,4f
|
|
mvc 0(1,$out),16*$SIZE_T($sp)
|
|
4: ex $len,0($s1)
|
|
j .Lcbc_dec_exit
|
|
.size AES_cbc_encrypt,.-AES_cbc_encrypt
|
|
___
|
|
}
|
|
########################################################################
|
|
# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
|
|
# size_t blocks, const AES_KEY *key,
|
|
# const unsigned char *ivec)
|
|
{
|
|
my $inp="%r2";
|
|
my $out="%r4"; # blocks and out are swapped
|
|
my $len="%r3";
|
|
my $key="%r5"; my $iv0="%r5";
|
|
my $ivp="%r6";
|
|
my $fp ="%r7";
|
|
|
|
$code.=<<___;
|
|
.globl AES_ctr32_encrypt
|
|
.type AES_ctr32_encrypt,\@function
|
|
.align 16
|
|
AES_ctr32_encrypt:
|
|
xgr %r3,%r4 # flip %r3 and %r4, $out and $len
|
|
xgr %r4,%r3
|
|
xgr %r3,%r4
|
|
llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
|
|
___
|
|
$code.=<<___ if (!$softonly);
|
|
l %r0,240($key)
|
|
lhi %r1,16
|
|
clr %r0,%r1
|
|
jl .Lctr32_software
|
|
|
|
stm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
|
|
slgr $out,$inp
|
|
la %r1,0($key) # %r1 is permanent copy of $key
|
|
lg $iv0,0($ivp) # load ivec
|
|
lg $ivp,8($ivp)
|
|
|
|
# prepare and allocate stack frame at the top of 4K page
|
|
# with 1K reserved for eventual signal handling
|
|
lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
|
|
lghi $s1,-4096
|
|
algr $s0,$sp
|
|
lgr $fp,$sp
|
|
ngr $s0,$s1 # align at page boundary
|
|
slgr $fp,$s0 # total buffer size
|
|
lgr $s2,$sp
|
|
lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
|
|
slgr $fp,$s1 # deduct reservation to get usable buffer size
|
|
# buffer size is at least 256 and at most 3072+256-16
|
|
|
|
la $sp,1024($s0) # alloca
|
|
srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
|
|
st${g} $s2,0($sp) # back-chain
|
|
st${g} $fp,$SIZE_T($sp)
|
|
|
|
slgr $len,$fp
|
|
brc 1,.Lctr32_hw_switch # not zero, no borrow
|
|
algr $fp,$len # input is shorter than allocated buffer
|
|
lghi $len,0
|
|
st${g} $fp,$SIZE_T($sp)
|
|
|
|
.Lctr32_hw_switch:
|
|
___
|
|
# Disabled kmctr variant of the hardware CTR path.  The trailing "&& 0"
# makes the condition always false, so this heredoc is never appended to
# $code; it is kept in the source for reference only.
#
# What the text would do if enabled:
#  * save %r0/%r1, shift a single top bit right by the function code and
#    AND it with the S390X_KMCTR field of OPENSSL_s390xcap_P; when the bit
#    is clear, fall back to .Lctr32_km_loop;
#  * otherwise fill the buffer at 16($sp) with $fp consecutive counter
#    blocks ($iv0 plus $ivp, with a 32-bit increment of $ivp per block)
#    and run kmctr over $fp*16 bytes, repeating until the block count
#    tracked in $s1 is exhausted;
#  * finally reload the stack pointer from the back-chain slot, restore
#    the saved registers and return.
$code.=<<___ if (!$softonly && 0);# kmctr code was measured to be ~12% slower
|
|
llgfr $s0,%r0
|
|
lgr $s1,%r1
|
|
larl %r1,OPENSSL_s390xcap_P
|
|
llihh %r0,0x8000 # check if kmctr supports the function code
|
|
srlg %r0,%r0,0($s0)
|
|
ng %r0,S390X_KMCTR(%r1) # check kmctr capability vector
|
|
lgr %r0,$s0
|
|
lgr %r1,$s1
|
|
jz .Lctr32_km_loop
|
|
|
|
####### kmctr code
|
|
algr $out,$inp # restore $out
|
|
lgr $s1,$len # $s1 undertakes $len
|
|
j .Lctr32_kmctr_loop
|
|
.align 16
|
|
.Lctr32_kmctr_loop:
|
|
la $s2,16($sp)
|
|
lgr $s3,$fp
|
|
.Lctr32_kmctr_prepare:
|
|
stg $iv0,0($s2)
|
|
stg $ivp,8($s2)
|
|
la $s2,16($s2)
|
|
ahi $ivp,1 # 32-bit increment, preserves upper half
|
|
brct $s3,.Lctr32_kmctr_prepare
|
|
|
|
#la $inp,0($inp) # inp
|
|
sllg $len,$fp,4 # len
|
|
#la $out,0($out) # out
|
|
la $s2,16($sp) # iv
|
|
.long 0xb92da042 # kmctr $out,$s2,$inp
|
|
brc 1,.-4 # pay attention to "partial completion"
|
|
|
|
slgr $s1,$fp
|
|
brc 1,.Lctr32_kmctr_loop # not zero, no borrow
|
|
algr $fp,$s1
|
|
lghi $s1,0
|
|
brc 4+1,.Lctr32_kmctr_loop # not zero
|
|
|
|
l${g} $sp,0($sp)
|
|
lm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
br $ra
|
|
.align 16
|
|
___
|
|
$code.=<<___ if (!$softonly);
|
|
.Lctr32_km_loop:
|
|
la $s2,16($sp)
|
|
lgr $s3,$fp
|
|
.Lctr32_km_prepare:
|
|
stg $iv0,0($s2)
|
|
stg $ivp,8($s2)
|
|
la $s2,16($s2)
|
|
ahi $ivp,1 # 32-bit increment, preserves upper half
|
|
brct $s3,.Lctr32_km_prepare
|
|
|
|
la $s0,16($sp) # inp
|
|
sllg $s1,$fp,4 # len
|
|
la $s2,16($sp) # out
|
|
.long 0xb92e00a8 # km %r10,%r8
|
|
brc 1,.-4 # pay attention to "partial completion"
|
|
|
|
la $s2,16($sp)
|
|
lgr $s3,$fp
|
|
slgr $s2,$inp
|
|
.Lctr32_km_xor:
|
|
lg $s0,0($inp)
|
|
lg $s1,8($inp)
|
|
xg $s0,0($s2,$inp)
|
|
xg $s1,8($s2,$inp)
|
|
stg $s0,0($out,$inp)
|
|
stg $s1,8($out,$inp)
|
|
la $inp,16($inp)
|
|
brct $s3,.Lctr32_km_xor
|
|
|
|
slgr $len,$fp
|
|
brc 1,.Lctr32_km_loop # not zero, no borrow
|
|
algr $fp,$len
|
|
lghi $len,0
|
|
brc 4+1,.Lctr32_km_loop # not zero
|
|
|
|
l${g} $s0,0($sp)
|
|
l${g} $s1,$SIZE_T($sp)
|
|
la $s2,16($sp)
|
|
.Lctr32_km_zap:
|
|
stg $s0,0($s2)
|
|
stg $s0,8($s2)
|
|
la $s2,16($s2)
|
|
brct $s1,.Lctr32_km_zap
|
|
|
|
la $sp,0($s0)
|
|
lm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
br $ra
|
|
.align 16
|
|
.Lctr32_software:
|
|
___
|
|
# Software CTR path of AES_ctr32_encrypt (emitted unconditionally).
#  * Registers from $key through $ra are saved in the caller-provided frame.
#  * $inp is turned into the offset inp-out, so input bytes are addressed
#    as ($inp,$out) while $out alone advances.
#  * $t1 holds the 32-bit counter word, initially loaded from 12($ivp).
# Each .Lctr32_loop iteration loads the first three counter-block words from
# $ivp, uses $t1 as the fourth, calls _s390x_AES_encrypt, XORs the result
# with 16 input bytes, stores 16 bytes at $out and increments $t1.  $t1 is
# spilled to 16*$SIZE_T($sp) across the call.  $len counts blocks (brct).
$code.=<<___;
|
|
stm${g} $key,$ra,5*$SIZE_T($sp)
|
|
sl${g}r $inp,$out
|
|
larl $tbl,AES_Te
|
|
llgf $t1,12($ivp)
|
|
|
|
.Lctr32_loop:
|
|
stm${g} $inp,$out,2*$SIZE_T($sp)
|
|
llgf $s0,0($ivp)
|
|
llgf $s1,4($ivp)
|
|
llgf $s2,8($ivp)
|
|
lgr $s3,$t1
|
|
st $t1,16*$SIZE_T($sp)
|
|
lgr %r4,$key
|
|
|
|
bras $ra,_s390x_AES_encrypt
|
|
|
|
lm${g} $inp,$ivp,2*$SIZE_T($sp)
|
|
llgf $t1,16*$SIZE_T($sp)
|
|
x $s0,0($inp,$out)
|
|
x $s1,4($inp,$out)
|
|
x $s2,8($inp,$out)
|
|
x $s3,12($inp,$out)
|
|
stm $s0,$s3,0($out)
|
|
|
|
la $out,16($out)
|
|
ahi $t1,1 # 32-bit increment
|
|
brct $len,.Lctr32_loop
|
|
|
|
lm${g} %r6,$ra,6*$SIZE_T($sp)
|
|
br $ra
|
|
.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
|
|
___
|
|
}
|
|
|
|
########################################################################
|
|
# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
|
|
# size_t len, const AES_KEY *key1, const AES_KEY *key2,
|
|
# const unsigned char iv[16]);
|
|
#
|
|
{
|
|
my $inp="%r2";
|
|
my $out="%r4"; # len and out are swapped
|
|
my $len="%r3";
|
|
my $key1="%r5"; # $i1
|
|
my $key2="%r6"; # $i2
|
|
my $fp="%r7"; # $i3
|
|
my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
|
|
|
|
$code.=<<___;
|
|
.type _s390x_xts_km,\@function
|
|
.align 16
|
|
_s390x_xts_km:
|
|
___
|
|
$code.=<<___ if(1);
|
|
llgfr $s0,%r0 # put aside the function code
|
|
lghi $s1,0x7f
|
|
nr $s1,%r0
|
|
larl %r1,OPENSSL_s390xcap_P
|
|
llihh %r0,0x8000
|
|
srlg %r0,%r0,32($s1) # check for 32+function code
|
|
ng %r0,S390X_KM(%r1) # check km capability vector
|
|
lgr %r0,$s0 # restore the function code
|
|
la %r1,0($key1) # restore $key1
|
|
jz .Lxts_km_vanilla
|
|
|
|
lmg $i2,$i3,$tweak($sp) # put aside the tweak value
|
|
algr $out,$inp
|
|
|
|
oill %r0,32 # switch to xts function code
|
|
aghi $s1,-18 #
|
|
sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
|
|
la %r1,$tweak-16($sp)
|
|
slgr %r1,$s1 # parameter block position
|
|
lmg $s0,$s3,0($key1) # load 256 bits of key material,
|
|
stmg $s0,$s3,0(%r1) # and copy it to parameter block.
|
|
# yes, it contains junk and overlaps
|
|
# with the tweak in 128-bit case.
|
|
# it's done to avoid conditional
|
|
# branch.
|
|
stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
|
|
|
|
.long 0xb92e0042 # km %r4,%r2
|
|
brc 1,.-4 # pay attention to "partial completion"
|
|
|
|
lrvg $s0,$tweak+0($sp) # load the last tweak
|
|
lrvg $s1,$tweak+8($sp)
|
|
stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
|
|
|
|
nill %r0,0xffdf # switch back to original function code
|
|
la %r1,0($key1) # restore pointer to $key1
|
|
slgr $out,$inp
|
|
|
|
llgc $len,2*$SIZE_T-1($sp)
|
|
nill $len,0x0f # $len%=16
|
|
br $ra
|
|
|
|
.align 16
|
|
.Lxts_km_vanilla:
|
|
___
|
|
$code.=<<___;
|
|
# prepare and allocate stack frame at the top of 4K page
|
|
# with 1K reserved for eventual signal handling
|
|
lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
|
|
lghi $s1,-4096
|
|
algr $s0,$sp
|
|
lgr $fp,$sp
|
|
ngr $s0,$s1 # align at page boundary
|
|
slgr $fp,$s0 # total buffer size
|
|
lgr $s2,$sp
|
|
lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
|
|
slgr $fp,$s1 # deduct reservation to get usable buffer size
|
|
# buffer size is at least 256 and at most 3072+256-16
|
|
|
|
la $sp,1024($s0) # alloca
|
|
nill $fp,0xfff0 # round to 16*n
|
|
st${g} $s2,0($sp) # back-chain
|
|
nill $len,0xfff0 # redundant
|
|
st${g} $fp,$SIZE_T($sp)
|
|
|
|
slgr $len,$fp
|
|
brc 1,.Lxts_km_go # not zero, no borrow
|
|
algr $fp,$len # input is shorter than allocated buffer
|
|
lghi $len,0
|
|
st${g} $fp,$SIZE_T($sp)
|
|
|
|
.Lxts_km_go:
|
|
lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
|
|
lrvg $s1,$tweak+8($s2)
|
|
|
|
la $s2,16($sp) # vector of ascending tweak values
|
|
slgr $s2,$inp
|
|
srlg $s3,$fp,4
|
|
j .Lxts_km_start
|
|
|
|
.Lxts_km_loop:
|
|
la $s2,16($sp)
|
|
slgr $s2,$inp
|
|
srlg $s3,$fp,4
|
|
.Lxts_km_prepare:
|
|
lghi $i1,0x87
|
|
srag $i2,$s1,63 # broadcast upper bit
|
|
ngr $i1,$i2 # rem
|
|
algr $s0,$s0
|
|
alcgr $s1,$s1
|
|
xgr $s0,$i1
|
|
.Lxts_km_start:
|
|
lrvgr $i1,$s0 # flip byte order
|
|
lrvgr $i2,$s1
|
|
stg $i1,0($s2,$inp)
|
|
stg $i2,8($s2,$inp)
|
|
xg $i1,0($inp)
|
|
xg $i2,8($inp)
|
|
stg $i1,0($out,$inp)
|
|
stg $i2,8($out,$inp)
|
|
la $inp,16($inp)
|
|
brct $s3,.Lxts_km_prepare
|
|
|
|
slgr $inp,$fp # rewind $inp
|
|
la $s2,0($out,$inp)
|
|
lgr $s3,$fp
|
|
.long 0xb92e00aa # km $s2,$s2
|
|
brc 1,.-4 # pay attention to "partial completion"
|
|
|
|
la $s2,16($sp)
|
|
slgr $s2,$inp
|
|
srlg $s3,$fp,4
|
|
.Lxts_km_xor:
|
|
lg $i1,0($out,$inp)
|
|
lg $i2,8($out,$inp)
|
|
xg $i1,0($s2,$inp)
|
|
xg $i2,8($s2,$inp)
|
|
stg $i1,0($out,$inp)
|
|
stg $i2,8($out,$inp)
|
|
la $inp,16($inp)
|
|
brct $s3,.Lxts_km_xor
|
|
|
|
slgr $len,$fp
|
|
brc 1,.Lxts_km_loop # not zero, no borrow
|
|
algr $fp,$len
|
|
lghi $len,0
|
|
brc 4+1,.Lxts_km_loop # not zero
|
|
|
|
l${g} $i1,0($sp) # back-chain
|
|
llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
|
|
la $i2,16($sp)
|
|
srlg $fp,$fp,4
|
|
.Lxts_km_zap:
|
|
stg $i1,0($i2)
|
|
stg $i1,8($i2)
|
|
la $i2,16($i2)
|
|
brct $fp,.Lxts_km_zap
|
|
|
|
la $sp,0($i1)
|
|
llgc $len,2*$SIZE_T-1($i1)
|
|
nill $len,0x0f # $len%=16
|
|
bzr $ra
|
|
|
|
# generate one more tweak...
|
|
lghi $i1,0x87
|
|
srag $i2,$s1,63 # broadcast upper bit
|
|
ngr $i1,$i2 # rem
|
|
algr $s0,$s0
|
|
alcgr $s1,$s1
|
|
xgr $s0,$i1
|
|
|
|
ltr $len,$len # clear zero flag
|
|
br $ra
|
|
.size _s390x_xts_km,.-_s390x_xts_km
|
|
|
|
.globl AES_xts_encrypt
|
|
.type AES_xts_encrypt,\@function
|
|
.align 16
|
|
AES_xts_encrypt:
|
|
xgr %r3,%r4 # flip %r3 and %r4, $out and $len
|
|
xgr %r4,%r3
|
|
xgr %r3,%r4
|
|
___
|
|
$code.=<<___ if ($SIZE_T==4);
|
|
llgfr $len,$len
|
|
___
|
|
$code.=<<___;
|
|
st${g} $len,1*$SIZE_T($sp) # save copy of $len
|
|
srag $len,$len,4 # formally wrong, because it expands
|
|
# sign byte, but who can afford asking
|
|
# to process more than 2^63-1 bytes?
|
|
# I use it, because it sets condition
|
|
# code...
|
|
bcr 8,$ra # abort if zero (i.e. less than 16)
|
|
___
|
|
$code.=<<___ if (!$softonly);
|
|
llgf %r0,240($key2)
|
|
lhi %r1,16
|
|
clr %r0,%r1
|
|
jl .Lxts_enc_software
|
|
|
|
st${g} $ra,5*$SIZE_T($sp)
|
|
stm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
|
|
sllg $len,$len,4 # $len&=~15
|
|
slgr $out,$inp
|
|
|
|
# generate the tweak value
|
|
l${g} $s3,$stdframe($sp) # pointer to iv
|
|
la $s2,$tweak($sp)
|
|
lmg $s0,$s1,0($s3)
|
|
lghi $s3,16
|
|
stmg $s0,$s1,0($s2)
|
|
la %r1,0($key2) # $key2 is not needed anymore
|
|
.long 0xb92e00aa # km $s2,$s2, generate the tweak
|
|
brc 1,.-4 # can this happen?
|
|
|
|
l %r0,240($key1)
|
|
la %r1,0($key1) # $key1 is not needed anymore
|
|
bras $ra,_s390x_xts_km
|
|
jz .Lxts_enc_km_done
|
|
|
|
aghi $inp,-16 # take one step back
|
|
la $i3,0($out,$inp) # put aside real $out
|
|
.Lxts_enc_km_steal:
|
|
llgc $i1,16($inp)
|
|
llgc $i2,0($out,$inp)
|
|
stc $i1,0($out,$inp)
|
|
stc $i2,16($out,$inp)
|
|
la $inp,1($inp)
|
|
brct $len,.Lxts_enc_km_steal
|
|
|
|
la $s2,0($i3)
|
|
lghi $s3,16
|
|
lrvgr $i1,$s0 # flip byte order
|
|
lrvgr $i2,$s1
|
|
xg $i1,0($s2)
|
|
xg $i2,8($s2)
|
|
stg $i1,0($s2)
|
|
stg $i2,8($s2)
|
|
.long 0xb92e00aa # km $s2,$s2
|
|
brc 1,.-4 # can this happen?
|
|
lrvgr $i1,$s0 # flip byte order
|
|
lrvgr $i2,$s1
|
|
xg $i1,0($i3)
|
|
xg $i2,8($i3)
|
|
stg $i1,0($i3)
|
|
stg $i2,8($i3)
|
|
|
|
.Lxts_enc_km_done:
|
|
stg $sp,$tweak+0($sp) # wipe tweak
|
|
stg $sp,$tweak+8($sp)
|
|
l${g} $ra,5*$SIZE_T($sp)
|
|
lm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
br $ra
|
|
.align 16
|
|
.Lxts_enc_software:
|
|
___
|
|
$code.=<<___;
|
|
stm${g} %r6,$ra,6*$SIZE_T($sp)
|
|
|
|
slgr $out,$inp
|
|
|
|
l${g} $s3,$stdframe($sp) # ivp
|
|
llgf $s0,0($s3) # load iv
|
|
llgf $s1,4($s3)
|
|
llgf $s2,8($s3)
|
|
llgf $s3,12($s3)
|
|
stm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
la $key,0($key2)
|
|
larl $tbl,AES_Te
|
|
bras $ra,_s390x_AES_encrypt # generate the tweak
|
|
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
stm $s0,$s3,$tweak($sp) # save the tweak
|
|
j .Lxts_enc_enter
|
|
|
|
.align 16
|
|
.Lxts_enc_loop:
|
|
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
|
lrvg $s3,$tweak+8($sp)
|
|
lghi %r1,0x87
|
|
srag %r0,$s3,63 # broadcast upper bit
|
|
ngr %r1,%r0 # rem
|
|
algr $s1,$s1
|
|
alcgr $s3,$s3
|
|
xgr $s1,%r1
|
|
lrvgr $s1,$s1 # flip byte order
|
|
lrvgr $s3,$s3
|
|
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
|
stg $s1,$tweak+0($sp) # save the tweak
|
|
llgfr $s1,$s1
|
|
srlg $s2,$s3,32
|
|
stg $s3,$tweak+8($sp)
|
|
llgfr $s3,$s3
|
|
la $inp,16($inp) # $inp+=16
|
|
.Lxts_enc_enter:
|
|
x $s0,0($inp) # ^=*($inp)
|
|
x $s1,4($inp)
|
|
x $s2,8($inp)
|
|
x $s3,12($inp)
|
|
stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
|
|
la $key,0($key1)
|
|
bras $ra,_s390x_AES_encrypt
|
|
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
x $s0,$tweak+0($sp) # ^=tweak
|
|
x $s1,$tweak+4($sp)
|
|
x $s2,$tweak+8($sp)
|
|
x $s3,$tweak+12($sp)
|
|
st $s0,0($out,$inp)
|
|
st $s1,4($out,$inp)
|
|
st $s2,8($out,$inp)
|
|
st $s3,12($out,$inp)
|
|
brct${g} $len,.Lxts_enc_loop
|
|
|
|
llgc $len,`2*$SIZE_T-1`($sp)
|
|
nill $len,0x0f # $len%16
|
|
jz .Lxts_enc_done
|
|
|
|
la $i3,0($inp,$out) # put aside real $out
|
|
.Lxts_enc_steal:
|
|
llgc %r0,16($inp)
|
|
llgc %r1,0($out,$inp)
|
|
stc %r0,0($out,$inp)
|
|
stc %r1,16($out,$inp)
|
|
la $inp,1($inp)
|
|
brct $len,.Lxts_enc_steal
|
|
la $out,0($i3) # restore real $out
|
|
|
|
# generate last tweak...
|
|
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
|
lrvg $s3,$tweak+8($sp)
|
|
lghi %r1,0x87
|
|
srag %r0,$s3,63 # broadcast upper bit
|
|
ngr %r1,%r0 # rem
|
|
algr $s1,$s1
|
|
alcgr $s3,$s3
|
|
xgr $s1,%r1
|
|
lrvgr $s1,$s1 # flip byte order
|
|
lrvgr $s3,$s3
|
|
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
|
stg $s1,$tweak+0($sp) # save the tweak
|
|
llgfr $s1,$s1
|
|
srlg $s2,$s3,32
|
|
stg $s3,$tweak+8($sp)
|
|
llgfr $s3,$s3
|
|
|
|
x $s0,0($out) # ^=*(inp)|stolen cipher-text
|
|
x $s1,4($out)
|
|
x $s2,8($out)
|
|
x $s3,12($out)
|
|
st${g} $out,4*$SIZE_T($sp)
|
|
la $key,0($key1)
|
|
bras $ra,_s390x_AES_encrypt
|
|
l${g} $out,4*$SIZE_T($sp)
|
|
x $s0,`$tweak+0`($sp) # ^=tweak
|
|
x $s1,`$tweak+4`($sp)
|
|
x $s2,`$tweak+8`($sp)
|
|
x $s3,`$tweak+12`($sp)
|
|
st $s0,0($out)
|
|
st $s1,4($out)
|
|
st $s2,8($out)
|
|
st $s3,12($out)
|
|
|
|
.Lxts_enc_done:
|
|
stg $sp,$tweak+0($sp) # wipe tweak, first 8 bytes
|
|
stg $sp,$tweak+8($sp) # wipe tweak, second 8 bytes
|
|
lm${g} %r6,$ra,6*$SIZE_T($sp) # reload the registers saved at entry
|
|
br $ra
|
|
.size AES_xts_encrypt,.-AES_xts_encrypt
|
|
___
|
|
# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
|
|
# size_t len, const AES_KEY *key1, const AES_KEY *key2,
|
|
# const unsigned char iv[16]);
|
|
#
|
|
$code.=<<___;
|
|
.globl AES_xts_decrypt
|
|
.type AES_xts_decrypt,\@function
|
|
.align 16
|
|
AES_xts_decrypt:
|
|
xgr %r3,%r4 # flip %r3 and %r4, $out and $len
|
|
xgr %r4,%r3
|
|
xgr %r3,%r4
|
|
___
|
|
$code.=<<___ if ($SIZE_T==4);
|
|
llgfr $len,$len
|
|
___
|
|
$code.=<<___;
|
|
st${g} $len,1*$SIZE_T($sp) # save copy of $len
|
|
aghi $len,-16
|
|
bcr 4,$ra # abort if less than zero. formally
|
|
# wrong, because $len is unsigned,
|
|
# but who can afford asking to
|
|
# process more than 2^63-1 bytes?
|
|
tmll $len,0x0f
|
|
jnz .Lxts_dec_proceed
|
|
aghi $len,16
|
|
.Lxts_dec_proceed:
|
|
___
|
|
$code.=<<___ if (!$softonly);
|
|
llgf %r0,240($key2)
|
|
lhi %r1,16
|
|
clr %r0,%r1
|
|
jl .Lxts_dec_software
|
|
|
|
st${g} $ra,5*$SIZE_T($sp)
|
|
stm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
|
|
nill $len,0xfff0 # $len&=~15
|
|
slgr $out,$inp
|
|
|
|
# generate the tweak value
|
|
l${g} $s3,$stdframe($sp) # pointer to iv
|
|
la $s2,$tweak($sp)
|
|
lmg $s0,$s1,0($s3)
|
|
lghi $s3,16
|
|
stmg $s0,$s1,0($s2)
|
|
la %r1,0($key2) # $key2 is not needed past this point
|
|
.long 0xb92e00aa # km $s2,$s2, generate the tweak
|
|
brc 1,.-4 # can this happen?
|
|
|
|
l %r0,240($key1)
|
|
la %r1,0($key1) # $key1 is not needed anymore
|
|
|
|
ltgr $len,$len
|
|
jz .Lxts_dec_km_short
|
|
bras $ra,_s390x_xts_km
|
|
jz .Lxts_dec_km_done
|
|
|
|
lrvgr $s2,$s0 # make copy in reverse byte order
|
|
lrvgr $s3,$s1
|
|
j .Lxts_dec_km_2ndtweak
|
|
|
|
.Lxts_dec_km_short:
|
|
llgc $len,`2*$SIZE_T-1`($sp)
|
|
nill $len,0x0f # $len%=16
|
|
lrvg $s0,$tweak+0($sp) # load the tweak
|
|
lrvg $s1,$tweak+8($sp)
|
|
lrvgr $s2,$s0 # make copy in reverse byte order
|
|
lrvgr $s3,$s1
|
|
|
|
.Lxts_dec_km_2ndtweak:
|
|
lghi $i1,0x87
|
|
srag $i2,$s1,63 # broadcast upper bit
|
|
ngr $i1,$i2 # rem
|
|
algr $s0,$s0
|
|
alcgr $s1,$s1
|
|
xgr $s0,$i1
|
|
lrvgr $i1,$s0 # flip byte order
|
|
lrvgr $i2,$s1
|
|
|
|
xg $i1,0($inp)
|
|
xg $i2,8($inp)
|
|
stg $i1,0($out,$inp)
|
|
stg $i2,8($out,$inp)
|
|
la $i2,0($out,$inp)
|
|
lghi $i3,16
|
|
.long 0xb92e0066 # km $i2,$i2
|
|
brc 1,.-4 # can this happen?
|
|
lrvgr $i1,$s0
|
|
lrvgr $i2,$s1
|
|
xg $i1,0($out,$inp)
|
|
xg $i2,8($out,$inp)
|
|
stg $i1,0($out,$inp)
|
|
stg $i2,8($out,$inp)
|
|
|
|
la $i3,0($out,$inp) # put aside real $out
|
|
.Lxts_dec_km_steal:
|
|
llgc $i1,16($inp)
|
|
llgc $i2,0($out,$inp)
|
|
stc $i1,0($out,$inp)
|
|
stc $i2,16($out,$inp)
|
|
la $inp,1($inp)
|
|
brct $len,.Lxts_dec_km_steal
|
|
|
|
lgr $s0,$s2
|
|
lgr $s1,$s3
|
|
xg $s0,0($i3)
|
|
xg $s1,8($i3)
|
|
stg $s0,0($i3)
|
|
stg $s1,8($i3)
|
|
la $s0,0($i3)
|
|
lghi $s1,16
|
|
.long 0xb92e0088 # km $s0,$s0
|
|
brc 1,.-4 # can this happen?
|
|
xg $s2,0($i3)
|
|
xg $s3,8($i3)
|
|
stg $s2,0($i3)
|
|
stg $s3,8($i3)
|
|
.Lxts_dec_km_done:
|
|
stg $sp,$tweak+0($sp) # wipe tweak
|
|
stg $sp,$tweak+8($sp)
|
|
l${g} $ra,5*$SIZE_T($sp)
|
|
lm${g} %r6,$s3,6*$SIZE_T($sp)
|
|
br $ra
|
|
.align 16
|
|
.Lxts_dec_software:
|
|
___
|
|
$code.=<<___;
|
|
stm${g} %r6,$ra,6*$SIZE_T($sp)
|
|
|
|
srlg $len,$len,4
|
|
slgr $out,$inp
|
|
|
|
l${g} $s3,$stdframe($sp) # ivp
|
|
llgf $s0,0($s3) # load iv
|
|
llgf $s1,4($s3)
|
|
llgf $s2,8($s3)
|
|
llgf $s3,12($s3)
|
|
stm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
la $key,0($key2)
|
|
larl $tbl,AES_Te
|
|
bras $ra,_s390x_AES_encrypt # generate the tweak
|
|
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
larl $tbl,AES_Td
|
|
lt${g}r $len,$len
|
|
stm $s0,$s3,$tweak($sp) # save the tweak
|
|
jz .Lxts_dec_short
|
|
j .Lxts_dec_enter
|
|
|
|
.align 16
|
|
.Lxts_dec_loop:
|
|
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
|
lrvg $s3,$tweak+8($sp)
|
|
lghi %r1,0x87
|
|
srag %r0,$s3,63 # broadcast upper bit
|
|
ngr %r1,%r0 # rem
|
|
algr $s1,$s1
|
|
alcgr $s3,$s3
|
|
xgr $s1,%r1
|
|
lrvgr $s1,$s1 # flip byte order
|
|
lrvgr $s3,$s3
|
|
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
|
stg $s1,$tweak+0($sp) # save the tweak
|
|
llgfr $s1,$s1
|
|
srlg $s2,$s3,32
|
|
stg $s3,$tweak+8($sp)
|
|
llgfr $s3,$s3
|
|
.Lxts_dec_enter:
|
|
x $s0,0($inp) # tweak^=*(inp)
|
|
x $s1,4($inp)
|
|
x $s2,8($inp)
|
|
x $s3,12($inp)
|
|
stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
|
|
la $key,0($key1)
|
|
bras $ra,_s390x_AES_decrypt
|
|
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
x $s0,$tweak+0($sp) # ^=tweak
|
|
x $s1,$tweak+4($sp)
|
|
x $s2,$tweak+8($sp)
|
|
x $s3,$tweak+12($sp)
|
|
st $s0,0($out,$inp)
|
|
st $s1,4($out,$inp)
|
|
st $s2,8($out,$inp)
|
|
st $s3,12($out,$inp)
|
|
la $inp,16($inp)
|
|
brct${g} $len,.Lxts_dec_loop
|
|
|
|
llgc $len,`2*$SIZE_T-1`($sp)
|
|
nill $len,0x0f # $len%16
|
|
jz .Lxts_dec_done
|
|
|
|
# generate pair of tweaks...
|
|
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
|
lrvg $s3,$tweak+8($sp)
|
|
lghi %r1,0x87
|
|
srag %r0,$s3,63 # broadcast upper bit
|
|
ngr %r1,%r0 # rem
|
|
algr $s1,$s1
|
|
alcgr $s3,$s3
|
|
xgr $s1,%r1
|
|
lrvgr $i2,$s1 # flip byte order
|
|
lrvgr $i3,$s3
|
|
stmg $i2,$i3,$tweak($sp) # save the 1st tweak
|
|
j .Lxts_dec_2ndtweak
|
|
|
|
.align 16
|
|
.Lxts_dec_short:
|
|
llgc $len,`2*$SIZE_T-1`($sp)
|
|
nill $len,0x0f # $len%16
|
|
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
|
lrvg $s3,$tweak+8($sp)
|
|
.Lxts_dec_2ndtweak:
|
|
lghi %r1,0x87
|
|
srag %r0,$s3,63 # broadcast upper bit
|
|
ngr %r1,%r0 # rem
|
|
algr $s1,$s1
|
|
alcgr $s3,$s3
|
|
xgr $s1,%r1
|
|
lrvgr $s1,$s1 # flip byte order
|
|
lrvgr $s3,$s3
|
|
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
|
stg $s1,$tweak-16+0($sp) # save the 2nd tweak
|
|
llgfr $s1,$s1
|
|
srlg $s2,$s3,32
|
|
stg $s3,$tweak-16+8($sp)
|
|
llgfr $s3,$s3
|
|
|
|
x $s0,0($inp) # tweak_the_2nd^=*(inp)
|
|
x $s1,4($inp)
|
|
x $s2,8($inp)
|
|
x $s3,12($inp)
|
|
stm${g} %r2,%r3,2*$SIZE_T($sp)
|
|
la $key,0($key1)
|
|
bras $ra,_s390x_AES_decrypt
|
|
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
|
x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
|
|
x $s1,$tweak-16+4($sp)
|
|
x $s2,$tweak-16+8($sp)
|
|
x $s3,$tweak-16+12($sp)
|
|
st $s0,0($out,$inp)
|
|
st $s1,4($out,$inp)
|
|
st $s2,8($out,$inp)
|
|
st $s3,12($out,$inp)
|
|
|
|
la $i3,0($out,$inp) # put aside real $out
|
|
.Lxts_dec_steal:
|
|
llgc %r0,16($inp)
|
|
llgc %r1,0($out,$inp)
|
|
stc %r0,0($out,$inp)
|
|
stc %r1,16($out,$inp)
|
|
la $inp,1($inp)
|
|
brct $len,.Lxts_dec_steal
|
|
la $out,0($i3) # restore real $out
|
|
|
|
lm $s0,$s3,$tweak($sp) # load the 1st tweak
|
|
x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
|
|
x $s1,4($out)
|
|
x $s2,8($out)
|
|
x $s3,12($out)
|
|
st${g} $out,4*$SIZE_T($sp)
|
|
la $key,0($key1)
|
|
bras $ra,_s390x_AES_decrypt
|
|
l${g} $out,4*$SIZE_T($sp)
|
|
x $s0,$tweak+0($sp) # ^=tweak
|
|
x $s1,$tweak+4($sp)
|
|
x $s2,$tweak+8($sp)
|
|
x $s3,$tweak+12($sp)
|
|
st $s0,0($out)
|
|
st $s1,4($out)
|
|
st $s2,8($out)
|
|
st $s3,12($out)
|
|
stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
|
|
stg $sp,$tweak-16+8($sp)
|
|
.Lxts_dec_done:
|
|
stg $sp,$tweak+0($sp) # wipe tweak, first 8 bytes
|
|
stg $sp,$tweak+8($sp) # wipe tweak, second 8 bytes
|
|
lm${g} %r6,$ra,6*$SIZE_T($sp) # reload the registers saved at entry
|
|
br $ra
|
|
.size AES_xts_decrypt,.-AES_xts_decrypt
|
|
___
|
|
}
|
|
# Append the identification string to the generated assembly.
$code.=<<___;
|
|
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
|
___
|
|
|
|
# Replace every `...` span in the accumulated text with the result of
# evaluating its contents as Perl, which folds the offset arithmetic
# written between backticks into plain numbers.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
|
print $code;
|
|
# Closing STDOUT forces the flush.  Buffered write errors only surface
# here, so abort instead of silently leaving a truncated output file.
close STDOUT or die "error closing STDOUT: $!"; # force flush
|