openssl/crypto/sha/asm/keccak1600-c64x.pl

883 lines
22 KiB
Perl
Raw Normal View History

#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# [ABI- and endian-neutral] Keccak-1600 for C64x.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c)
# with bit interleaving. 64-bit values are simply split between the A- and
# B-files, with the A-file holding the least significant halves. This works
# out perfectly, because all operations, including cross-communications
# [in rotate operations], are always complementary. Performance is
# [incredible for a 32-bit processor] 10.9 cycles per processed byte
# for r=1088, which corresponds to SHA3-256. This is >15x faster than
# compiler-generated KECCAK_1X_ALT code, and >10x faster than other variants.
# On average the processor ends up issuing ~4.5 instructions per cycle...
# Register allocation for the 5x5 state.  $A[$y][$x] is a register number;
# the generated code uses it with both an "A" and a "B" prefix, so every
# lane occupies the pair A<n>/B<n>.  Rows start at 5, 10, 16, 21 and 26.
my @A = map { my $base = $_; [ map { $base + $_ } 0 .. 4 ] } (5, 10, 16, 21, 26);

# B14 is reserved, A14 is used as iota[], so lane [1][4] moves to 31.
$A[1][4] = 31;

# Exchange the registers of lanes [3][0] and [4][1]: afterwards lanes
# [3][0] and [4][0] sit in registers 27 and 26, which the generated code
# stores and reloads together as one register pair (STDW/LDDW).
($A[4][1], $A[3][0]) = ($A[3][0], $A[4][1]);

# Temporaries: registers 0..4 hold C[0..4]; C[5] and C[6] reuse the
# registers of lanes [3][0] and [4][0] while those are offloaded.
my @C = (0 .. 4);
push @C, $A[3][0], $A[4][0];

# Register that walks the iotas table.
my $iotas = 'A14';

# Rho rotation amounts, indexed [$y][$x].
my @rhotates = (
    [  0,  1, 62, 28, 27 ],
    [ 36, 44,  6, 55, 20 ],
    [  3, 10, 43, 25, 39 ],
    [ 41, 45, 15, 21,  8 ],
    [ 18,  2, 61, 56, 14 ],
);
# ROL64($src, $rot, $dst, $p)
#
# Append to $code the pair of ROTL instructions that rotate the 64-bit
# lane kept in A$src/B$src left by $rot bits into A$dst/B$dst.
#
#   $src, $dst  register numbers (used with both A and B prefixes)
#   $rot        rotation amount; it is emitted as the expression "$rot/2"
#               (or "$rot/2+1") and left for the assembler to evaluate
#   $p          optional prefix for the first instruction, e.g. "||";
#               when omitted the first line begins with whitespace
#
# For an odd $rot the two halves trade register files and the half coming
# from the B-file is rotated by one position more; for an even $rot each
# half stays in its own file.
sub ROL64 {
    my ($src, $rot, $dst, $p) = @_;

    my @pair = ($rot & 1)
        ? ("$p ROTL B$src,$rot/2+1,A$dst", "|| ROTL A$src,$rot/2, B$dst")
        : ("$p ROTL A$src,$rot/2,A$dst",   "|| ROTL B$src,$rot/2,B$dst");

    $code .= join("\n", @pair) . "\n";
}
########################################################################
# Stack frame layout
#
# SP--->+------+------+
#       |      |      |
# +1--->+------+------+<- -9   below 4 slots are used by KeccakF1600_int
#       |      |      |
# +2--->+------+------+<- -8
#       |      |      |
# +3--->+------+------+<- -7
#       | A2   | A3   |        A3:A2 are preserved by KeccakF1600_int
# +4--->+------+------+<- -6
#       | B2   | B3   |        B3:B2 are preserved by KeccakF1600_int
# +5--->+------+------+<- -5   below is ABI-compliant layout
#       | A10  | A11  |
# +6--->+------+------+<- -4
#       | A12  | A13  |
# +7--->+------+------+<- -3
#       | A14  | B3   |
# +8--->+------+------+<- -2
#       | B10  | B11  |
# +9--->+------+------+<- -1
#       | B12  | B13  |
#       +------+------+<---FP
#       | A15  |
#       +------+--
# Assembler preamble, entry of _KeccakF1600_int and the Theta step.
#
# Register numbers are interpolated from @A (state lanes) and @C
# (temporaries) and used with both an A and a B prefix; $iotas names the
# register that points into the iotas table.  RA, FP and SP are assembler
# aliases (.asg) for B3, A15 and B15.
#
# _KeccakF1600_int starts by storing A3:A2 and B3:B2 into the frame slots
# shown in the stack frame layout above.  _KeccakF1600_cheat is a second
# entry point placed right after those two stores.  $iotas is then set up
# PC-relatively: ADDKPC yields a code address in B0, MVKL/MVKH the offset of
# the table, and ADD combines them.
#
# Theta: the five lanes of each column are XORed into C[0..4]; lanes [3][0]
# and [4][0] are offloaded to the stack meanwhile so that their registers
# can serve as C[5]/C[6], and are restored further down.  Each column x is
# then XORed with C[x-1] ^ ROL64(C[x+1],1); column 0 is completed by the
# heredocs that follow this one.
#
# NOTE: this heredoc ends in the middle of an execute packet; the ROL64 call
# right after it continues the packet by passing "||" as prefix.
$code.=<<___;
.text
.if .ASSEMBLER_VERSION<7000000
.asg 0,__TI_EABI__
.endif
.if __TI_EABI__
.nocmp
.asg KeccakF1600,_KeccakF1600
.asg SHA3_absorb,_SHA3_absorb
.asg SHA3_squeeze,_SHA3_squeeze
.endif
.asg B3,RA
.asg A15,FP
.asg B15,SP
.align 32
_KeccakF1600_int:
.asmfunc
STDW A3:A2,*FP[-7]
|| STDW B3:B2,*SP[4]
_KeccakF1600_cheat:
.if __TI_EABI__
ADDKPC _KeccakF1600_int,B0
|| MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas
.else
ADDKPC _KeccakF1600_int,B0
|| MVKL (iotas-_KeccakF1600_int),$iotas
MVKH (iotas-_KeccakF1600_int),$iotas
.endif
ADD B0,$iotas,$iotas
loop?:
XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta
|| XOR B$A[0][2],B$A[1][2],B$C[2]
|| XOR A$A[0][3],A$A[1][3],A$C[3]
|| XOR B$A[0][3],B$A[1][3],B$C[3]
|| XOR A$A[0][0],A$A[1][0],A$C[0]
|| XOR B$A[0][0],B$A[1][0],B$C[0]
XOR A$A[2][2],A$C[2],A$C[2]
|| XOR B$A[2][2],B$C[2],B$C[2]
|| XOR A$A[2][3],A$C[3],A$C[3]
|| XOR B$A[2][3],B$C[3],B$C[3]
|| XOR A$A[2][0],A$C[0],A$C[0]
|| XOR B$A[2][0],B$C[0],B$C[0]
XOR A$A[3][2],A$C[2],A$C[2]
|| XOR B$A[3][2],B$C[2],B$C[2]
|| XOR A$A[3][3],A$C[3],A$C[3]
|| XOR B$A[3][3],B$C[3],B$C[3]
|| XOR A$A[3][0],A$C[0],A$C[0]
|| XOR B$A[3][0],B$C[0],B$C[0]
XOR A$A[4][2],A$C[2],A$C[2]
|| XOR B$A[4][2],B$C[2],B$C[2]
|| XOR A$A[4][3],A$C[3],A$C[3]
|| XOR B$A[4][3],B$C[3],B$C[3]
|| XOR A$A[4][0],A$C[0],A$C[0]
|| XOR B$A[4][0],B$C[0],B$C[0]
XOR A$A[0][4],A$A[1][4],A$C[4]
|| XOR B$A[0][4],B$A[1][4],B$C[4]
|| XOR A$A[0][1],A$A[1][1],A$C[1]
|| XOR B$A[0][1],B$A[1][1],B$C[1]
|| STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data
STDW B$A[3][0]:B$A[4][0],*SP[2]
|| XOR A$A[2][4],A$C[4],A$C[4]
|| XOR B$A[2][4],B$C[4],B$C[4]
|| XOR A$A[2][1],A$C[1],A$C[1]
|| XOR B$A[2][1],B$C[1],B$C[1]
|| ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1)
|| ROTL A$C[2],0,B$C[5]
XOR A$A[3][4],A$C[4],A$C[4]
|| XOR B$A[3][4],B$C[4],B$C[4]
|| XOR A$A[3][1],A$C[1],A$C[1]
|| XOR B$A[3][1],B$C[1],B$C[1]
|| ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1)
|| ROTL A$C[3],0,B$C[6]
XOR A$A[4][4],A$C[4],A$C[4]
|| XOR B$A[4][4],B$C[4],B$C[4]
|| XOR A$A[4][1],A$C[1],A$C[1]
|| XOR B$A[4][1],B$C[1],B$C[1]
|| XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1)
|| XOR B$C[0],B$C[5],B$C[5]
XOR A$C[5],A$A[0][1],A$A[0][1]
|| XOR B$C[5],B$A[0][1],B$A[0][1]
|| XOR A$C[5],A$A[1][1],A$A[1][1]
|| XOR B$C[5],B$A[1][1],B$A[1][1]
|| XOR A$C[5],A$A[2][1],A$A[2][1]
|| XOR B$C[5],B$A[2][1],B$A[2][1]
XOR A$C[5],A$A[3][1],A$A[3][1]
|| XOR B$C[5],B$A[3][1],B$A[3][1]
|| XOR A$C[5],A$A[4][1],A$A[4][1]
|| XOR B$C[5],B$A[4][1],B$A[4][1]
|| ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1)
|| ROTL A$C[4],0,B$C[5]
|| XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1)
|| XOR B$C[1],B$C[6],B$C[6]
XOR A$C[6],A$A[0][2],A$A[0][2]
|| XOR B$C[6],B$A[0][2],B$A[0][2]
|| XOR A$C[6],A$A[1][2],A$A[1][2]
|| XOR B$C[6],B$A[1][2],B$A[1][2]
|| XOR A$C[6],A$A[2][2],A$A[2][2]
|| XOR B$C[6],B$A[2][2],B$A[2][2]
|| ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1)
|| ROTL A$C[1],0,B$C[1]
XOR A$C[6],A$A[3][2],A$A[3][2]
|| XOR B$C[6],B$A[3][2],B$A[3][2]
|| XOR A$C[6],A$A[4][2],A$A[4][2]
|| XOR B$C[6],B$A[4][2],B$A[4][2]
|| ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1)
|| ROTL A$C[0],0,B$C[6]
|| XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1)
|| XOR B$C[5],B$C[2],B$C[2]
XOR A$C[2],A$A[0][3],A$A[0][3]
|| XOR B$C[2],B$A[0][3],B$A[0][3]
|| XOR A$C[2],A$A[1][3],A$A[1][3]
|| XOR B$C[2],B$A[1][3],B$A[1][3]
|| XOR A$C[2],A$A[2][3],A$A[2][3]
|| XOR B$C[2],B$A[2][3],B$A[2][3]
XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1)
|| XOR B$C[6],B$C[3],B$C[3]
|| LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data
|| LDDW *SP[2],B$A[3][0]:B$A[4][0]
|| XOR A$C[2],A$A[3][3],A$A[3][3]
|| XOR B$C[2],B$A[3][3],B$A[3][3]
XOR A$C[2],A$A[4][3],A$A[4][3]
|| XOR B$C[2],B$A[4][3],B$A[4][3]
|| XOR A$C[3],A$A[0][4],A$A[0][4]
|| XOR B$C[3],B$A[0][4],B$A[0][4]
|| XOR A$C[3],A$A[1][4],A$A[1][4]
|| XOR B$C[3],B$A[1][4],B$A[1][4]
XOR A$C[3],A$A[2][4],A$A[2][4]
|| XOR B$C[3],B$A[2][4],B$A[2][4]
|| XOR A$C[3],A$A[3][4],A$A[3][4]
|| XOR B$C[3],B$A[3][4],B$A[3][4]
|| XOR A$C[3],A$A[4][4],A$A[4][4]
|| XOR B$C[3],B$A[4][4],B$A[4][4]
XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1)
|| XOR B$C[1],B$C[4],B$C[4]
|| MV A$A[0][1],A$C[1] ; Rho+Pi, "early start"
|| MV B$A[0][1],B$C[1]
___
# Rho+Pi, overlapped with the end of Theta.
#
# Each ROL64(src, rot, dst, prefix) call emits the rotate pair that turns
# lane src, rotated by its rhotates[] amount, into lane dst.  Lanes
# A[0][1..4] are copied to C[1..4] with MV before their registers are
# overwritten (the MV for A[0][1] is in the preceding heredoc).
#
# The first four calls pass "||", so their rotates are appended to the
# packet opened by the heredoc in front of them; those heredocs also XOR
# C[4] into column 0, which completes Theta.
&ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||");
$code.=<<___;
XOR A$C[4],A$A[0][0],A$A[0][0]
|| XOR B$C[4],B$A[0][0],B$A[0][0]
|| XOR A$C[4],A$A[1][0],A$A[1][0]
|| XOR B$C[4],B$A[1][0],B$A[1][0]
|| MV A$A[0][3],A$C[3]
|| MV B$A[0][3],B$C[3]
___
&ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||");
$code.=<<___;
XOR A$C[4],A$A[2][0],A$A[2][0]
|| XOR B$C[4],B$A[2][0],B$A[2][0]
|| XOR A$C[4],A$A[3][0],A$A[3][0]
|| XOR B$C[4],B$A[3][0],B$A[3][0]
|| MV A$A[0][2],A$C[2]
|| MV B$A[0][2],B$C[2]
___
&ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||");
$code.=<<___;
XOR A$C[4],A$A[4][0],A$A[4][0]
|| XOR B$C[4],B$A[4][0],B$A[4][0]
|| MV A$A[0][4],A$C[4]
|| MV B$A[0][4],B$C[4]
___
&ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||");
# From here on the calls pass no prefix, so every rotate pair begins a new
# line of its own.  The destination of each call is a lane whose previous
# value was already consumed as the source of an earlier call.
&ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]);
# Fetch the A-half of the next iotas entry into C[0] and step $iotas
# forward ("++[2]"); the line continues the rotate pair emitted just above.
$code.=<<___;
|| LDW *${iotas}++[2],A$C[0]
___
&ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]);
# B-half of the same iotas entry, addressed relative to the already
# advanced pointer ("[-1]").
$code.=<<___;
|| LDW *${iotas}[-1],B$C[0]
___
&ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]);
&ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]);
&ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]);
&ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]);
&ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]);
&ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]);
&ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]);
&ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]);
&ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]);
&ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]);
&ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]);
&ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]);
&ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]);
&ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]);
# The remaining sources are the copies of row 0 kept in C[1..4]; the C[3]
# rotate is issued later, after the first Chi instructions.
#&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below
&ROL64 ($C[1], $rhotates[0][1],$A[2][0]);
&ROL64 ($C[4], $rhotates[0][4],$A[3][0]);
&ROL64 ($C[2], $rhotates[0][2],$A[4][0]);
# Chi+Iota begins.  These six ANDN start with "||", i.e. they are appended
# to the rotate pair emitted by the last ROL64 call before this heredoc.
$code.=<<___;
|| ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota
|| ANDN B$A[0][2],B$A[0][1],B$C[4]
|| ANDN A$A[0][3],A$A[0][2],A$C[1]
|| ANDN B$A[0][3],B$A[0][2],B$C[1]
|| ANDN A$A[0][4],A$A[0][3],A$C[2]
|| ANDN B$A[0][4],B$A[0][3],B$C[2]
___
# The Rho+Pi rotate of C[3] (the saved A[0][3]) into A[1][0], postponed from
# the sequence above; emitted without prefix, so it begins a new line.
&ROL64 ($C[3], $rhotates[0][3],$A[1][0]);
# One long heredoc holding, in order:
#
#  * the rest of Chi for rows 0..4 (ANDN results in C[1..4], then XORed
#    into the row), with the round constant loaded into C[0] XORed into
#    A[0][0];
#  * loop control: A0 is derived from the $iotas pointer with EXTU
#    (presumably its low 8 bits - the iotas table is placed 64 bytes past a
#    256-byte boundary and holds 24 eight-byte entries; confirm against the
#    ISA manual).  "[A0] BNOP loop?" is issued ahead of the last Chi
#    packets and the ";;=====" marker shows where it takes effect.  When A0
#    is zero, A3:A2 and RA:B2 are reloaded from the frame instead and
#    "BNOP RA,5" returns;
#  * _KeccakF1600: builds an 80-byte frame, saves B10-B13, A10-A14 and RA,
#    loads the 25 lanes through A2 (A-halves) and B2 = A4+4 (B-halves),
#    branches to _KeccakF1600_int with RA set to ret?, then stores the
#    lanes back, restores the saved registers and returns the original A4;
#  * the prologue of _SHA3_absorb: same frame and register saves, A4 kept
#    at SP[1], arguments B4/A6/B6 moved to INP/LEN/BSZ, state loaded
#    through A4/B4, RA set to loop? and stored together with BSZ at SP[4].
#    At loop? LEN is compared with BSZ ("len < bsz?") and the code branches
#    to ret? when it is smaller; otherwise BSZ is converted to a count of
#    8-byte words and the first input word pair is loaded into A1:A0.
#
# The per-lane absorb code that follows is produced by the Perl loop after
# this heredoc.
$code.=<<___;
|| ANDN A$A[0][0],A$A[0][4],A$C[3]
|| ANDN B$A[0][0],B$A[0][4],B$C[3]
|| XOR A$C[4],A$A[0][0],A$A[0][0]
|| XOR B$C[4],B$A[0][0],B$A[0][0]
|| ANDN A$A[0][1],A$A[0][0],A$C[4]
|| ANDN B$A[0][1],B$A[0][0],B$C[4]
XOR A$C[1],A$A[0][1],A$A[0][1]
|| XOR B$C[1],B$A[0][1],B$A[0][1]
|| XOR A$C[2],A$A[0][2],A$A[0][2]
|| XOR B$C[2],B$A[0][2],B$A[0][2]
|| XOR A$C[3],A$A[0][3],A$A[0][3]
|| XOR B$C[3],B$A[0][3],B$A[0][3]
XOR A$C[4],A$A[0][4],A$A[0][4]
|| XOR B$C[4],B$A[0][4],B$A[0][4]
|| XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++];
|| XOR B$C[0],B$A[0][0],B$A[0][0]
|| EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done?
ANDN A$A[1][2],A$A[1][1],A$C[4]
|| ANDN B$A[1][2],B$A[1][1],B$C[4]
|| ANDN A$A[1][3],A$A[1][2],A$C[1]
|| ANDN B$A[1][3],B$A[1][2],B$C[1]
|| ANDN A$A[1][4],A$A[1][3],A$C[2]
|| ANDN B$A[1][4],B$A[1][3],B$C[2]
ANDN A$A[1][0],A$A[1][4],A$C[3]
|| ANDN B$A[1][0],B$A[1][4],B$C[3]
|| XOR A$C[4],A$A[1][0],A$A[1][0]
|| XOR B$C[4],B$A[1][0],B$A[1][0]
|| ANDN A$A[1][1],A$A[1][0],A$C[4]
|| ANDN B$A[1][1],B$A[1][0],B$C[4]
XOR A$C[1],A$A[1][1],A$A[1][1]
|| XOR B$C[1],B$A[1][1],B$A[1][1]
|| XOR A$C[2],A$A[1][2],A$A[1][2]
|| XOR B$C[2],B$A[1][2],B$A[1][2]
|| XOR A$C[3],A$A[1][3],A$A[1][3]
|| XOR B$C[3],B$A[1][3],B$A[1][3]
XOR A$C[4],A$A[1][4],A$A[1][4]
|| XOR B$C[4],B$A[1][4],B$A[1][4]
|| ANDN A$A[2][2],A$A[2][1],A$C[4]
|| ANDN B$A[2][2],B$A[2][1],B$C[4]
|| ANDN A$A[2][3],A$A[2][2],A$C[1]
|| ANDN B$A[2][3],B$A[2][2],B$C[1]
ANDN A$A[2][4],A$A[2][3],A$C[2]
|| ANDN B$A[2][4],B$A[2][3],B$C[2]
|| ANDN A$A[2][0],A$A[2][4],A$C[3]
|| ANDN B$A[2][0],B$A[2][4],B$C[3]
|| XOR A$C[4],A$A[2][0],A$A[2][0]
|| XOR B$C[4],B$A[2][0],B$A[2][0]
ANDN A$A[2][1],A$A[2][0],A$C[4]
|| ANDN B$A[2][1],B$A[2][0],B$C[4]
|| XOR A$C[1],A$A[2][1],A$A[2][1]
|| XOR B$C[1],B$A[2][1],B$A[2][1]
|| XOR A$C[2],A$A[2][2],A$A[2][2]
|| XOR B$C[2],B$A[2][2],B$A[2][2]
XOR A$C[3],A$A[2][3],A$A[2][3]
|| XOR B$C[3],B$A[2][3],B$A[2][3]
|| XOR A$C[4],A$A[2][4],A$A[2][4]
|| XOR B$C[4],B$A[2][4],B$A[2][4]
ANDN A$A[3][2],A$A[3][1],A$C[4]
|| ANDN B$A[3][2],B$A[3][1],B$C[4]
|| ANDN A$A[3][3],A$A[3][2],A$C[1]
|| ANDN B$A[3][3],B$A[3][2],B$C[1]
|| ANDN A$A[3][4],A$A[3][3],A$C[2]
|| ANDN B$A[3][4],B$A[3][3],B$C[2]
ANDN A$A[3][0],A$A[3][4],A$C[3]
|| ANDN B$A[3][0],B$A[3][4],B$C[3]
|| XOR A$C[4],A$A[3][0],A$A[3][0]
|| XOR B$C[4],B$A[3][0],B$A[3][0]
|| ANDN A$A[3][1],A$A[3][0],A$C[4]
|| ANDN B$A[3][1],B$A[3][0],B$C[4]
XOR A$C[1],A$A[3][1],A$A[3][1]
|| XOR B$C[1],B$A[3][1],B$A[3][1]
|| XOR A$C[2],A$A[3][2],A$A[3][2]
|| XOR B$C[2],B$A[3][2],B$A[3][2]
|| XOR A$C[3],A$A[3][3],A$A[3][3]
||[A0] BNOP loop?
XOR B$C[3],B$A[3][3],B$A[3][3]
|| XOR A$C[4],A$A[3][4],A$A[3][4]
|| XOR B$C[4],B$A[3][4],B$A[3][4]
||[!A0] LDDW *FP[-7],A3:A2
||[!A0] LDDW *SP[4], RA:B2
ANDN A$A[4][2],A$A[4][1],A$C[4]
|| ANDN B$A[4][2],B$A[4][1],B$C[4]
|| ANDN A$A[4][3],A$A[4][2],A$C[1]
|| ANDN B$A[4][3],B$A[4][2],B$C[1]
|| ANDN A$A[4][4],A$A[4][3],A$C[2]
|| ANDN B$A[4][4],B$A[4][3],B$C[2]
ANDN A$A[4][0],A$A[4][4],A$C[3]
|| ANDN B$A[4][0],B$A[4][4],B$C[3]
|| XOR A$C[4],A$A[4][0],A$A[4][0]
|| XOR B$C[4],B$A[4][0],B$A[4][0]
|| ANDN A$A[4][1],A$A[4][0],A$C[4]
|| ANDN B$A[4][1],B$A[4][0],B$C[4]
XOR A$C[1],A$A[4][1],A$A[4][1]
|| XOR B$C[1],B$A[4][1],B$A[4][1]
|| XOR A$C[2],A$A[4][2],A$A[4][2]
|| XOR B$C[2],B$A[4][2],B$A[4][2]
|| XOR A$C[3],A$A[4][3],A$A[4][3]
|| XOR B$C[3],B$A[4][3],B$A[4][3]
XOR A$C[4],A$A[4][4],A$A[4][4]
|| XOR B$C[4],B$A[4][4],B$A[4][4]
;;===== branch to loop? is taken here
BNOP RA,5
.endasmfunc
.newblock
.global _KeccakF1600
.align 32
_KeccakF1600:
.asmfunc stack_usage(80)
STW FP,*SP--(80) ; save frame pointer
|| MV SP,FP
STDW B13:B12,*SP[9]
|| STDW A13:A12,*FP[-4]
STDW B11:B10,*SP[8]
|| STDW A11:A10,*FP[-5]
STW RA, *SP[15]
|| STW A14,*FP[-6]
|| MV A4,A2
|| ADD 4,A4,B2
LDW *A2++[2],A$A[0][0] ; load A[5][5]
|| LDW *B2++[2],B$A[0][0]
LDW *A2++[2],A$A[0][1]
|| LDW *B2++[2],B$A[0][1]
LDW *A2++[2],A$A[0][2]
|| LDW *B2++[2],B$A[0][2]
LDW *A2++[2],A$A[0][3]
|| LDW *B2++[2],B$A[0][3]
LDW *A2++[2],A$A[0][4]
|| LDW *B2++[2],B$A[0][4]
LDW *A2++[2],A$A[1][0]
|| LDW *B2++[2],B$A[1][0]
LDW *A2++[2],A$A[1][1]
|| LDW *B2++[2],B$A[1][1]
LDW *A2++[2],A$A[1][2]
|| LDW *B2++[2],B$A[1][2]
LDW *A2++[2],A$A[1][3]
|| LDW *B2++[2],B$A[1][3]
LDW *A2++[2],A$A[1][4]
|| LDW *B2++[2],B$A[1][4]
LDW *A2++[2],A$A[2][0]
|| LDW *B2++[2],B$A[2][0]
LDW *A2++[2],A$A[2][1]
|| LDW *B2++[2],B$A[2][1]
LDW *A2++[2],A$A[2][2]
|| LDW *B2++[2],B$A[2][2]
LDW *A2++[2],A$A[2][3]
|| LDW *B2++[2],B$A[2][3]
LDW *A2++[2],A$A[2][4]
|| LDW *B2++[2],B$A[2][4]
LDW *A2++[2],A$A[3][0]
|| LDW *B2++[2],B$A[3][0]
LDW *A2++[2],A$A[3][1]
|| LDW *B2++[2],B$A[3][1]
LDW *A2++[2],A$A[3][2]
|| LDW *B2++[2],B$A[3][2]
LDW *A2++[2],A$A[3][3]
|| LDW *B2++[2],B$A[3][3]
LDW *A2++[2],A$A[3][4]
|| LDW *B2++[2],B$A[3][4]
|| BNOP _KeccakF1600_int
ADDKPC ret?,RA
|| LDW *A2++[2],A$A[4][0]
|| LDW *B2++[2],B$A[4][0]
LDW *A2++[2],A$A[4][1]
|| LDW *B2++[2],B$A[4][1]
LDW *A2++[2],A$A[4][2]
|| LDW *B2++[2],B$A[4][2]
LDW *A2++[2],A$A[4][3]
|| LDW *B2++[2],B$A[4][3]
LDW *A2,A$A[4][4]
|| LDW *B2,B$A[4][4]
|| ADDK -192,A2 ; rewind
|| ADDK -192,B2
.align 16
ret?:
STW A$A[0][0],*A2++[2] ; store A[5][5]
|| STW B$A[0][0],*B2++[2]
STW A$A[0][1],*A2++[2]
|| STW B$A[0][1],*B2++[2]
STW A$A[0][2],*A2++[2]
|| STW B$A[0][2],*B2++[2]
STW A$A[0][3],*A2++[2]
|| STW B$A[0][3],*B2++[2]
STW A$A[0][4],*A2++[2]
|| STW B$A[0][4],*B2++[2]
STW A$A[1][0],*A2++[2]
|| STW B$A[1][0],*B2++[2]
STW A$A[1][1],*A2++[2]
|| STW B$A[1][1],*B2++[2]
STW A$A[1][2],*A2++[2]
|| STW B$A[1][2],*B2++[2]
STW A$A[1][3],*A2++[2]
|| STW B$A[1][3],*B2++[2]
STW A$A[1][4],*A2++[2]
|| STW B$A[1][4],*B2++[2]
STW A$A[2][0],*A2++[2]
|| STW B$A[2][0],*B2++[2]
STW A$A[2][1],*A2++[2]
|| STW B$A[2][1],*B2++[2]
STW A$A[2][2],*A2++[2]
|| STW B$A[2][2],*B2++[2]
STW A$A[2][3],*A2++[2]
|| STW B$A[2][3],*B2++[2]
STW A$A[2][4],*A2++[2]
|| STW B$A[2][4],*B2++[2]
STW A$A[3][0],*A2++[2]
|| STW B$A[3][0],*B2++[2]
STW A$A[3][1],*A2++[2]
|| STW B$A[3][1],*B2++[2]
STW A$A[3][2],*A2++[2]
|| STW B$A[3][2],*B2++[2]
STW A$A[3][3],*A2++[2]
|| STW B$A[3][3],*B2++[2]
STW A$A[3][4],*A2++[2]
|| STW B$A[3][4],*B2++[2]
LDW *SP[15],RA
|| LDW *FP[-6],A14
STW A$A[4][0],*A2++[2]
|| STW B$A[4][0],*B2++[2]
STW A$A[4][1],*A2++[2]
|| STW B$A[4][1],*B2++[2]
STW A$A[4][2],*A2++[2]
|| STW B$A[4][2],*B2++[2]
STW A$A[4][3],*A2++[2]
|| STW B$A[4][3],*B2++[2]
STW A$A[4][4],*A2
|| STW B$A[4][4],*B2
|| ADDK -192,A2 ; rewind
MV A2,A4 ; return original A4
|| LDDW *SP[8], B11:B10
|| LDDW *FP[-5],A11:A10
LDDW *SP[9], B13:B12
|| LDDW *FP[-4],A13:A12
|| BNOP RA
LDW *++SP(80),FP ; restore frame pointer
NOP 4 ; wait till FP is committed
.endasmfunc
.newblock
.asg B2,BSZ
.asg A2,INP
.asg A3,LEN
.global _SHA3_absorb
.align 32
_SHA3_absorb:
.asmfunc stack_usage(80)
STW FP,*SP--(80) ; save frame pointer
|| MV SP,FP
STDW B13:B12,*SP[9]
|| STDW A13:A12,*FP[-4]
STDW B11:B10,*SP[8]
|| STDW A11:A10,*FP[-5]
STW RA, *SP[15]
|| STW A14,*FP[-6]
STW A4,*SP[1] ; save A[][]
|| MV B4,INP ; reassign arguments
|| MV A6,LEN
|| MV B6,BSZ
|| ADD 4,A4,B4
LDW *A4++[2],A$A[0][0] ; load A[5][5]
|| LDW *B4++[2],B$A[0][0]
LDW *A4++[2],A$A[0][1]
|| LDW *B4++[2],B$A[0][1]
LDW *A4++[2],A$A[0][2]
|| LDW *B4++[2],B$A[0][2]
LDW *A4++[2],A$A[0][3]
|| LDW *B4++[2],B$A[0][3]
LDW *A4++[2],A$A[0][4]
|| LDW *B4++[2],B$A[0][4]
LDW *A4++[2],A$A[1][0]
|| LDW *B4++[2],B$A[1][0]
LDW *A4++[2],A$A[1][1]
|| LDW *B4++[2],B$A[1][1]
LDW *A4++[2],A$A[1][2]
|| LDW *B4++[2],B$A[1][2]
LDW *A4++[2],A$A[1][3]
|| LDW *B4++[2],B$A[1][3]
LDW *A4++[2],A$A[1][4]
|| LDW *B4++[2],B$A[1][4]
LDW *A4++[2],A$A[2][0]
|| LDW *B4++[2],B$A[2][0]
LDW *A4++[2],A$A[2][1]
|| LDW *B4++[2],B$A[2][1]
LDW *A4++[2],A$A[2][2]
|| LDW *B4++[2],B$A[2][2]
LDW *A4++[2],A$A[2][3]
|| LDW *B4++[2],B$A[2][3]
LDW *A4++[2],A$A[2][4]
|| LDW *B4++[2],B$A[2][4]
LDW *A4++[2],A$A[3][0]
|| LDW *B4++[2],B$A[3][0]
LDW *A4++[2],A$A[3][1]
|| LDW *B4++[2],B$A[3][1]
LDW *A4++[2],A$A[3][2]
|| LDW *B4++[2],B$A[3][2]
LDW *A4++[2],A$A[3][3]
|| LDW *B4++[2],B$A[3][3]
LDW *A4++[2],A$A[3][4]
|| LDW *B4++[2],B$A[3][4]
LDW *A4++[2],A$A[4][0]
|| LDW *B4++[2],B$A[4][0]
LDW *A4++[2],A$A[4][1]
|| LDW *B4++[2],B$A[4][1]
LDW *A4++[2],A$A[4][2]
|| LDW *B4++[2],B$A[4][2]
LDW *A4++[2],A$A[4][3]
|| LDW *B4++[2],B$A[4][3]
LDW *A4,A$A[4][4]
|| LDW *B4,B$A[4][4]
|| ADDKPC loop?,RA
STDW RA:BSZ,*SP[4]
loop?:
CMPLTU LEN,BSZ,A0 ; len < bsz?
|| SHRU BSZ,3,BSZ
[A0] BNOP ret?
||[A0] ZERO BSZ
||[A0] LDW *SP[1],A2 ; pull A[][]
[BSZ] LDNDW *INP++,A1:A0
||[BSZ] SUB LEN,8,LEN
||[BSZ] SUB BSZ,1,BSZ
NOP 4
___
# Emit the absorb code for state lanes 0..23 in row-major order, i.e.
# A[0][0] .. A[4][3]; the last lane, A[4][4], is handled separately by the
# heredoc that follows this loop.
#
# For each lane the generated code:
#   - byte-swaps the input pair A1:A0 on big-endian targets,
#   - while BSZ (remaining 8-byte words of the block) is non-zero, loads
#     the next input pair and decrements LEN and BSZ,
#   - once BSZ is zero, stores LEN:INP at SP[3] and branches to
#     _KeccakF1600_cheat instead,
#   - converts the pair with DEAL and PACK2/PACKH2 and XORs the two
#     resulting words into the A- and B-registers of lane [$y][$x].
#
# The lane coordinates are lexical, so nothing leaks into package main and
# the loop stays valid under "use strict".
for my $lane (0 .. 23) {
    my ($y, $x) = (int($lane / 5), $lane % 5);
$code.=<<___;
.if .BIG_ENDIAN
SWAP2 A0,A1
|| SWAP2 A1,A0
SWAP4 A0,A0
SWAP4 A1,A1
||[!BSZ]BNOP _KeccakF1600_cheat
||[!BSZ]STDW LEN:INP,*SP[3]
|| DEAL A0,A0
.else
[!BSZ]BNOP _KeccakF1600_cheat
||[!BSZ]STDW LEN:INP,*SP[3]
|| DEAL A0,A0
.endif
[BSZ] LDNDW *INP++,A1:A0
|| DEAL A1,A1
[BSZ] SUB LEN,8,LEN
||[BSZ] SUB BSZ,1,BSZ
PACK2 A1,A0,A0
|| PACKH2 A1,A0,A1
XOR A0,A$A[$y][$x],A$A[$y][$x]
XOR A1,B$A[$y][$x],B$A[$y][$x]
___
}
# Final heredoc.  It holds, in order:
#
#  * the absorb code for the last lane, A[4][4]: same byte-swap/DEAL/PACK
#    sequence as the lanes generated by the loop above, but the store of
#    LEN:INP and the branch to _KeccakF1600_cheat are unconditional and no
#    further input is loaded;
#  * ret? of _SHA3_absorb: LEN is moved to A4 as the return value, the
#    state is stored back through A2 (pointer reloaded from SP[1] before
#    the branch here) and B2 = A2+4, and the saved registers and frame
#    pointer are restored around "BNOP RA";
#  * _SHA3_squeeze: 24-byte frame saving RA, A14 and A13:A12; arguments
#    B4/A6/B6 become OUT/LEN/BSZ.  block? emits one lane (8 bytes) per
#    iteration with STNW, leaves through ret? when LEN reaches zero, loops
#    while words of the block remain (A1), and otherwise calls _KeccakF1600
#    with RA pointing back at loop?.  tail? handles a remainder shorter
#    than 8 bytes with byte stores;
#  * the iotas table: placed in its own section, aligned to 256 bytes and
#    preceded by 16 zero words, followed by 24 entries of two 32-bit words
#    each, and the identification string.
$code.=<<___;
.if .BIG_ENDIAN
SWAP2 A0,A1
|| SWAP2 A1,A0
SWAP4 A0,A0
SWAP4 A1,A1
.endif
BNOP _KeccakF1600_cheat
|| STDW LEN:INP,*SP[3]
|| DEAL A0,A0
DEAL A1,A1
NOP
PACK2 A1,A0,A0
|| PACKH2 A1,A0,A1
XOR A0,A$A[4][4],A$A[4][4]
XOR A1,B$A[4][4],B$A[4][4]
.align 16
ret?:
MV LEN,A4 ; return value
|| ADD 4,A2,B2
STW A$A[0][0],*A2++[2] ; store A[5][5]
|| STW B$A[0][0],*B2++[2]
STW A$A[0][1],*A2++[2]
|| STW B$A[0][1],*B2++[2]
STW A$A[0][2],*A2++[2]
|| STW B$A[0][2],*B2++[2]
STW A$A[0][3],*A2++[2]
|| STW B$A[0][3],*B2++[2]
STW A$A[0][4],*A2++[2]
|| STW B$A[0][4],*B2++[2]
STW A$A[1][0],*A2++[2]
|| STW B$A[1][0],*B2++[2]
STW A$A[1][1],*A2++[2]
|| STW B$A[1][1],*B2++[2]
STW A$A[1][2],*A2++[2]
|| STW B$A[1][2],*B2++[2]
STW A$A[1][3],*A2++[2]
|| STW B$A[1][3],*B2++[2]
STW A$A[1][4],*A2++[2]
|| STW B$A[1][4],*B2++[2]
STW A$A[2][0],*A2++[2]
|| STW B$A[2][0],*B2++[2]
STW A$A[2][1],*A2++[2]
|| STW B$A[2][1],*B2++[2]
STW A$A[2][2],*A2++[2]
|| STW B$A[2][2],*B2++[2]
STW A$A[2][3],*A2++[2]
|| STW B$A[2][3],*B2++[2]
STW A$A[2][4],*A2++[2]
|| STW B$A[2][4],*B2++[2]
LDW *SP[15],RA
|| LDW *FP[-6],A14
STW A$A[3][0],*A2++[2]
|| STW B$A[3][0],*B2++[2]
STW A$A[3][1],*A2++[2]
|| STW B$A[3][1],*B2++[2]
STW A$A[3][2],*A2++[2]
|| STW B$A[3][2],*B2++[2]
STW A$A[3][3],*A2++[2]
|| STW B$A[3][3],*B2++[2]
STW A$A[3][4],*A2++[2]
|| STW B$A[3][4],*B2++[2]
LDDW *SP[8], B11:B10
|| LDDW *FP[-5],A11:A10
LDDW *SP[9], B13:B12
|| LDDW *FP[-4],A13:A12
BNOP RA
|| LDW *++SP(80),FP ; restore frame pointer
STW A$A[4][0],*A2++[2]
|| STW B$A[4][0],*B2++[2]
STW A$A[4][1],*A2++[2]
|| STW B$A[4][1],*B2++[2]
STW A$A[4][2],*A2++[2]
|| STW B$A[4][2],*B2++[2]
STW A$A[4][3],*A2++[2]
|| STW B$A[4][3],*B2++[2]
STW A$A[4][4],*A2++[2]
|| STW B$A[4][4],*B2++[2]
.endasmfunc
.newblock
.global _SHA3_squeeze
.asg A12,OUT
.asg A13,LEN
.asg A14,BSZ
.align 32
_SHA3_squeeze:
.asmfunc stack_usage(24)
STW FP,*SP--(24) ; save frame pointer
|| MV SP,FP
STW RA, *SP[5]
|| STW A14,*FP[-2]
STDW A13:A12,*FP[-2]
|| MV B4,OUT ; reassign arguments
MV A6,LEN
|| MV B6,BSZ
loop?:
LDW *SP[5],RA ; reload RA
|| SHRU BSZ,3,A1
|| MV A4,A8
|| ADD 4,A4,B8
block?:
CMPLTU LEN,8,A0 ; len < 8?
[A0] BNOP tail?
LDW *A8++[2],A9
|| LDW *B8++[2],B9
|| SUB LEN,8,LEN ; len -= 8
MV LEN,A0
|| SUB A1,1,A1 ; bsz--
|| NOP 4
.if .BIG_ENDIAN
SWAP4 A9,A9
|| SWAP4 B9,B9
SWAP2 A9,A9
|| SWAP2 B9,B9
.endif
[!A0] BNOP ret?
||[!A0] ZERO A1
PACK2 B9,A9,B7
||[A1] BNOP block?
PACKH2 B9,A9,B9
|| SHFL B7,B7
SHFL B9,B9
STNW B7,*OUT++
STNW B9,*OUT++
NOP
BNOP _KeccakF1600,4
ADDKPC loop?,RA
.align 16
tail?:
.if .BIG_ENDIAN
SWAP4 A9,A9
|| SWAP4 B9,B9
SWAP2 A9,A9
|| SWAP2 B9,B9
.endif
PACK2 B9,A9,B7
PACKH2 B9,A9,B9
|| SHFL B7,B7
SHFL B9,B9
STB B7,*OUT++
|| SHRU B7,8,B7
|| ADD LEN,7,A0
[A0] STB B7,*OUT++
||[A0] SHRU B7,8,B7
||[A0] SUB A0,1,A0
[A0] STB B7,*OUT++
||[A0] SHRU B7,8,B7
||[A0] SUB A0,1,A0
[A0] STB B7,*OUT++
||[A0] SUB A0,1,A0
[A0] STB B9,*OUT++
||[A0] SHRU B9,8,B9
||[A0] SUB A0,1,A0
[A0] STB B9,*OUT++
||[A0] SHRU B9,8,B9
||[A0] SUB A0,1,A0
[A0] STB B9,*OUT++
ret?:
LDDW *FP[-2],A13:A12
BNOP RA
|| LDW *FP[-2],A14
LDW *++SP(24),FP ; restore frame pointer
NOP 4 ; wait till FP is committed
.endasmfunc
.if __TI_EABI__
.sect ".text:sha_asm.const"
.else
.sect ".const:sha_asm"
.endif
.align 256
.uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
iotas:
.uword 0x00000001, 0x00000000
.uword 0x00000000, 0x00000089
.uword 0x00000000, 0x8000008b
.uword 0x00000000, 0x80008080
.uword 0x00000001, 0x0000008b
.uword 0x00000001, 0x00008000
.uword 0x00000001, 0x80008088
.uword 0x00000001, 0x80000082
.uword 0x00000000, 0x0000000b
.uword 0x00000000, 0x0000000a
.uword 0x00000001, 0x00008082
.uword 0x00000000, 0x00008003
.uword 0x00000001, 0x0000808b
.uword 0x00000001, 0x8000000b
.uword 0x00000001, 0x8000008a
.uword 0x00000001, 0x80000081
.uword 0x00000000, 0x80000081
.uword 0x00000000, 0x80000008
.uword 0x00000000, 0x00000083
.uword 0x00000000, 0x80008003
.uword 0x00000001, 0x80008088
.uword 0x00000000, 0x80000088
.uword 0x00000001, 0x00008000
.uword 0x00000000, 0x80008082
.cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# Write the accumulated assembly text to standard output.
print $code;