#!/usr/bin/env perl # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # [ABI- and endian-neutral] Keccak-1600 for C64x. # # June 2017. # # This is straightforward KECCAK_1X_ALT variant (see sha/keccak1600.c) # with bit interleaving. 64-bit values are simply split between A- and # B-files, with A-file holding least significant halves. This works # out perfectly, because all operations including cross-communications # [in rotate operations] are always complementary. Performance is # [incredible for a 32-bit processor] 10.9 cycles per processed byte # for r=1088, which corresponds to SHA3-256. This is >15x faster than # compiler-generated KECCAK_1X_ALT code, and >10x than other variants. # On average processor ends up issuing ~4.5 instructions per cycle... my @A = map([ $_, ($_+1), ($_+2), ($_+3), ($_+4) ], (5,10,16,21,26)); $A[1][4] = 31; # B14 is reserved, A14 is used as iota[] ($A[3][0],$A[4][1]) = ($A[4][1],$A[3][0]); my @C = (0..4,$A[3][0],$A[4][0]); my $iotas = "A14"; my @rhotates = ([ 0, 1, 62, 28, 27 ], [ 36, 44, 6, 55, 20 ], [ 3, 10, 43, 25, 39 ], [ 41, 45, 15, 21, 8 ], [ 18, 2, 61, 56, 14 ]); sub ROL64 { my ($src,$rot,$dst,$p) = @_; if ($rot&1) { $code.=<<___; $p ROTL B$src,$rot/2+1,A$dst || ROTL A$src,$rot/2, B$dst ___ } else { $code.=<<___; $p ROTL A$src,$rot/2,A$dst || ROTL B$src,$rot/2,B$dst ___ } } ######################################################################## # Stack frame layout # # SP--->+------+------+ # | | | # +1--->+------+------+<- -9 below 4 slots are used by KeccakF1600_int # | | | # +2--->+------+------+<- -8 # | | | # +3--->+------+------+<- -7 # | A2 | A3 | A3:A2 are preserved by KeccakF1600_int # +4--->+------+------+<- -6 # | B2 | B3 | B3:B2 are preserved by KeccakF1600_int # +5--->+------+------+<- -5 below is ABI-compliant layout # | A10 | A11 | # +6--->+------+------+<- -4 # | A12 | A13 | # +7--->+------+------+<- -3 # | A14 | B3 | # +8--->+------+------+<- -2 # | B10 | B11 | # +9--->+------+------+<- -1 # | B12 | B13 | # +------+------+<---FP # | A15 | # +------+-- $code.=<<___; .text .if .ASSEMBLER_VERSION<7000000 .asg 0,__TI_EABI__ .endif .if __TI_EABI__ .nocmp .asg KeccakF1600,_KeccakF1600 .asg SHA3_absorb,_SHA3_absorb .asg SHA3_squeeze,_SHA3_squeeze .endif .asg B3,RA .asg A15,FP .asg B15,SP .align 32 _KeccakF1600_int: .asmfunc STDW A3:A2,*FP[-7] || STDW B3:B2,*SP[4] _KeccakF1600_cheat: .if __TI_EABI__ ADDKPC _KeccakF1600_int,B0 || MVKL \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas MVKH \$PCR_OFFSET(iotas,_KeccakF1600_int),$iotas .else ADDKPC _KeccakF1600_int,B0 || MVKL (iotas-_KeccakF1600_int),$iotas MVKH (iotas-_KeccakF1600_int),$iotas .endif ADD B0,$iotas,$iotas loop?: XOR A$A[0][2],A$A[1][2],A$C[2] ; Theta || XOR B$A[0][2],B$A[1][2],B$C[2] || XOR A$A[0][3],A$A[1][3],A$C[3] || XOR B$A[0][3],B$A[1][3],B$C[3] || XOR A$A[0][0],A$A[1][0],A$C[0] || XOR B$A[0][0],B$A[1][0],B$C[0] XOR A$A[2][2],A$C[2],A$C[2] || XOR B$A[2][2],B$C[2],B$C[2] || XOR A$A[2][3],A$C[3],A$C[3] || XOR B$A[2][3],B$C[3],B$C[3] || XOR A$A[2][0],A$C[0],A$C[0] || XOR B$A[2][0],B$C[0],B$C[0] XOR A$A[3][2],A$C[2],A$C[2] || XOR B$A[3][2],B$C[2],B$C[2] || XOR A$A[3][3],A$C[3],A$C[3] || XOR B$A[3][3],B$C[3],B$C[3] || XOR A$A[3][0],A$C[0],A$C[0] || XOR B$A[3][0],B$C[0],B$C[0] XOR A$A[4][2],A$C[2],A$C[2] || XOR B$A[4][2],B$C[2],B$C[2] || XOR A$A[4][3],A$C[3],A$C[3] || XOR B$A[4][3],B$C[3],B$C[3] || XOR A$A[4][0],A$C[0],A$C[0] || XOR B$A[4][0],B$C[0],B$C[0] XOR A$A[0][4],A$A[1][4],A$C[4] || XOR B$A[0][4],B$A[1][4],B$C[4] || XOR A$A[0][1],A$A[1][1],A$C[1] || XOR B$A[0][1],B$A[1][1],B$C[1] || STDW A$A[3][0]:A$A[4][0],*SP[1] ; offload some data STDW B$A[3][0]:B$A[4][0],*SP[2] || XOR A$A[2][4],A$C[4],A$C[4] || XOR B$A[2][4],B$C[4],B$C[4] || XOR A$A[2][1],A$C[1],A$C[1] || XOR B$A[2][1],B$C[1],B$C[1] || ROTL B$C[2],1,A$C[5] ; ROL64(C[2],1) || ROTL A$C[2],0,B$C[5] XOR A$A[3][4],A$C[4],A$C[4] || XOR B$A[3][4],B$C[4],B$C[4] || XOR A$A[3][1],A$C[1],A$C[1] || XOR B$A[3][1],B$C[1],B$C[1] || ROTL B$C[3],1,A$C[6] ; ROL64(C[3],1) || ROTL A$C[3],0,B$C[6] XOR A$A[4][4],A$C[4],A$C[4] || XOR B$A[4][4],B$C[4],B$C[4] || XOR A$A[4][1],A$C[1],A$C[1] || XOR B$A[4][1],B$C[1],B$C[1] || XOR A$C[0],A$C[5],A$C[5] ; C[0] ^ ROL64(C[2],1) || XOR B$C[0],B$C[5],B$C[5] XOR A$C[5],A$A[0][1],A$A[0][1] || XOR B$C[5],B$A[0][1],B$A[0][1] || XOR A$C[5],A$A[1][1],A$A[1][1] || XOR B$C[5],B$A[1][1],B$A[1][1] || XOR A$C[5],A$A[2][1],A$A[2][1] || XOR B$C[5],B$A[2][1],B$A[2][1] XOR A$C[5],A$A[3][1],A$A[3][1] || XOR B$C[5],B$A[3][1],B$A[3][1] || XOR A$C[5],A$A[4][1],A$A[4][1] || XOR B$C[5],B$A[4][1],B$A[4][1] || ROTL B$C[4],1,A$C[5] ; ROL64(C[4],1) || ROTL A$C[4],0,B$C[5] || XOR A$C[1],A$C[6],A$C[6] ; C[1] ^ ROL64(C[3],1) || XOR B$C[1],B$C[6],B$C[6] XOR A$C[6],A$A[0][2],A$A[0][2] || XOR B$C[6],B$A[0][2],B$A[0][2] || XOR A$C[6],A$A[1][2],A$A[1][2] || XOR B$C[6],B$A[1][2],B$A[1][2] || XOR A$C[6],A$A[2][2],A$A[2][2] || XOR B$C[6],B$A[2][2],B$A[2][2] || ROTL B$C[1],1,A$C[1] ; ROL64(C[1],1) || ROTL A$C[1],0,B$C[1] XOR A$C[6],A$A[3][2],A$A[3][2] || XOR B$C[6],B$A[3][2],B$A[3][2] || XOR A$C[6],A$A[4][2],A$A[4][2] || XOR B$C[6],B$A[4][2],B$A[4][2] || ROTL B$C[0],1,A$C[6] ; ROL64(C[0],1) || ROTL A$C[0],0,B$C[6] || XOR A$C[5],A$C[2],A$C[2] ; C[2] ^= ROL64(C[4],1) || XOR B$C[5],B$C[2],B$C[2] XOR A$C[2],A$A[0][3],A$A[0][3] || XOR B$C[2],B$A[0][3],B$A[0][3] || XOR A$C[2],A$A[1][3],A$A[1][3] || XOR B$C[2],B$A[1][3],B$A[1][3] || XOR A$C[2],A$A[2][3],A$A[2][3] || XOR B$C[2],B$A[2][3],B$A[2][3] XOR A$C[6],A$C[3],A$C[3] ; C[3] ^= ROL64(C[0],1) || XOR B$C[6],B$C[3],B$C[3] || LDDW *FP[-9],A$A[3][0]:A$A[4][0] ; restore offloaded data || LDDW *SP[2],B$A[3][0]:B$A[4][0] || XOR A$C[2],A$A[3][3],A$A[3][3] || XOR B$C[2],B$A[3][3],B$A[3][3] XOR A$C[2],A$A[4][3],A$A[4][3] || XOR B$C[2],B$A[4][3],B$A[4][3] || XOR A$C[3],A$A[0][4],A$A[0][4] || XOR B$C[3],B$A[0][4],B$A[0][4] || XOR A$C[3],A$A[1][4],A$A[1][4] || XOR B$C[3],B$A[1][4],B$A[1][4] XOR A$C[3],A$A[2][4],A$A[2][4] || XOR B$C[3],B$A[2][4],B$A[2][4] || XOR A$C[3],A$A[3][4],A$A[3][4] || XOR B$C[3],B$A[3][4],B$A[3][4] || XOR A$C[3],A$A[4][4],A$A[4][4] || XOR B$C[3],B$A[4][4],B$A[4][4] XOR A$C[1],A$C[4],A$C[4] ; C[4] ^= ROL64(C[1],1) || XOR B$C[1],B$C[4],B$C[4] || MV A$A[0][1],A$C[1] ; Rho+Pi, "early start" || MV B$A[0][1],B$C[1] ___ &ROL64 ($A[1][1],$rhotates[1][1],$A[0][1],"||"); $code.=<<___; XOR A$C[4],A$A[0][0],A$A[0][0] || XOR B$C[4],B$A[0][0],B$A[0][0] || XOR A$C[4],A$A[1][0],A$A[1][0] || XOR B$C[4],B$A[1][0],B$A[1][0] || MV A$A[0][3],A$C[3] || MV B$A[0][3],B$C[3] ___ &ROL64 ($A[3][3],$rhotates[3][3],$A[0][3],"||"); $code.=<<___; XOR A$C[4],A$A[2][0],A$A[2][0] || XOR B$C[4],B$A[2][0],B$A[2][0] || XOR A$C[4],A$A[3][0],A$A[3][0] || XOR B$C[4],B$A[3][0],B$A[3][0] || MV A$A[0][2],A$C[2] || MV B$A[0][2],B$C[2] ___ &ROL64 ($A[2][2],$rhotates[2][2],$A[0][2],"||"); $code.=<<___; XOR A$C[4],A$A[4][0],A$A[4][0] || XOR B$C[4],B$A[4][0],B$A[4][0] || MV A$A[0][4],A$C[4] || MV B$A[0][4],B$C[4] ___ &ROL64 ($A[4][4],$rhotates[4][4],$A[0][4],"||"); &ROL64 ($A[1][4],$rhotates[1][4],$A[1][1]); $code.=<<___; || LDW *${iotas}++[2],A$C[0] ___ &ROL64 ($A[2][3],$rhotates[2][3],$A[2][2]); $code.=<<___; || LDW *${iotas}[-1],B$C[0] ___ &ROL64 ($A[3][2],$rhotates[3][2],$A[3][3]); &ROL64 ($A[4][1],$rhotates[4][1],$A[4][4]); &ROL64 ($A[4][2],$rhotates[4][2],$A[1][4]); &ROL64 ($A[3][4],$rhotates[3][4],$A[2][3]); &ROL64 ($A[2][1],$rhotates[2][1],$A[3][2]); &ROL64 ($A[1][3],$rhotates[1][3],$A[4][1]); &ROL64 ($A[2][4],$rhotates[2][4],$A[4][2]); &ROL64 ($A[4][3],$rhotates[4][3],$A[3][4]); &ROL64 ($A[1][2],$rhotates[1][2],$A[2][1]); &ROL64 ($A[3][1],$rhotates[3][1],$A[1][3]); &ROL64 ($A[4][0],$rhotates[4][0],$A[2][4]); &ROL64 ($A[3][0],$rhotates[3][0],$A[4][3]); &ROL64 ($A[2][0],$rhotates[2][0],$A[1][2]); &ROL64 ($A[1][0],$rhotates[1][0],$A[3][1]); #&ROL64 ($C[3], $rhotates[0][3],$A[1][0]); # moved below &ROL64 ($C[1], $rhotates[0][1],$A[2][0]); &ROL64 ($C[4], $rhotates[0][4],$A[3][0]); &ROL64 ($C[2], $rhotates[0][2],$A[4][0]); $code.=<<___; || ANDN A$A[0][2],A$A[0][1],A$C[4] ; Chi+Iota || ANDN B$A[0][2],B$A[0][1],B$C[4] || ANDN A$A[0][3],A$A[0][2],A$C[1] || ANDN B$A[0][3],B$A[0][2],B$C[1] || ANDN A$A[0][4],A$A[0][3],A$C[2] || ANDN B$A[0][4],B$A[0][3],B$C[2] ___ &ROL64 ($C[3], $rhotates[0][3],$A[1][0]); $code.=<<___; || ANDN A$A[0][0],A$A[0][4],A$C[3] || ANDN B$A[0][0],B$A[0][4],B$C[3] || XOR A$C[4],A$A[0][0],A$A[0][0] || XOR B$C[4],B$A[0][0],B$A[0][0] || ANDN A$A[0][1],A$A[0][0],A$C[4] || ANDN B$A[0][1],B$A[0][0],B$C[4] XOR A$C[1],A$A[0][1],A$A[0][1] || XOR B$C[1],B$A[0][1],B$A[0][1] || XOR A$C[2],A$A[0][2],A$A[0][2] || XOR B$C[2],B$A[0][2],B$A[0][2] || XOR A$C[3],A$A[0][3],A$A[0][3] || XOR B$C[3],B$A[0][3],B$A[0][3] XOR A$C[4],A$A[0][4],A$A[0][4] || XOR B$C[4],B$A[0][4],B$A[0][4] || XOR A$C[0],A$A[0][0],A$A[0][0] ; A[0][0] ^= iotas[i++]; || XOR B$C[0],B$A[0][0],B$A[0][0] || EXTU $iotas,24,24,A0 ; A0 is A$C[0], as we done? ANDN A$A[1][2],A$A[1][1],A$C[4] || ANDN B$A[1][2],B$A[1][1],B$C[4] || ANDN A$A[1][3],A$A[1][2],A$C[1] || ANDN B$A[1][3],B$A[1][2],B$C[1] || ANDN A$A[1][4],A$A[1][3],A$C[2] || ANDN B$A[1][4],B$A[1][3],B$C[2] ANDN A$A[1][0],A$A[1][4],A$C[3] || ANDN B$A[1][0],B$A[1][4],B$C[3] || XOR A$C[4],A$A[1][0],A$A[1][0] || XOR B$C[4],B$A[1][0],B$A[1][0] || ANDN A$A[1][1],A$A[1][0],A$C[4] || ANDN B$A[1][1],B$A[1][0],B$C[4] XOR A$C[1],A$A[1][1],A$A[1][1] || XOR B$C[1],B$A[1][1],B$A[1][1] || XOR A$C[2],A$A[1][2],A$A[1][2] || XOR B$C[2],B$A[1][2],B$A[1][2] || XOR A$C[3],A$A[1][3],A$A[1][3] || XOR B$C[3],B$A[1][3],B$A[1][3] XOR A$C[4],A$A[1][4],A$A[1][4] || XOR B$C[4],B$A[1][4],B$A[1][4] || ANDN A$A[2][2],A$A[2][1],A$C[4] || ANDN B$A[2][2],B$A[2][1],B$C[4] || ANDN A$A[2][3],A$A[2][2],A$C[1] || ANDN B$A[2][3],B$A[2][2],B$C[1] ANDN A$A[2][4],A$A[2][3],A$C[2] || ANDN B$A[2][4],B$A[2][3],B$C[2] || ANDN A$A[2][0],A$A[2][4],A$C[3] || ANDN B$A[2][0],B$A[2][4],B$C[3] || XOR A$C[4],A$A[2][0],A$A[2][0] || XOR B$C[4],B$A[2][0],B$A[2][0] ANDN A$A[2][1],A$A[2][0],A$C[4] || ANDN B$A[2][1],B$A[2][0],B$C[4] || XOR A$C[1],A$A[2][1],A$A[2][1] || XOR B$C[1],B$A[2][1],B$A[2][1] || XOR A$C[2],A$A[2][2],A$A[2][2] || XOR B$C[2],B$A[2][2],B$A[2][2] XOR A$C[3],A$A[2][3],A$A[2][3] || XOR B$C[3],B$A[2][3],B$A[2][3] || XOR A$C[4],A$A[2][4],A$A[2][4] || XOR B$C[4],B$A[2][4],B$A[2][4] ANDN A$A[3][2],A$A[3][1],A$C[4] || ANDN B$A[3][2],B$A[3][1],B$C[4] || ANDN A$A[3][3],A$A[3][2],A$C[1] || ANDN B$A[3][3],B$A[3][2],B$C[1] || ANDN A$A[3][4],A$A[3][3],A$C[2] || ANDN B$A[3][4],B$A[3][3],B$C[2] ANDN A$A[3][0],A$A[3][4],A$C[3] || ANDN B$A[3][0],B$A[3][4],B$C[3] || XOR A$C[4],A$A[3][0],A$A[3][0] || XOR B$C[4],B$A[3][0],B$A[3][0] || ANDN A$A[3][1],A$A[3][0],A$C[4] || ANDN B$A[3][1],B$A[3][0],B$C[4] XOR A$C[1],A$A[3][1],A$A[3][1] || XOR B$C[1],B$A[3][1],B$A[3][1] || XOR A$C[2],A$A[3][2],A$A[3][2] || XOR B$C[2],B$A[3][2],B$A[3][2] || XOR A$C[3],A$A[3][3],A$A[3][3] ||[A0] BNOP loop? XOR B$C[3],B$A[3][3],B$A[3][3] || XOR A$C[4],A$A[3][4],A$A[3][4] || XOR B$C[4],B$A[3][4],B$A[3][4] ||[!A0] LDDW *FP[-7],A3:A2 ||[!A0] LDDW *SP[4], RA:B2 ANDN A$A[4][2],A$A[4][1],A$C[4] || ANDN B$A[4][2],B$A[4][1],B$C[4] || ANDN A$A[4][3],A$A[4][2],A$C[1] || ANDN B$A[4][3],B$A[4][2],B$C[1] || ANDN A$A[4][4],A$A[4][3],A$C[2] || ANDN B$A[4][4],B$A[4][3],B$C[2] ANDN A$A[4][0],A$A[4][4],A$C[3] || ANDN B$A[4][0],B$A[4][4],B$C[3] || XOR A$C[4],A$A[4][0],A$A[4][0] || XOR B$C[4],B$A[4][0],B$A[4][0] || ANDN A$A[4][1],A$A[4][0],A$C[4] || ANDN B$A[4][1],B$A[4][0],B$C[4] XOR A$C[1],A$A[4][1],A$A[4][1] || XOR B$C[1],B$A[4][1],B$A[4][1] || XOR A$C[2],A$A[4][2],A$A[4][2] || XOR B$C[2],B$A[4][2],B$A[4][2] || XOR A$C[3],A$A[4][3],A$A[4][3] || XOR B$C[3],B$A[4][3],B$A[4][3] XOR A$C[4],A$A[4][4],A$A[4][4] || XOR B$C[4],B$A[4][4],B$A[4][4] ;;===== branch to loop? is taken here BNOP RA,5 .endasmfunc .newblock .global _KeccakF1600 .align 32 _KeccakF1600: .asmfunc stack_usage(80) STW FP,*SP--(80) ; save frame pointer || MV SP,FP STDW B13:B12,*SP[9] || STDW A13:A12,*FP[-4] STDW B11:B10,*SP[8] || STDW A11:A10,*FP[-5] STW RA, *SP[15] || STW A14,*FP[-6] || MV A4,A2 || ADD 4,A4,B2 LDW *A2++[2],A$A[0][0] ; load A[5][5] || LDW *B2++[2],B$A[0][0] LDW *A2++[2],A$A[0][1] || LDW *B2++[2],B$A[0][1] LDW *A2++[2],A$A[0][2] || LDW *B2++[2],B$A[0][2] LDW *A2++[2],A$A[0][3] || LDW *B2++[2],B$A[0][3] LDW *A2++[2],A$A[0][4] || LDW *B2++[2],B$A[0][4] LDW *A2++[2],A$A[1][0] || LDW *B2++[2],B$A[1][0] LDW *A2++[2],A$A[1][1] || LDW *B2++[2],B$A[1][1] LDW *A2++[2],A$A[1][2] || LDW *B2++[2],B$A[1][2] LDW *A2++[2],A$A[1][3] || LDW *B2++[2],B$A[1][3] LDW *A2++[2],A$A[1][4] || LDW *B2++[2],B$A[1][4] LDW *A2++[2],A$A[2][0] || LDW *B2++[2],B$A[2][0] LDW *A2++[2],A$A[2][1] || LDW *B2++[2],B$A[2][1] LDW *A2++[2],A$A[2][2] || LDW *B2++[2],B$A[2][2] LDW *A2++[2],A$A[2][3] || LDW *B2++[2],B$A[2][3] LDW *A2++[2],A$A[2][4] || LDW *B2++[2],B$A[2][4] LDW *A2++[2],A$A[3][0] || LDW *B2++[2],B$A[3][0] LDW *A2++[2],A$A[3][1] || LDW *B2++[2],B$A[3][1] LDW *A2++[2],A$A[3][2] || LDW *B2++[2],B$A[3][2] LDW *A2++[2],A$A[3][3] || LDW *B2++[2],B$A[3][3] LDW *A2++[2],A$A[3][4] || LDW *B2++[2],B$A[3][4] || BNOP _KeccakF1600_int ADDKPC ret?,RA || LDW *A2++[2],A$A[4][0] || LDW *B2++[2],B$A[4][0] LDW *A2++[2],A$A[4][1] || LDW *B2++[2],B$A[4][1] LDW *A2++[2],A$A[4][2] || LDW *B2++[2],B$A[4][2] LDW *A2++[2],A$A[4][3] || LDW *B2++[2],B$A[4][3] LDW *A2,A$A[4][4] || LDW *B2,B$A[4][4] || ADDK -192,A2 ; rewind || ADDK -192,B2 .align 16 ret?: STW A$A[0][0],*A2++[2] ; store A[5][5] || STW B$A[0][0],*B2++[2] STW A$A[0][1],*A2++[2] || STW B$A[0][1],*B2++[2] STW A$A[0][2],*A2++[2] || STW B$A[0][2],*B2++[2] STW A$A[0][3],*A2++[2] || STW B$A[0][3],*B2++[2] STW A$A[0][4],*A2++[2] || STW B$A[0][4],*B2++[2] STW A$A[1][0],*A2++[2] || STW B$A[1][0],*B2++[2] STW A$A[1][1],*A2++[2] || STW B$A[1][1],*B2++[2] STW A$A[1][2],*A2++[2] || STW B$A[1][2],*B2++[2] STW A$A[1][3],*A2++[2] || STW B$A[1][3],*B2++[2] STW A$A[1][4],*A2++[2] || STW B$A[1][4],*B2++[2] STW A$A[2][0],*A2++[2] || STW B$A[2][0],*B2++[2] STW A$A[2][1],*A2++[2] || STW B$A[2][1],*B2++[2] STW A$A[2][2],*A2++[2] || STW B$A[2][2],*B2++[2] STW A$A[2][3],*A2++[2] || STW B$A[2][3],*B2++[2] STW A$A[2][4],*A2++[2] || STW B$A[2][4],*B2++[2] STW A$A[3][0],*A2++[2] || STW B$A[3][0],*B2++[2] STW A$A[3][1],*A2++[2] || STW B$A[3][1],*B2++[2] STW A$A[3][2],*A2++[2] || STW B$A[3][2],*B2++[2] STW A$A[3][3],*A2++[2] || STW B$A[3][3],*B2++[2] STW A$A[3][4],*A2++[2] || STW B$A[3][4],*B2++[2] LDW *SP[15],RA || LDW *FP[-6],A14 STW A$A[4][0],*A2++[2] || STW B$A[4][0],*B2++[2] STW A$A[4][1],*A2++[2] || STW B$A[4][1],*B2++[2] STW A$A[4][2],*A2++[2] || STW B$A[4][2],*B2++[2] STW A$A[4][3],*A2++[2] || STW B$A[4][3],*B2++[2] STW A$A[4][4],*A2 || STW B$A[4][4],*B2 || ADDK -192,A2 ; rewind MV A2,A4 ; return original A4 || LDDW *SP[8], B11:B10 || LDDW *FP[-5],A11:A10 LDDW *SP[9], B13:B12 || LDDW *FP[-4],A13:A12 || BNOP RA LDW *++SP(80),FP ; restore frame pointer NOP 4 ; wait till FP is committed .endasmfunc .newblock .asg B2,BSZ .asg A2,INP .asg A3,LEN .global _SHA3_absorb .align 32 _SHA3_absorb: .asmfunc stack_usage(80) STW FP,*SP--(80) ; save frame pointer || MV SP,FP STDW B13:B12,*SP[9] || STDW A13:A12,*FP[-4] STDW B11:B10,*SP[8] || STDW A11:A10,*FP[-5] STW RA, *SP[15] || STW A14,*FP[-6] STW A4,*SP[1] ; save A[][] || MV B4,INP ; reassign arguments || MV A6,LEN || MV B6,BSZ || ADD 4,A4,B4 LDW *A4++[2],A$A[0][0] ; load A[5][5] || LDW *B4++[2],B$A[0][0] LDW *A4++[2],A$A[0][1] || LDW *B4++[2],B$A[0][1] LDW *A4++[2],A$A[0][2] || LDW *B4++[2],B$A[0][2] LDW *A4++[2],A$A[0][3] || LDW *B4++[2],B$A[0][3] LDW *A4++[2],A$A[0][4] || LDW *B4++[2],B$A[0][4] LDW *A4++[2],A$A[1][0] || LDW *B4++[2],B$A[1][0] LDW *A4++[2],A$A[1][1] || LDW *B4++[2],B$A[1][1] LDW *A4++[2],A$A[1][2] || LDW *B4++[2],B$A[1][2] LDW *A4++[2],A$A[1][3] || LDW *B4++[2],B$A[1][3] LDW *A4++[2],A$A[1][4] || LDW *B4++[2],B$A[1][4] LDW *A4++[2],A$A[2][0] || LDW *B4++[2],B$A[2][0] LDW *A4++[2],A$A[2][1] || LDW *B4++[2],B$A[2][1] LDW *A4++[2],A$A[2][2] || LDW *B4++[2],B$A[2][2] LDW *A4++[2],A$A[2][3] || LDW *B4++[2],B$A[2][3] LDW *A4++[2],A$A[2][4] || LDW *B4++[2],B$A[2][4] LDW *A4++[2],A$A[3][0] || LDW *B4++[2],B$A[3][0] LDW *A4++[2],A$A[3][1] || LDW *B4++[2],B$A[3][1] LDW *A4++[2],A$A[3][2] || LDW *B4++[2],B$A[3][2] LDW *A4++[2],A$A[3][3] || LDW *B4++[2],B$A[3][3] LDW *A4++[2],A$A[3][4] || LDW *B4++[2],B$A[3][4] LDW *A4++[2],A$A[4][0] || LDW *B4++[2],B$A[4][0] LDW *A4++[2],A$A[4][1] || LDW *B4++[2],B$A[4][1] LDW *A4++[2],A$A[4][2] || LDW *B4++[2],B$A[4][2] LDW *A4++[2],A$A[4][3] || LDW *B4++[2],B$A[4][3] LDW *A4,A$A[4][4] || LDW *B4,B$A[4][4] || ADDKPC loop?,RA STDW RA:BSZ,*SP[4] loop?: CMPLTU LEN,BSZ,A0 ; len < bsz? || SHRU BSZ,3,BSZ [A0] BNOP ret? ||[A0] ZERO BSZ ||[A0] LDW *SP[1],A2 ; pull A[][] [BSZ] LDNDW *INP++,A1:A0 ||[BSZ] SUB LEN,8,LEN ||[BSZ] SUB BSZ,1,BSZ NOP 4 ___ for ($y = 0; $y < 5; $y++) { for ($x = 0; $x < ($y<4 ? 5 : 4); $x++) { $code.=<<___; .if .BIG_ENDIAN SWAP2 A0,A1 || SWAP2 A1,A0 SWAP4 A0,A0 SWAP4 A1,A1 ||[!BSZ]BNOP _KeccakF1600_cheat ||[!BSZ]STDW LEN:INP,*SP[3] || DEAL A0,A0 .else [!BSZ]BNOP _KeccakF1600_cheat ||[!BSZ]STDW LEN:INP,*SP[3] || DEAL A0,A0 .endif [BSZ] LDNDW *INP++,A1:A0 || DEAL A1,A1 [BSZ] SUB LEN,8,LEN ||[BSZ] SUB BSZ,1,BSZ PACK2 A1,A0,A0 || PACKH2 A1,A0,A1 XOR A0,A$A[$y][$x],A$A[$y][$x] XOR A1,B$A[$y][$x],B$A[$y][$x] ___ } } $code.=<<___; .if .BIG_ENDIAN SWAP2 A0,A1 || SWAP2 A1,A0 SWAP4 A0,A0 SWAP4 A1,A1 .endif BNOP _KeccakF1600_cheat || STDW LEN:INP,*SP[3] || DEAL A0,A0 DEAL A1,A1 NOP PACK2 A1,A0,A0 || PACKH2 A1,A0,A1 XOR A0,A$A[4][4],A$A[4][4] XOR A1,B$A[4][4],B$A[4][4] .align 16 ret?: MV LEN,A4 ; return value || ADD 4,A2,B2 STW A$A[0][0],*A2++[2] ; store A[5][5] || STW B$A[0][0],*B2++[2] STW A$A[0][1],*A2++[2] || STW B$A[0][1],*B2++[2] STW A$A[0][2],*A2++[2] || STW B$A[0][2],*B2++[2] STW A$A[0][3],*A2++[2] || STW B$A[0][3],*B2++[2] STW A$A[0][4],*A2++[2] || STW B$A[0][4],*B2++[2] STW A$A[1][0],*A2++[2] || STW B$A[1][0],*B2++[2] STW A$A[1][1],*A2++[2] || STW B$A[1][1],*B2++[2] STW A$A[1][2],*A2++[2] || STW B$A[1][2],*B2++[2] STW A$A[1][3],*A2++[2] || STW B$A[1][3],*B2++[2] STW A$A[1][4],*A2++[2] || STW B$A[1][4],*B2++[2] STW A$A[2][0],*A2++[2] || STW B$A[2][0],*B2++[2] STW A$A[2][1],*A2++[2] || STW B$A[2][1],*B2++[2] STW A$A[2][2],*A2++[2] || STW B$A[2][2],*B2++[2] STW A$A[2][3],*A2++[2] || STW B$A[2][3],*B2++[2] STW A$A[2][4],*A2++[2] || STW B$A[2][4],*B2++[2] LDW *SP[15],RA || LDW *FP[-6],A14 STW A$A[3][0],*A2++[2] || STW B$A[3][0],*B2++[2] STW A$A[3][1],*A2++[2] || STW B$A[3][1],*B2++[2] STW A$A[3][2],*A2++[2] || STW B$A[3][2],*B2++[2] STW A$A[3][3],*A2++[2] || STW B$A[3][3],*B2++[2] STW A$A[3][4],*A2++[2] || STW B$A[3][4],*B2++[2] LDDW *SP[8], B11:B10 || LDDW *FP[-5],A11:A10 LDDW *SP[9], B13:B12 || LDDW *FP[-4],A13:A12 BNOP RA || LDW *++SP(80),FP ; restore frame pointer STW A$A[4][0],*A2++[2] || STW B$A[4][0],*B2++[2] STW A$A[4][1],*A2++[2] || STW B$A[4][1],*B2++[2] STW A$A[4][2],*A2++[2] || STW B$A[4][2],*B2++[2] STW A$A[4][3],*A2++[2] || STW B$A[4][3],*B2++[2] STW A$A[4][4],*A2++[2] || STW B$A[4][4],*B2++[2] .endasmfunc .newblock .global _SHA3_squeeze .asg A12,OUT .asg A13,LEN .asg A14,BSZ .align 32 _SHA3_squeeze: .asmfunc stack_usage(24) STW FP,*SP--(24) ; save frame pointer || MV SP,FP STW RA, *SP[5] || STW A14,*FP[-2] STDW A13:A12,*FP[-2] || MV B4,OUT ; reassign arguments MV A6,LEN || MV B6,BSZ loop?: LDW *SP[5],RA ; reload RA || SHRU BSZ,3,A1 || MV A4,A8 || ADD 4,A4,B8 block?: CMPLTU LEN,8,A0 ; len < 8? [A0] BNOP tail? LDW *A8++[2],A9 || LDW *B8++[2],B9 || SUB LEN,8,LEN ; len -= 8 MV LEN,A0 || SUB A1,1,A1 ; bsz-- || NOP 4 .if .BIG_ENDIAN SWAP4 A9,A9 || SWAP4 B9,B9 SWAP2 A9,A9 || SWAP2 B9,B9 .endif [!A0] BNOP ret? ||[!A0] ZERO A1 PACK2 B9,A9,B7 ||[A1] BNOP block? PACKH2 B9,A9,B9 || SHFL B7,B7 SHFL B9,B9 STNW B7,*OUT++ STNW B9,*OUT++ NOP BNOP _KeccakF1600,4 ADDKPC loop?,RA .align 16 tail?: .if .BIG_ENDIAN SWAP4 A9,A9 || SWAP4 B9,B9 SWAP2 A9,A9 || SWAP2 B9,B9 .endif PACK2 B9,A9,B7 PACKH2 B9,A9,B9 || SHFL B7,B7 SHFL B9,B9 STB B7,*OUT++ || SHRU B7,8,B7 || ADD LEN,7,A0 [A0] STB B7,*OUT++ ||[A0] SHRU B7,8,B7 ||[A0] SUB A0,1,A0 [A0] STB B7,*OUT++ ||[A0] SHRU B7,8,B7 ||[A0] SUB A0,1,A0 [A0] STB B7,*OUT++ ||[A0] SUB A0,1,A0 [A0] STB B9,*OUT++ ||[A0] SHRU B9,8,B9 ||[A0] SUB A0,1,A0 [A0] STB B9,*OUT++ ||[A0] SHRU B9,8,B9 ||[A0] SUB A0,1,A0 [A0] STB B9,*OUT++ ret?: LDDW *FP[-2],A13:A12 BNOP RA || LDW *FP[-2],A14 LDW *++SP(24),FP ; restore frame pointer NOP 4 ; wait till FP is committed .endasmfunc .if __TI_EABI__ .sect ".text:sha_asm.const" .else .sect ".const:sha_asm" .endif .align 256 .uword 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 iotas: .uword 0x00000001, 0x00000000 .uword 0x00000000, 0x00000089 .uword 0x00000000, 0x8000008b .uword 0x00000000, 0x80008080 .uword 0x00000001, 0x0000008b .uword 0x00000001, 0x00008000 .uword 0x00000001, 0x80008088 .uword 0x00000001, 0x80000082 .uword 0x00000000, 0x0000000b .uword 0x00000000, 0x0000000a .uword 0x00000001, 0x00008082 .uword 0x00000000, 0x00008003 .uword 0x00000001, 0x0000808b .uword 0x00000001, 0x8000000b .uword 0x00000001, 0x8000008a .uword 0x00000001, 0x80000081 .uword 0x00000000, 0x80000081 .uword 0x00000000, 0x80000008 .uword 0x00000000, 0x00000083 .uword 0x00000000, 0x80008003 .uword 0x00000001, 0x80008088 .uword 0x00000000, 0x80000088 .uword 0x00000001, 0x00008000 .uword 0x00000000, 0x80008082 .cstring "Keccak-1600 absorb and squeeze for C64x, CRYPTOGAMS by " .align 4 ___ print $code;