#!/usr/bin/env perl # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Poly1305 hash for MIPS64. # # May 2016 # # Numbers are cycles per processed byte with poly1305_blocks alone. # # IALU/gcc # R1x000 5.64/+120% (big-endian) # Octeon II 3.80/+280% (little-endian) ###################################################################### # There is a number of MIPS ABI in use, O32 and N32/64 are most # widely used. Then there is a new contender: NUBI. It appears that if # one picks the latter, it's possible to arrange code in ABI neutral # manner. Therefore let's stick to NUBI register layout: # ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); # # The return value is placed in $a0. Following coding rules facilitate # interoperability: # # - never ever touch $tp, "thread pointer", former $gp [o32 can be # excluded from the rule, because it's specified volatile]; # - copy return value to $t0, former $v0 [or to $a0 if you're adapting # old code]; # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; # # For reference here is register layout for N32/64 MIPS ABIs: # # ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); # # # ###################################################################### $flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64 die "MIPS64 only" unless ($flavour =~ /64|n32/i); $v0 = ($flavour =~ /nubi/i) ? $a0 : $t0; $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000"; ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3); ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1); $code.=<<___; #ifdef MIPSEB # define MSB 0 # define LSB 7 #else # define MSB 7 # define LSB 0 #endif .text .set noat .set noreorder .align 5 .globl poly1305_init .ent poly1305_init poly1305_init: .frame $sp,0,$ra .set reorder sd $zero,0($ctx) sd $zero,8($ctx) sd $zero,16($ctx) beqz $inp,.Lno_key ldl $in0,0+MSB($inp) ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) ldr $in1,8+LSB($inp) #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) dsbh $in0,$in0 # byte swap dsbh $in1,$in1 dshd $in0,$in0 dshd $in1,$in1 # else ori $tmp0,$zero,0xFF dsll $tmp2,$tmp0,32 or $tmp0,$tmp2 # 0x000000FF000000FF and $tmp1,$in0,$tmp0 # byte swap and $tmp3,$in1,$tmp0 dsrl $tmp2,$in0,24 dsrl $tmp4,$in1,24 dsll $tmp1,24 dsll $tmp3,24 and $tmp2,$tmp0 and $tmp4,$tmp0 dsll $tmp0,8 # 0x0000FF000000FF00 or $tmp1,$tmp2 or $tmp3,$tmp4 and $tmp2,$in0,$tmp0 and $tmp4,$in1,$tmp0 dsrl $in0,8 dsrl $in1,8 dsll $tmp2,8 dsll $tmp4,8 and $in0,$tmp0 and $in1,$tmp0 or $tmp1,$tmp2 or $tmp3,$tmp4 or $in0,$tmp1 or $in1,$tmp3 dsrl $tmp1,$in0,32 dsrl $tmp3,$in1,32 dsll $in0,32 dsll $in1,32 or $in0,$tmp1 or $in1,$tmp3 # endif #endif li $tmp0,1 dsll $tmp0,32 daddiu $tmp0,-63 dsll $tmp0,28 daddiu $tmp0,-1 # 0ffffffc0fffffff and $in0,$tmp0 daddiu $tmp0,-3 # 0ffffffc0ffffffc and $in1,$tmp0 sd $in0,24($ctx) dsrl $tmp0,$in1,2 sd $in1,32($ctx) daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2) sd $tmp0,40($ctx) .Lno_key: li $v0,0 # return 0 jr $ra .end poly1305_init ___ { my ($h0,$h1,$h2,$r0,$r1,$s1,$d0,$d1,$d2) = ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2); $code.=<<___; .align 5 .globl poly1305_blocks .ent poly1305_blocks poly1305_blocks: .set noreorder dsrl $len,4 # number of complete blocks beqz $len,.Lno_data nop .frame $sp,8*8,$ra .mask $SAVED_REGS_MASK,-8 dsub $sp,8*8 sd $s5,0($sp) sd $s4,8($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue sd $s3,16($sp) sd $s2,24($sp) sd $s1,32($sp) sd $s0,40($sp) ___ $code.=<<___; .set reorder ld $h0,0($ctx) # load hash value ld $h1,8($ctx) ld $h2,16($ctx) ld $r0,24($ctx) # load key ld $r1,32($ctx) ld $s1,40($ctx) .Loop: ldl $in0,0+MSB($inp) # load input ldl $in1,8+MSB($inp) ldr $in0,0+LSB($inp) daddiu $len,-1 ldr $in1,8+LSB($inp) daddiu $inp,16 #ifdef MIPSEB # if defined(_MIPS_ARCH_MIPS64R2) dsbh $in0,$in0 # byte swap dsbh $in1,$in1 dshd $in0,$in0 dshd $in1,$in1 # else ori $tmp0,$zero,0xFF dsll $tmp2,$tmp0,32 or $tmp0,$tmp2 # 0x000000FF000000FF and $tmp1,$in0,$tmp0 # byte swap and $tmp3,$in1,$tmp0 dsrl $tmp2,$in0,24 dsrl $tmp4,$in1,24 dsll $tmp1,24 dsll $tmp3,24 and $tmp2,$tmp0 and $tmp4,$tmp0 dsll $tmp0,8 # 0x0000FF000000FF00 or $tmp1,$tmp2 or $tmp3,$tmp4 and $tmp2,$in0,$tmp0 and $tmp4,$in1,$tmp0 dsrl $in0,8 dsrl $in1,8 dsll $tmp2,8 dsll $tmp4,8 and $in0,$tmp0 and $in1,$tmp0 or $tmp1,$tmp2 or $tmp3,$tmp4 or $in0,$tmp1 or $in1,$tmp3 dsrl $tmp1,$in0,32 dsrl $tmp3,$in1,32 dsll $in0,32 dsll $in1,32 or $in0,$tmp1 or $in1,$tmp3 # endif #endif daddu $h0,$in0 # accumulate input daddu $h1,$in1 sltu $tmp0,$h0,$in0 sltu $tmp1,$h1,$in1 daddu $h1,$tmp0 dmultu $r0,$h0 # h0*r0 daddu $h2,$padbit sltu $tmp0,$h1,$tmp0 mflo $d0 mfhi $d1 dmultu $s1,$h1 # h1*5*r1 daddu $tmp0,$tmp1 daddu $h2,$tmp0 mflo $tmp0 mfhi $tmp1 dmultu $r1,$h0 # h0*r1 daddu $d0,$tmp0 daddu $d1,$tmp1 mflo $tmp2 mfhi $d2 sltu $tmp0,$d0,$tmp0 daddu $d1,$tmp0 dmultu $r0,$h1 # h1*r0 daddu $d1,$tmp2 sltu $tmp2,$d1,$tmp2 mflo $tmp0 mfhi $tmp1 daddu $d2,$tmp2 dmultu $s1,$h2 # h2*5*r1 daddu $d1,$tmp0 daddu $d2,$tmp1 mflo $tmp2 dmultu $r0,$h2 # h2*r0 sltu $tmp0,$d1,$tmp0 daddu $d2,$tmp0 mflo $tmp3 daddu $d1,$tmp2 daddu $d2,$tmp3 sltu $tmp2,$d1,$tmp2 daddu $d2,$tmp2 li $tmp0,-4 # final reduction and $tmp0,$d2 dsrl $tmp1,$d2,2 andi $h2,$d2,3 daddu $tmp0,$tmp1 daddu $h0,$d0,$tmp0 sltu $tmp0,$h0,$tmp0 daddu $h1,$d1,$tmp0 sltu $tmp0,$h1,$tmp0 daddu $h2,$h2,$tmp0 bnez $len,.Loop sd $h0,0($ctx) # store hash value sd $h1,8($ctx) sd $h2,16($ctx) .set noreorder ld $s5,0($sp) # epilogue ld $s4,8($sp) ___ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue ld $s3,16($sp) ld $s2,24($sp) ld $s1,32($sp) ld $s0,40($sp) ___ $code.=<<___; dadd $sp,8*8 .Lno_data: jr $ra nop .end poly1305_blocks ___ } { my ($ctx,$mac,$nonce) = ($a0,$a1,$a2); $code.=<<___; .align 5 .globl poly1305_emit .ent poly1305_emit poly1305_emit: .frame $sp,0,$ra .set reorder ld $tmp0,0($ctx) ld $tmp1,8($ctx) ld $tmp2,16($ctx) daddiu $in0,$tmp0,5 # compare to modulus sltiu $tmp3,$in0,5 daddu $in1,$tmp1,$tmp3 sltu $tmp3,$in1,$tmp3 daddu $tmp2,$tmp2,$tmp3 dsrl $tmp2,2 # see if it carried/borrowed dsubu $tmp2,$zero,$tmp2 nor $tmp3,$zero,$tmp2 and $in0,$tmp2 and $tmp0,$tmp3 and $in1,$tmp2 and $tmp1,$tmp3 or $in0,$tmp0 or $in1,$tmp1 lwu $tmp0,0($nonce) # load nonce lwu $tmp1,4($nonce) lwu $tmp2,8($nonce) lwu $tmp3,12($nonce) dsll $tmp1,32 dsll $tmp3,32 or $tmp0,$tmp1 or $tmp2,$tmp3 daddu $in0,$tmp0 # accumulate nonce daddu $in1,$tmp2 sltu $tmp0,$in0,$tmp0 daddu $in1,$tmp0 dsrl $tmp0,$in0,8 # write mac value dsrl $tmp1,$in0,16 dsrl $tmp2,$in0,24 sb $in0,0($mac) dsrl $tmp3,$in0,32 sb $tmp0,1($mac) dsrl $tmp0,$in0,40 sb $tmp1,2($mac) dsrl $tmp1,$in0,48 sb $tmp2,3($mac) dsrl $tmp2,$in0,56 sb $tmp3,4($mac) dsrl $tmp3,$in1,8 sb $tmp0,5($mac) dsrl $tmp0,$in1,16 sb $tmp1,6($mac) dsrl $tmp1,$in1,24 sb $tmp2,7($mac) sb $in1,8($mac) dsrl $tmp2,$in1,32 sb $tmp3,9($mac) dsrl $tmp3,$in1,40 sb $tmp0,10($mac) dsrl $tmp0,$in1,48 sb $tmp1,11($mac) dsrl $tmp1,$in1,56 sb $tmp2,12($mac) sb $tmp3,13($mac) sb $tmp0,14($mac) sb $tmp1,15($mac) jr $ra .end poly1305_emit .rdata .asciiz "Poly1305 for MIPS64, CRYPTOGAMS by " .align 2 ___ } $output=pop and open STDOUT,">$output"; print $code; close STDOUT;