947716c187
MIPS[32|64]R6 is binary and source incompatible with previous MIPS ISA specifications. Fortunately it's still possible to resolve differences in source code with standard pre-processor and switching to trap-free version of addition and subtraction instructions. Reviewed-by: Richard Levitte <levitte@openssl.org>
437 lines
8.8 KiB
Raku
Executable file
437 lines
8.8 KiB
Raku
Executable file
#! /usr/bin/env perl
|
|
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the OpenSSL license (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
|
|
# Poly1305 hash for MIPS64.
|
|
#
|
|
# May 2016
|
|
#
|
|
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
|
#
|
|
# IALU/gcc
|
|
# R1x000 5.64/+120% (big-endian)
|
|
# Octeon II 3.80/+280% (little-endian)
|
|
|
|
######################################################################
|
|
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
|
|
# widely used. Then there is a new contender: NUBI. It appears that if
|
|
# one picks the latter, it's possible to arrange code in ABI neutral
|
|
# manner. Therefore let's stick to NUBI register layout:
|
|
#
|
|
# NUBI register layout.  Each variable holds the assembler spelling
# ("$<number>") of the general-purpose register it is named after, so it
# can be interpolated straight into the assembly templates below.
($zero, $at, $t0, $t1, $t2) = map { '$' . $_ } 0 .. 2, 24, 25;
($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) = map { '$' . $_ } 4 .. 11;
($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
    map { '$' . $_ } 12 .. 23;
($gp, $tp, $sp, $fp, $ra) = map { '$' . $_ } 3, 28 .. 31;
|
|
#
|
|
# The return value is placed in $a0. The following coding rules facilitate
|
|
# interoperability:
|
|
#
|
|
# - never ever touch $tp, "thread pointer", former $gp [o32 can be
|
|
# excluded from the rule, because it's specified volatile];
|
|
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
|
|
# old code];
|
|
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
|
|
#
|
|
# For reference here is register layout for N32/64 MIPS ABIs:
|
|
#
|
|
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
|
|
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
|
|
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
|
|
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
|
|
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
|
|
#
|
|
# <appro@openssl.org>
|
|
#
|
|
######################################################################
|
|
|
|
# Target ABI flavour, taken from the first command-line argument.  Only
# flavours matching /64|n32/ (e.g. n32, 64, nubi64) pass the check below,
# so default to "64": a default of "o32" could never pass it and made an
# argument-less run die unconditionally.
$flavour = shift || "64";

die "MIPS64 only" unless ($flavour =~ /64|n32/i);

# Register that receives the function return value: $a0 for NUBI flavours,
# $t0 otherwise.
$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;
# Bitmap emitted in the .mask directive of poly1305_blocks_internal:
# bits 12-17 for NUBI flavours, bits 16-17 otherwise.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";

# Argument registers used by the generated functions.
($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
# Registers for the two loaded 64-bit input words and for scratch values.
($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);
|
|
|
|
# Assembly preamble plus poly1305_init, appended to $code.
#
# The preamble defines MSB/LSB, the byte offsets used by the ldl/ldr
# load pairs, according to whether MIPSEB is defined.
#
# The generated poly1305_init stores zero to the three 64-bit words at
# 0/8/16($ctx).  When $inp is non-zero it then loads two 64-bit key words
# (byte-swapping them when MIPSEB is defined), masks them with
# 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc respectively, and stores
# r0 at 24($ctx), r1 at 32($ctx) and s1 = r1 + (r1 >> 2) at 40($ctx).
# It always returns 0 in $v0.
$code.=<<___;
|
|
#include "mips_arch.h"
|
|
|
|
#ifdef MIPSEB
|
|
# define MSB 0
|
|
# define LSB 7
|
|
#else
|
|
# define MSB 7
|
|
# define LSB 0
|
|
#endif
|
|
|
|
.text
|
|
.set noat
|
|
.set noreorder
|
|
|
|
.align 5
|
|
.globl poly1305_init
|
|
.ent poly1305_init
|
|
poly1305_init:
|
|
.frame $sp,0,$ra
|
|
.set reorder
|
|
|
|
sd $zero,0($ctx)
|
|
sd $zero,8($ctx)
|
|
sd $zero,16($ctx)
|
|
|
|
beqz $inp,.Lno_key
|
|
|
|
#if defined(_MIPS_ARCH_MIPS64R6)
|
|
ld $in0,0($inp)
|
|
ld $in1,8($inp)
|
|
#else
|
|
ldl $in0,0+MSB($inp)
|
|
ldl $in1,8+MSB($inp)
|
|
ldr $in0,0+LSB($inp)
|
|
ldr $in1,8+LSB($inp)
|
|
#endif
|
|
#ifdef MIPSEB
|
|
# if defined(_MIPS_ARCH_MIPS64R2)
|
|
dsbh $in0,$in0 # byte swap
|
|
dsbh $in1,$in1
|
|
dshd $in0,$in0
|
|
dshd $in1,$in1
|
|
# else
|
|
ori $tmp0,$zero,0xFF
|
|
dsll $tmp2,$tmp0,32
|
|
or $tmp0,$tmp2 # 0x000000FF000000FF
|
|
|
|
and $tmp1,$in0,$tmp0 # byte swap
|
|
and $tmp3,$in1,$tmp0
|
|
dsrl $tmp2,$in0,24
|
|
dsrl $tmp4,$in1,24
|
|
dsll $tmp1,24
|
|
dsll $tmp3,24
|
|
and $tmp2,$tmp0
|
|
and $tmp4,$tmp0
|
|
dsll $tmp0,8 # 0x0000FF000000FF00
|
|
or $tmp1,$tmp2
|
|
or $tmp3,$tmp4
|
|
and $tmp2,$in0,$tmp0
|
|
and $tmp4,$in1,$tmp0
|
|
dsrl $in0,8
|
|
dsrl $in1,8
|
|
dsll $tmp2,8
|
|
dsll $tmp4,8
|
|
and $in0,$tmp0
|
|
and $in1,$tmp0
|
|
or $tmp1,$tmp2
|
|
or $tmp3,$tmp4
|
|
or $in0,$tmp1
|
|
or $in1,$tmp3
|
|
dsrl $tmp1,$in0,32
|
|
dsrl $tmp3,$in1,32
|
|
dsll $in0,32
|
|
dsll $in1,32
|
|
or $in0,$tmp1
|
|
or $in1,$tmp3
|
|
# endif
|
|
#endif
|
|
li $tmp0,1
|
|
dsll $tmp0,32
|
|
daddiu $tmp0,-63
|
|
dsll $tmp0,28
|
|
daddiu $tmp0,-1 # 0ffffffc0fffffff
|
|
|
|
and $in0,$tmp0
|
|
daddiu $tmp0,-3 # 0ffffffc0ffffffc
|
|
and $in1,$tmp0
|
|
|
|
sd $in0,24($ctx)
|
|
dsrl $tmp0,$in1,2
|
|
sd $in1,32($ctx)
|
|
daddu $tmp0,$in1 # s1 = r1 + (r1 >> 2)
|
|
sd $tmp0,40($ctx)
|
|
|
|
.Lno_key:
|
|
li $v0,0 # return 0
|
|
jr $ra
|
|
.end poly1305_init
|
|
___
|
|
# poly1305_blocks and poly1305_blocks_internal.
#
# Arguments arrive in $ctx, $inp, $len and $padbit.  The public entry point
# turns $len into a count of complete 16-byte blocks and returns at once
# when that count is zero; otherwise it branches to
# poly1305_blocks_internal.  The internal routine allocates a 48-byte
# frame, saves the callee-saved registers it uses, loads the hash value
# (0/8/16($ctx)) and key material (24/32/40($ctx)), processes one block per
# loop iteration, stores the updated hash value and restores the saved
# registers.
{
# Register roles inside this scope: $h0-$h2 hold the hash value, $r0/$r1
# the key words, $rs1 the value poly1305_init stored at 40($ctx)
# (r1 + (r1 >> 2)), and $d0-$d2 the running product sums ($d0/$d1 reuse
# the input registers $in0/$in1).
#
# The key multiple is deliberately named $rs1 and not $s1: a lexical $s1
# would shadow the register name $s1 ($13) for the rest of this scope, and
# the NUBI prologue/epilogue below would then save/restore $17 twice while
# leaving $13 (used as $h1) unsaved.
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);

# Entry stub and the part of the prologue common to every flavour.
$code.=<<___;
.align 5
.globl poly1305_blocks
.ent poly1305_blocks
poly1305_blocks:
.set noreorder
dsrl $len,4 # number of complete blocks
bnez $len,poly1305_blocks_internal
nop
jr $ra
nop
.end poly1305_blocks

.align 5
.ent poly1305_blocks_internal
poly1305_blocks_internal:
.frame $sp,6*8,$ra
.mask $SAVED_REGS_MASK,-8
.set noreorder
dsubu $sp,6*8
sd $s5,40($sp)
sd $s4,32($sp)
___
# NUBI flavours additionally save $s0-$s3; other flavours skip these stores.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
sd $s3,24($sp)
sd $s2,16($sp)
sd $s1,8($sp)
sd $s0,0($sp)
___
# Main loop: load a block (byte-swapped when MIPSEB is defined), add it and
# $padbit to the hash value, multiply by the key, and reduce.
$code.=<<___;
.set reorder

ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)

ld $r0,24($ctx) # load key
ld $r1,32($ctx)
ld $rs1,40($ctx)

.Loop:
#if defined(_MIPS_ARCH_MIPS64R6)
ld $in0,0($inp) # load input
ld $in1,8($inp)
#else
ldl $in0,0+MSB($inp) # load input
ldl $in1,8+MSB($inp)
ldr $in0,0+LSB($inp)
ldr $in1,8+LSB($inp)
#endif
daddiu $len,-1
daddiu $inp,16
#ifdef MIPSEB
# if defined(_MIPS_ARCH_MIPS64R2)
dsbh $in0,$in0 # byte swap
dsbh $in1,$in1
dshd $in0,$in0
dshd $in1,$in1
# else
ori $tmp0,$zero,0xFF
dsll $tmp2,$tmp0,32
or $tmp0,$tmp2 # 0x000000FF000000FF

and $tmp1,$in0,$tmp0 # byte swap
and $tmp3,$in1,$tmp0
dsrl $tmp2,$in0,24
dsrl $tmp4,$in1,24
dsll $tmp1,24
dsll $tmp3,24
and $tmp2,$tmp0
and $tmp4,$tmp0
dsll $tmp0,8 # 0x0000FF000000FF00
or $tmp1,$tmp2
or $tmp3,$tmp4
and $tmp2,$in0,$tmp0
and $tmp4,$in1,$tmp0
dsrl $in0,8
dsrl $in1,8
dsll $tmp2,8
dsll $tmp4,8
and $in0,$tmp0
and $in1,$tmp0
or $tmp1,$tmp2
or $tmp3,$tmp4
or $in0,$tmp1
or $in1,$tmp3
dsrl $tmp1,$in0,32
dsrl $tmp3,$in1,32
dsll $in0,32
dsll $in1,32
or $in0,$tmp1
or $in1,$tmp3
# endif
#endif
daddu $h0,$in0 # accumulate input
daddu $h1,$in1
sltu $tmp0,$h0,$in0
sltu $tmp1,$h1,$in1
daddu $h1,$tmp0

dmultu ($r0,$h0) # h0*r0
daddu $h2,$padbit
sltu $tmp0,$h1,$tmp0
mflo ($d0,$r0,$h0)
mfhi ($d1,$r0,$h0)

dmultu ($rs1,$h1) # h1*5*r1
daddu $tmp0,$tmp1
daddu $h2,$tmp0
mflo ($tmp0,$rs1,$h1)
mfhi ($tmp1,$rs1,$h1)

dmultu ($r1,$h0) # h0*r1
daddu $d0,$tmp0
daddu $d1,$tmp1
mflo ($tmp2,$r1,$h0)
mfhi ($d2,$r1,$h0)
sltu $tmp0,$d0,$tmp0
daddu $d1,$tmp0

dmultu ($r0,$h1) # h1*r0
daddu $d1,$tmp2
sltu $tmp2,$d1,$tmp2
mflo ($tmp0,$r0,$h1)
mfhi ($tmp1,$r0,$h1)
daddu $d2,$tmp2

dmultu ($rs1,$h2) # h2*5*r1
daddu $d1,$tmp0
daddu $d2,$tmp1
mflo ($tmp2,$rs1,$h2)

dmultu ($r0,$h2) # h2*r0
sltu $tmp0,$d1,$tmp0
daddu $d2,$tmp0
mflo ($tmp3,$r0,$h2)

daddu $d1,$tmp2
daddu $d2,$tmp3
sltu $tmp2,$d1,$tmp2
daddu $d2,$tmp2

li $tmp0,-4 # final reduction
and $tmp0,$d2
dsrl $tmp1,$d2,2
andi $h2,$d2,3
daddu $tmp0,$tmp1
daddu $h0,$d0,$tmp0
sltu $tmp0,$h0,$tmp0
daddu $h1,$d1,$tmp0
sltu $tmp0,$h1,$tmp0
daddu $h2,$h2,$tmp0

bnez $len,.Loop

sd $h0,0($ctx) # store hash value
sd $h1,8($ctx)
sd $h2,16($ctx)

.set noreorder
ld $s5,40($sp) # epilogue
ld $s4,32($sp)
___
# NUBI flavours restore the additional registers saved in the prologue.
$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi epilogue
ld $s3,24($sp)
ld $s2,16($sp)
ld $s1,8($sp)
ld $s0,0($sp)
___
# Return; the frame is released by the instruction following the jump.
$code.=<<___;
jr $ra
daddu $sp,6*8
.end poly1305_blocks_internal
___
}
|
|
# poly1305_emit: appended to $code together with the trailing .rdata
# identification string.
{
|
|
# Argument registers of poly1305_emit: the context, the buffer the 16 MAC
# bytes are stored to, and the nonce that is read as four 32-bit words.
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);
|
|
|
|
# The generated code loads the three hash words from 0/8/16($ctx), computes
# hash+5 with carry propagation, and uses the bits above bit 1 of the top
# word to build an all-ones/all-zeros mask that selects either hash+5 or
# the original hash for the low 128 bits.  It then adds the 128-bit nonce
# and stores the result to $mac one byte at a time, least significant byte
# first.
$code.=<<___;
|
|
.align 5
|
|
.globl poly1305_emit
|
|
.ent poly1305_emit
|
|
poly1305_emit:
|
|
.frame $sp,0,$ra
|
|
.set reorder
|
|
|
|
ld $tmp0,0($ctx)
|
|
ld $tmp1,8($ctx)
|
|
ld $tmp2,16($ctx)
|
|
|
|
daddiu $in0,$tmp0,5 # compare to modulus
|
|
sltiu $tmp3,$in0,5
|
|
daddu $in1,$tmp1,$tmp3
|
|
sltu $tmp3,$in1,$tmp3
|
|
daddu $tmp2,$tmp2,$tmp3
|
|
|
|
dsrl $tmp2,2 # see if it carried/borrowed
|
|
dsubu $tmp2,$zero,$tmp2
|
|
nor $tmp3,$zero,$tmp2
|
|
|
|
and $in0,$tmp2
|
|
and $tmp0,$tmp3
|
|
and $in1,$tmp2
|
|
and $tmp1,$tmp3
|
|
or $in0,$tmp0
|
|
or $in1,$tmp1
|
|
|
|
lwu $tmp0,0($nonce) # load nonce
|
|
lwu $tmp1,4($nonce)
|
|
lwu $tmp2,8($nonce)
|
|
lwu $tmp3,12($nonce)
|
|
dsll $tmp1,32
|
|
dsll $tmp3,32
|
|
or $tmp0,$tmp1
|
|
or $tmp2,$tmp3
|
|
|
|
daddu $in0,$tmp0 # accumulate nonce
|
|
daddu $in1,$tmp2
|
|
sltu $tmp0,$in0,$tmp0
|
|
daddu $in1,$tmp0
|
|
|
|
dsrl $tmp0,$in0,8 # write mac value
|
|
dsrl $tmp1,$in0,16
|
|
dsrl $tmp2,$in0,24
|
|
sb $in0,0($mac)
|
|
dsrl $tmp3,$in0,32
|
|
sb $tmp0,1($mac)
|
|
dsrl $tmp0,$in0,40
|
|
sb $tmp1,2($mac)
|
|
dsrl $tmp1,$in0,48
|
|
sb $tmp2,3($mac)
|
|
dsrl $tmp2,$in0,56
|
|
sb $tmp3,4($mac)
|
|
dsrl $tmp3,$in1,8
|
|
sb $tmp0,5($mac)
|
|
dsrl $tmp0,$in1,16
|
|
sb $tmp1,6($mac)
|
|
dsrl $tmp1,$in1,24
|
|
sb $tmp2,7($mac)
|
|
|
|
sb $in1,8($mac)
|
|
dsrl $tmp2,$in1,32
|
|
sb $tmp3,9($mac)
|
|
dsrl $tmp3,$in1,40
|
|
sb $tmp0,10($mac)
|
|
dsrl $tmp0,$in1,48
|
|
sb $tmp1,11($mac)
|
|
dsrl $tmp1,$in1,56
|
|
sb $tmp2,12($mac)
|
|
sb $tmp3,13($mac)
|
|
sb $tmp0,14($mac)
|
|
sb $tmp1,15($mac)
|
|
|
|
jr $ra
|
|
.end poly1305_emit
|
|
.rdata
|
|
.asciiz "Poly1305 for MIPS64, CRYPTOGAMS by <appro\@openssl.org>"
|
|
.align 2
|
|
___
|
|
}
|
|
|
|
# The optional output file name is the last command-line argument; without
# one the generated code is written to the inherited STDOUT.
$output = pop;
if ($output) {
    # Three-argument open keeps characters in the file name from being
    # interpreted as part of the open mode; fail loudly instead of silently
    # printing to the wrong handle.
    open STDOUT, '>', $output or die "can't open $output: $!";
}

print $code;

# Buffered write errors only surface at close, so the result is checked.
close STDOUT or die "error closing STDOUT: $!";
|
|
|