openssl/crypto/ec/asm/ecp_nistz256-ppc64.pl
Andy Polyakov 6f553edba8 ec/asm/ecp_nistz256-ppc64.pl: minor POWER8-specific optimization.
Up to 4% depending on benchmark.

Reviewed-by: Rich Salz <rsalz@openssl.org>
2017-01-21 22:28:34 +01:00

2375 lines
51 KiB
Raku
Executable file

#! /usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
# with/without -DECP_NISTZ256_ASM
# POWER7 +260-530%
# POWER8 +220-340%
# Command line: the first argument is the perlasm "flavour"; following
# arguments are skipped until one looks like a file name with an
# extension (or the argument list is exhausted), which becomes $output.
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Locate the ppc-xlate.pl translator, first next to this script and then
# in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything written to STDOUT from here on is piped through the
# translator. Fail loudly if the child cannot be started instead of
# silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register used as the stack pointer in all generated code.
my $sp="r1";
{
# Symbolic register names shared by the routines generated below.
# map() assigns r3..r12 to the first ten names and r22..r31 to the
# remaining ten, i.e.
#   $rp,$ap,$bp,$bi        = r3,r4,r5,r6
#   $acc0..$acc3           = r7..r10
#   $poly1,$poly3          = r11,r12
#   $acc4,$acc5            = r22,r23
#   $a0..$a3               = r24..r27
#   $t0..$t3               = r28..r31
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
$acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
map("r$_",(3..12,22..31));
# $acc6/$acc7 are aliases of $bp/$bi (r5/r6), so a routine that uses them
# overwrites whatever those two registers held.
my ($acc6,$acc7)=($bp,$bi); # used in __ecp_nistz256_sqr_mont
# Start of the generated assembly: directives that precede everything.
$code.=<<___;
.machine "any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table source is looked up in the current directory first and then
# one level above the directory this script lives in.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,'<',"ecp_nistz256_table.c" or
open TABLE,'<',"${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
use integer;
# Every TOBN(hi,lo) occurrence contributes two 32-bit words to @arr,
# low word first. The file is scanned line by line and left unmodified.
while (my $line = <TABLE>) {
    while ($line =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
        push @arr,hex($2),hex($1);
    }
}
close TABLE;
# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
# Label and alignment for the precomputed table, followed by the table
# itself as .byte rows generated by the loop below.
$code.=<<___;
.type ecp_nistz256_precomputed,\@object
.globl ecp_nistz256_precomputed
.align 12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
# 1111222233334444
# 1234123412341234
#
# @arr is consumed in 37 chunks of 64*16 32-bit words (64 points of 16
# words each). For every chunk 64 rows are emitted; row $i holds byte
# number $i of each of the 64 points, i.e. byte ($i & 3) of word
# ($i >> 2) of point $j.
for(1..37) {
    @tbl = splice(@arr,0,64*16);
    for($i=0;$i<64;$i++) {
        @line = ();
        for($j=0;$j<64;$j++) {
            # scalar element access; shift/mask keep the index
            # integral regardless of the "use integer" pragma
            push @line,($tbl[$j*16+($i>>2)]>>(($i&3)*8))&0xff;
        }
        $code.=".byte\t";
        $code.=join(',',map { sprintf "0x%02x",$_} @line);
        $code.="\n";
    }
}
# Close the table object and emit the public field-arithmetic entry
# points. Each entry point is a thin wrapper that builds a stack frame,
# preloads registers and calls the matching __ecp_nistz256_* routine.
$code.=<<___;
.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz "ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
# void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
# const BN_ULONG x2[4]);
#
# Wrapper around __ecp_nistz256_mul_mont: allocates a 128-byte frame,
# saves r22-r31, preloads a[0-3] and b[0], and sets up the two modulus
# constants the inner routine uses. The link register is kept in r0
# across the call rather than being stored in the frame.
.globl ecp_nistz256_mul_mont
.align 5
ecp_nistz256_mul_mont:
stdu $sp,-128($sp)
mflr r0
std r22,48($sp)
std r23,56($sp)
std r24,64($sp)
std r25,72($sp)
std r26,80($sp)
std r27,88($sp)
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $a0,0($ap)
ld $bi,0($bp)
ld $a1,8($ap)
ld $a2,16($ap)
ld $a3,24($ap)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_mul_mont
mtlr r0
ld r22,48($sp)
ld r23,56($sp)
ld r24,64($sp)
ld r25,72($sp)
ld r26,80($sp)
ld r27,88($sp)
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,10,3,0
.long 0
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
# void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
#
# Wrapper around __ecp_nistz256_sqr_mont: same frame and register
# save/restore as ecp_nistz256_mul_mont, but only a[0-3] is preloaded.
.globl ecp_nistz256_sqr_mont
.align 4
ecp_nistz256_sqr_mont:
stdu $sp,-128($sp)
mflr r0
std r22,48($sp)
std r23,56($sp)
std r24,64($sp)
std r25,72($sp)
std r26,80($sp)
std r27,88($sp)
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $a0,0($ap)
ld $a1,8($ap)
ld $a2,16($ap)
ld $a3,24($ap)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_sqr_mont
mtlr r0
ld r22,48($sp)
ld r23,56($sp)
ld r24,64($sp)
ld r25,72($sp)
ld r26,80($sp)
ld r27,88($sp)
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,10,2,0
.long 0
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
# void ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
# const BN_ULONG x2[4]);
#
# Wrapper around __ecp_nistz256_add: loads the first operand into
# $acc0-$acc3 and the second into $t0-$t3. Only r28-r31 are saved,
# they are the only non-volatile registers written on this path.
.globl ecp_nistz256_add
.align 4
ecp_nistz256_add:
stdu $sp,-128($sp)
mflr r0
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $acc0,0($ap)
ld $t0, 0($bp)
ld $acc1,8($ap)
ld $t1, 8($bp)
ld $acc2,16($ap)
ld $t2, 16($bp)
ld $acc3,24($ap)
ld $t3, 24($bp)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_add
mtlr r0
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,4,3,0
.long 0
.size ecp_nistz256_add,.-ecp_nistz256_add
# void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
#
# Wrapper around __ecp_nistz256_div_by_2: loads the operand into
# $acc0-$acc3 and sets up the modulus constants before the call.
.globl ecp_nistz256_div_by_2
.align 4
ecp_nistz256_div_by_2:
stdu $sp,-128($sp)
mflr r0
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $acc0,0($ap)
ld $acc1,8($ap)
ld $acc2,16($ap)
ld $acc3,24($ap)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_div_by_2
mtlr r0
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,4,2,0
.long 0
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
# void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
#
# Doubling is done as a modular addition of the operand to itself:
# the input is loaded into $acc0-$acc3, copied to $t0-$t3 and passed
# to __ecp_nistz256_add.
.globl ecp_nistz256_mul_by_2
.align 4
ecp_nistz256_mul_by_2:
stdu $sp,-128($sp)
mflr r0
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $acc0,0($ap)
ld $acc1,8($ap)
ld $acc2,16($ap)
ld $acc3,24($ap)
mr $t0,$acc0
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_add # ret = a+a // 2*a
mtlr r0
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,4,3,0
.long 0
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
# void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
#
# Tripling is two modular additions: a+a, then (a+a)+a. The first call
# to __ecp_nistz256_add overwrites $t0-$t3, so a copy of the input is
# parked at 64..88 of the stack frame and reloaded before the second
# call. The running result stays in $acc0-$acc3 between the calls.
.globl ecp_nistz256_mul_by_3
.align 4
ecp_nistz256_mul_by_3:
stdu $sp,-128($sp)
mflr r0
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $acc0,0($ap)
ld $acc1,8($ap)
ld $acc2,16($ap)
ld $acc3,24($ap)
mr $t0,$acc0
std $acc0,64($sp)
mr $t1,$acc1
std $acc1,72($sp)
mr $t2,$acc2
std $acc2,80($sp)
mr $t3,$acc3
std $acc3,88($sp)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_add # ret = a+a // 2*a
ld $t0,64($sp)
ld $t1,72($sp)
ld $t2,80($sp)
ld $t3,88($sp)
bl __ecp_nistz256_add # ret += a // 2*a+a=3*a
mtlr r0
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,4,2,0
.long 0
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
# void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
# const BN_ULONG x2[4]);
#
# Wrapper around __ecp_nistz256_sub_from: only the minuend is loaded
# here (into $acc0-$acc3); the inner routine loads the subtrahend
# through the second-operand pointer itself.
.globl ecp_nistz256_sub
.align 4
ecp_nistz256_sub:
stdu $sp,-128($sp)
mflr r0
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
ld $acc0,0($ap)
ld $acc1,8($ap)
ld $acc2,16($ap)
ld $acc3,24($ap)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_sub_from
mtlr r0
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,4,3,0
.long 0
.size ecp_nistz256_sub,.-ecp_nistz256_sub
# void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
#
# Negation is computed as 0 - a: the accumulator is zeroed, the input
# pointer is copied to the second-operand pointer register and
# __ecp_nistz256_sub_from does the subtraction.
.globl ecp_nistz256_neg
.align 4
ecp_nistz256_neg:
stdu $sp,-128($sp)
mflr r0
std r28,96($sp)
std r29,104($sp)
std r30,112($sp)
std r31,120($sp)
mr $bp,$ap
li $acc0,0
li $acc1,0
li $acc2,0
li $acc3,0
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
bl __ecp_nistz256_sub_from
mtlr r0
ld r28,96($sp)
ld r29,104($sp)
ld r30,112($sp)
ld r31,120($sp)
addi $sp,$sp,128
blr
.long 0
.byte 0,12,4,0,0x80,4,2,0
.long 0
.size ecp_nistz256_neg,.-ecp_nistz256_neg
# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
#
# Remaining words of b are loaded through $bp as the computation
# proceeds. The result is left in $acc0-$acc3 and also stored through
# $rp. $poly1 and $poly3 must hold the modulus constants set up by the
# callers.
.type __ecp_nistz256_mul_mont,\@function
.align 4
__ecp_nistz256_mul_mont:
mulld $acc0,$a0,$bi # a[0]*b[0]
mulhdu $t0,$a0,$bi
mulld $acc1,$a1,$bi # a[1]*b[0]
mulhdu $t1,$a1,$bi
mulld $acc2,$a2,$bi # a[2]*b[0]
mulhdu $t2,$a2,$bi
mulld $acc3,$a3,$bi # a[3]*b[0]
mulhdu $t3,$a3,$bi
ld $bi,8($bp) # b[1]
addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
sldi $t0,$acc0,32
adde $acc2,$acc2,$t1
srdi $t1,$acc0,32
adde $acc3,$acc3,$t2
addze $acc4,$t3
li $acc5,0
___
# One pass per remaining word b[1..3]: each pass first reduces the
# accumulator by one word and then adds a[0-3]*b[i] to it.
for($i=1;$i<4;$i++) {
################################################################
# Reduction iteration is normally performed by accumulating
# result of multiplication of modulus by "magic" digit [and
# omitting least significant word, which is guaranteed to
# be 0], but thanks to special form of modulus and "magic"
# digit being equal to least significant word, it can be
# performed with additions and subtractions alone. Indeed:
#
# ffff0001.00000000.0000ffff.ffffffff
# * abcdefgh
# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
#
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
# rewrite above as:
#
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
#
# or marking redundant operations:
#
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
# - 0000abcd.efgh0000.--------.--------.--------
$code.=<<___;
subfc $t2,$t0,$acc0 # "*0xffff0001"
subfe $t3,$t1,$acc0
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
adde $acc1,$acc2,$t1
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
adde $acc3,$acc4,$t3
addze $acc4,$acc5
mulld $t0,$a0,$bi # lo(a[0]*b[i])
mulld $t1,$a1,$bi # lo(a[1]*b[i])
mulld $t2,$a2,$bi # lo(a[2]*b[i])
mulld $t3,$a3,$bi # lo(a[3]*b[i])
addc $acc0,$acc0,$t0 # accumulate low parts of multiplication
mulhdu $t0,$a0,$bi # hi(a[0]*b[i])
adde $acc1,$acc1,$t1
mulhdu $t1,$a1,$bi # hi(a[1]*b[i])
adde $acc2,$acc2,$t2
mulhdu $t2,$a2,$bi # hi(a[2]*b[i])
adde $acc3,$acc3,$t3
mulhdu $t3,$a3,$bi # hi(a[3]*b[i])
addze $acc4,$acc4
___
# The next word of b is fetched only while there is one left (i.e. not
# on the last pass).
$code.=<<___ if ($i<3);
ld $bi,8*($i+1)($bp) # b[$i+1]
___
# The shifted copies of acc[0] computed here feed the reduction step of
# the next pass (or the final reduction after the loop).
$code.=<<___;
addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
sldi $t0,$acc0,32
adde $acc2,$acc2,$t1
srdi $t1,$acc0,32
adde $acc3,$acc3,$t2
adde $acc4,$acc4,$t3
li $acc5,0
addze $acc5,$acc5
___
}
# Final reduction step, followed by a subtraction of the modulus that is
# undone again (via a mask derived from the borrow) when it underflows.
$code.=<<___;
# last reduction
subfc $t2,$t0,$acc0 # "*0xffff0001"
subfe $t3,$t1,$acc0
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
adde $acc1,$acc2,$t1
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
adde $acc3,$acc4,$t3
addze $acc4,$acc5
li $t2,0
addic $acc0,$acc0,1 # ret -= modulus
subfe $acc1,$poly1,$acc1
subfe $acc2,$t2,$acc2
subfe $acc3,$poly3,$acc3
subfe $acc4,$t2,$acc4
addc $acc0,$acc0,$acc4 # ret += modulus if borrow
and $t1,$poly1,$acc4
and $t3,$poly3,$acc4
adde $acc1,$acc1,$t1
addze $acc2,$acc2
adde $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
blr
.long 0
.byte 0,12,0x14,0,0,0,1,0
.long 0
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
#
# The input registers are overwritten with the high halves of the
# a[i]*a[i] products, and the two registers aliased as acc6/acc7 (the
# second-operand pointer and b-word registers) are used as extra
# accumulator words. The result is left in $acc0-$acc3 and also stored
# through $rp.
.type __ecp_nistz256_sqr_mont,\@function
.align 4
__ecp_nistz256_sqr_mont:
################################################################
# | | | | | |a1*a0| |
# | | | | |a2*a0| | |
# | |a3*a2|a3*a0| | | |
# | | | |a2*a1| | | |
# | | |a3*a1| | | | |
# *| | | | | | | | 2|
# +|a3*a3|a2*a2|a1*a1|a0*a0|
# |--+--+--+--+--+--+--+--|
# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
#
# "can't overflow" below mark carrying into high part of
# multiplication result, which can't overflow, because it
# can never be all ones.
mulld $acc1,$a1,$a0 # a[1]*a[0]
mulhdu $t1,$a1,$a0
mulld $acc2,$a2,$a0 # a[2]*a[0]
mulhdu $t2,$a2,$a0
mulld $acc3,$a3,$a0 # a[3]*a[0]
mulhdu $acc4,$a3,$a0
addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
mulld $t0,$a2,$a1 # a[2]*a[1]
mulhdu $t1,$a2,$a1
adde $acc3,$acc3,$t2
mulld $t2,$a3,$a1 # a[3]*a[1]
mulhdu $t3,$a3,$a1
addze $acc4,$acc4 # can't overflow
mulld $acc5,$a3,$a2 # a[3]*a[2]
mulhdu $acc6,$a3,$a2
addc $t1,$t1,$t2 # accumulate high parts of multiplication
addze $t2,$t3 # can't overflow
addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
adde $acc4,$acc4,$t1
adde $acc5,$acc5,$t2
addze $acc6,$acc6 # can't overflow
addc $acc1,$acc1,$acc1 # acc[1-6]*=2
adde $acc2,$acc2,$acc2
adde $acc3,$acc3,$acc3
adde $acc4,$acc4,$acc4
adde $acc5,$acc5,$acc5
adde $acc6,$acc6,$acc6
li $acc7,0
addze $acc7,$acc7
mulld $acc0,$a0,$a0 # a[0]*a[0]
mulhdu $a0,$a0,$a0
mulld $t1,$a1,$a1 # a[1]*a[1]
mulhdu $a1,$a1,$a1
mulld $t2,$a2,$a2 # a[2]*a[2]
mulhdu $a2,$a2,$a2
mulld $t3,$a3,$a3 # a[3]*a[3]
mulhdu $a3,$a3,$a3
addc $acc1,$acc1,$a0 # +a[i]*a[i]
sldi $t0,$acc0,32
adde $acc2,$acc2,$t1
srdi $t1,$acc0,32
adde $acc3,$acc3,$a1
adde $acc4,$acc4,$t2
adde $acc5,$acc5,$a2
adde $acc6,$acc6,$t3
adde $acc7,$acc7,$a3
___
# Three of the four reduction steps are generated here; each one also
# prepares the shifted copies of the new acc[0] for the following step.
for($i=0;$i<3;$i++) { # reductions, see commentary in
# multiplication for details
$code.=<<___;
subfc $t2,$t0,$acc0 # "*0xffff0001"
subfe $t3,$t1,$acc0
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
sldi $t0,$acc0,32
adde $acc1,$acc2,$t1
srdi $t1,$acc0,32
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
addze $acc3,$t3 # can't overflow
___
}
# Fourth reduction step, addition of the upper half of the square, and
# the final subtract-modulus / add-back-on-borrow correction.
$code.=<<___;
subfc $t2,$t0,$acc0 # "*0xffff0001"
subfe $t3,$t1,$acc0
addc $acc0,$acc1,$t0 # +=acc[0]<<96 and omit acc[0]
adde $acc1,$acc2,$t1
adde $acc2,$acc3,$t2 # +=acc[0]*0xffff0001
addze $acc3,$t3 # can't overflow
addc $acc0,$acc0,$acc4 # accumulate upper half
adde $acc1,$acc1,$acc5
adde $acc2,$acc2,$acc6
adde $acc3,$acc3,$acc7
li $t2,0
addze $acc4,$t2
addic $acc0,$acc0,1 # ret -= modulus
subfe $acc1,$poly1,$acc1
subfe $acc2,$t2,$acc2
subfe $acc3,$poly3,$acc3
subfe $acc4,$t2,$acc4
addc $acc0,$acc0,$acc4 # ret += modulus if borrow
and $t1,$poly1,$acc4
and $t3,$poly3,$acc4
adde $acc1,$acc1,$t1
addze $acc2,$acc2
adde $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
blr
.long 0
.byte 0,12,0x14,0,0,0,1,0
.long 0
.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont
# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
#
# The sum is left in $acc0-$acc3 and also stored through $rp; $t0-$t3
# are overwritten.
.type __ecp_nistz256_add,\@function
.align 4
__ecp_nistz256_add:
addc $acc0,$acc0,$t0 # ret = a+b
adde $acc1,$acc1,$t1
adde $acc2,$acc2,$t2
li $t2,0
adde $acc3,$acc3,$t3
addze $t0,$t2 # carry out of the 256-bit sum
# if a+b >= modulus, subtract modulus
#
# But since comparison implies subtraction, we subtract
# modulus and then add it back if subtraction borrowed.
subic $acc0,$acc0,-1
subfe $acc1,$poly1,$acc1
subfe $acc2,$t2,$acc2
subfe $acc3,$poly3,$acc3
subfe $t0,$t2,$t0 # becomes the add-back mask
addc $acc0,$acc0,$t0
and $t1,$poly1,$t0
and $t3,$poly3,$t0
adde $acc1,$acc1,$t1
addze $acc2,$acc2
adde $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size __ecp_nistz256_add,.-__ecp_nistz256_add
# __ecp_nistz256_sub_from computes a-b, where a is pre-loaded to
# $acc0-$acc3 and b is loaded here through $bp. The difference is left
# in $acc0-$acc3 and also stored through $rp; $t0-$t3 are overwritten.
.type __ecp_nistz256_sub_from,\@function
.align 4
__ecp_nistz256_sub_from:
ld $t0,0($bp)
ld $t1,8($bp)
ld $t2,16($bp)
ld $t3,24($bp)
subfc $acc0,$t0,$acc0 # ret = a-b
subfe $acc1,$t1,$acc1
subfe $acc2,$t2,$acc2
subfe $acc3,$t3,$acc3
subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
# if a-b borrowed, add modulus
addc $acc0,$acc0,$t0 # ret += modulus & t0
and $t1,$poly1,$t0
and $t3,$poly3,$t0
adde $acc1,$acc1,$t1
addze $acc2,$acc2
adde $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
# __ecp_nistz256_sub_morf is __ecp_nistz256_sub_from with the operands
# reversed: it computes b-a, where a is pre-loaded to $acc0-$acc3 and b
# is loaded here through $bp. The difference is left in $acc0-$acc3 and
# also stored through $rp; $t0-$t3 are overwritten.
.type __ecp_nistz256_sub_morf,\@function
.align 4
__ecp_nistz256_sub_morf:
ld $t0,0($bp)
ld $t1,8($bp)
ld $t2,16($bp)
ld $t3,24($bp)
subfc $acc0,$acc0,$t0 # ret = b-a
subfe $acc1,$acc1,$t1
subfe $acc2,$acc2,$t2
subfe $acc3,$acc3,$t3
subfe $t0,$t0,$t0 # t0 = borrow ? -1 : 0
# if b-a borrowed, add modulus
addc $acc0,$acc0,$t0 # ret += modulus & t0
and $t1,$poly1,$t0
and $t3,$poly3,$t0
adde $acc1,$acc1,$t1
addze $acc2,$acc2
adde $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
# __ecp_nistz256_div_by_2 expects its input pre-loaded to $acc0-$acc3.
# The modulus is added unconditionally and subtracted again under a
# mask when the input was even, so the value that is shifted right by
# one bit is always even. The register that normally holds the input
# pointer is overwritten: it carries the 257th bit of the intermediate
# sum. The result is left in $acc0-$acc3 and also stored through $rp.
.type __ecp_nistz256_div_by_2,\@function
.align 4
__ecp_nistz256_div_by_2:
andi. $t0,$acc0,1
addic $acc0,$acc0,-1 # a += modulus
neg $t0,$t0
adde $acc1,$acc1,$poly1
not $t0,$t0 # t0 = a was even ? -1 : 0
addze $acc2,$acc2
li $t2,0
adde $acc3,$acc3,$poly3
and $t1,$poly1,$t0
addze $ap,$t2 # ap = carry
and $t3,$poly3,$t0
subfc $acc0,$t0,$acc0 # a -= modulus if a was even
subfe $acc1,$t1,$acc1
subfe $acc2,$t2,$acc2
subfe $acc3,$t3,$acc3
subfe $ap, $t2,$ap
srdi $acc0,$acc0,1 # 257-bit shift right by one
sldi $t0,$acc1,63
srdi $acc1,$acc1,1
sldi $t1,$acc2,63
srdi $acc2,$acc2,1
sldi $t2,$acc3,63
srdi $acc3,$acc3,1
sldi $t3,$ap,63
or $acc0,$acc0,$t0
or $acc1,$acc1,$t1
or $acc2,$acc2,$t2
or $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
blr
.long 0
.byte 0,12,0x14,0,0,0,1,0
.long 0
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
# The point is read as three consecutive 32-byte coordinates at offsets
# 0, 32 and 64 (referred to as in_x, in_y and in_z in the comments) and
# the result is written with the same layout. The trailing comment on
# each call names the C-level operation it corresponds to.
if (1) {
# Frame: 64 bytes at the bottom, four 32-byte temporaries, then twelve
# 8-byte slots for r20-r31 (288 bytes in total).
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
# The incoming result/input pointers are copied to r20/r21 because the
# pointer registers themselves are re-pointed before every inner call.
my ($rp_real,$ap_real) = map("r$_",(20,21));
$code.=<<___;
.globl ecp_nistz256_point_double
.align 5
ecp_nistz256_point_double:
stdu $sp,-$FRAME($sp)
mflr r0
std r20,$FRAME-8*12($sp)
std r21,$FRAME-8*11($sp)
std r22,$FRAME-8*10($sp)
std r23,$FRAME-8*9($sp)
std r24,$FRAME-8*8($sp)
std r25,$FRAME-8*7($sp)
std r26,$FRAME-8*6($sp)
std r27,$FRAME-8*5($sp)
std r28,$FRAME-8*4($sp)
std r29,$FRAME-8*3($sp)
std r30,$FRAME-8*2($sp)
std r31,$FRAME-8*1($sp)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
# ecp_nistz256_point_add branches to the label below once it has
# rearranged its own frame; everything above is skipped in that case.
.Ldouble_shortcut:
ld $acc0,32($ap)
ld $acc1,40($ap)
ld $acc2,48($ap)
ld $acc3,56($ap)
mr $t0,$acc0
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
ld $a0,64($ap) # forward load for p256_sqr_mont
ld $a1,72($ap)
ld $a2,80($ap)
ld $a3,88($ap)
mr $rp_real,$rp
mr $ap_real,$ap
addi $rp,$sp,$S
bl __ecp_nistz256_add # p256_mul_by_2(S, in_y);
addi $rp,$sp,$Zsqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Zsqr, in_z);
ld $t0,0($ap_real)
ld $t1,8($ap_real)
ld $t2,16($ap_real)
ld $t3,24($ap_real)
mr $a0,$acc0 # put Zsqr aside for p256_sub
mr $a1,$acc1
mr $a2,$acc2
mr $a3,$acc3
addi $rp,$sp,$M
bl __ecp_nistz256_add # p256_add(M, Zsqr, in_x);
addi $bp,$ap_real,0
mr $acc0,$a0 # restore Zsqr
mr $acc1,$a1
mr $acc2,$a2
mr $acc3,$a3
ld $a0,$S+0($sp) # forward load for p256_sqr_mont
ld $a1,$S+8($sp)
ld $a2,$S+16($sp)
ld $a3,$S+24($sp)
addi $rp,$sp,$Zsqr
bl __ecp_nistz256_sub_morf # p256_sub(Zsqr, in_x, Zsqr);
addi $rp,$sp,$S
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(S, S);
ld $bi,32($ap_real)
ld $a0,64($ap_real)
ld $a1,72($ap_real)
ld $a2,80($ap_real)
ld $a3,88($ap_real)
addi $bp,$ap_real,32
addi $rp,$sp,$tmp0
bl __ecp_nistz256_mul_mont # p256_mul_mont(tmp0, in_z, in_y);
mr $t0,$acc0
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
ld $a0,$S+0($sp) # forward load for p256_sqr_mont
ld $a1,$S+8($sp)
ld $a2,$S+16($sp)
ld $a3,$S+24($sp)
addi $rp,$rp_real,64
bl __ecp_nistz256_add # p256_mul_by_2(res_z, tmp0);
addi $rp,$sp,$tmp0
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(tmp0, S);
ld $bi,$Zsqr($sp) # forward load for p256_mul_mont
ld $a0,$M+0($sp)
ld $a1,$M+8($sp)
ld $a2,$M+16($sp)
ld $a3,$M+24($sp)
addi $rp,$rp_real,32
bl __ecp_nistz256_div_by_2 # p256_div_by_2(res_y, tmp0);
addi $bp,$sp,$Zsqr
addi $rp,$sp,$M
bl __ecp_nistz256_mul_mont # p256_mul_mont(M, M, Zsqr);
mr $t0,$acc0 # duplicate M
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
mr $a0,$acc0 # put M aside
mr $a1,$acc1
mr $a2,$acc2
mr $a3,$acc3
addi $rp,$sp,$M
bl __ecp_nistz256_add
mr $t0,$a0 # restore M
mr $t1,$a1
mr $t2,$a2
mr $t3,$a3
ld $bi,0($ap_real) # forward load for p256_mul_mont
ld $a0,$S+0($sp)
ld $a1,$S+8($sp)
ld $a2,$S+16($sp)
ld $a3,$S+24($sp)
bl __ecp_nistz256_add # p256_mul_by_3(M, M);
addi $bp,$ap_real,0
addi $rp,$sp,$S
bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, in_x);
mr $t0,$acc0
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
ld $a0,$M+0($sp) # forward load for p256_sqr_mont
ld $a1,$M+8($sp)
ld $a2,$M+16($sp)
ld $a3,$M+24($sp)
addi $rp,$sp,$tmp0
bl __ecp_nistz256_add # p256_mul_by_2(tmp0, S);
addi $rp,$rp_real,0
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(res_x, M);
addi $bp,$sp,$tmp0
bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, tmp0);
addi $bp,$sp,$S
addi $rp,$sp,$S
bl __ecp_nistz256_sub_morf # p256_sub(S, S, res_x);
ld $bi,$M($sp)
mr $a0,$acc0 # copy S
mr $a1,$acc1
mr $a2,$acc2
mr $a3,$acc3
addi $bp,$sp,$M
bl __ecp_nistz256_mul_mont # p256_mul_mont(S, S, M);
addi $bp,$rp_real,32
addi $rp,$rp_real,32
bl __ecp_nistz256_sub_from # p256_sub(res_y, S, res_y);
mtlr r0
ld r20,$FRAME-8*12($sp)
ld r21,$FRAME-8*11($sp)
ld r22,$FRAME-8*10($sp)
ld r23,$FRAME-8*9($sp)
ld r24,$FRAME-8*8($sp)
ld r25,$FRAME-8*7($sp)
ld r26,$FRAME-8*6($sp)
ld r27,$FRAME-8*5($sp)
ld r28,$FRAME-8*4($sp)
ld r29,$FRAME-8*3($sp)
ld r30,$FRAME-8*2($sp)
ld r31,$FRAME-8*1($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,0,0x80,12,2,0
.long 0
.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}
########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
# const P256_POINT *in2);
#
# Both inputs are read as three consecutive 32-byte coordinates at
# offsets 0, 32 and 64; the result is written with the same layout.
# The trailing comment on each call names the C-level operation it
# corresponds to.
if (1) {
# Frame: 64 bytes at the bottom, twelve 32-byte temporaries, then
# sixteen 8-byte slots for r16-r31.
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
$H,$Hsqr,$R,$Rsqr,$Hcub,
$U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
# Z1sqr/Z2sqr share stack slots with Hsqr/Rsqr.
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
# r16-r18 keep the three incoming pointers, r19/r20 hold masks that are
# all-ones when the corresponding input has a non-zero z coordinate and
# zero otherwise, r21 keeps the OR of the words of S2-S1.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
$code.=<<___;
.globl ecp_nistz256_point_add
.align 5
ecp_nistz256_point_add:
stdu $sp,-$FRAME($sp)
mflr r0
std r16,$FRAME-8*16($sp)
std r17,$FRAME-8*15($sp)
std r18,$FRAME-8*14($sp)
std r19,$FRAME-8*13($sp)
std r20,$FRAME-8*12($sp)
std r21,$FRAME-8*11($sp)
std r22,$FRAME-8*10($sp)
std r23,$FRAME-8*9($sp)
std r24,$FRAME-8*8($sp)
std r25,$FRAME-8*7($sp)
std r26,$FRAME-8*6($sp)
std r27,$FRAME-8*5($sp)
std r28,$FRAME-8*4($sp)
std r29,$FRAME-8*3($sp)
std r30,$FRAME-8*2($sp)
std r31,$FRAME-8*1($sp)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
ld $a0,64($bp) # in2_z
ld $a1,72($bp)
ld $a2,80($bp)
ld $a3,88($bp)
mr $rp_real,$rp
mr $ap_real,$ap
mr $bp_real,$bp
or $t0,$a0,$a1
or $t2,$a2,$a3
or $in2infty,$t0,$t2
neg $t0,$in2infty
or $in2infty,$in2infty,$t0
sradi $in2infty,$in2infty,63 # !in2infty
addi $rp,$sp,$Z2sqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z2sqr, in2_z);
ld $a0,64($ap_real) # in1_z
ld $a1,72($ap_real)
ld $a2,80($ap_real)
ld $a3,88($ap_real)
or $t0,$a0,$a1
or $t2,$a2,$a3
or $in1infty,$t0,$t2
neg $t0,$in1infty
or $in1infty,$in1infty,$t0
sradi $in1infty,$in1infty,63 # !in1infty
addi $rp,$sp,$Z1sqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
ld $bi,64($bp_real)
ld $a0,$Z2sqr+0($sp)
ld $a1,$Z2sqr+8($sp)
ld $a2,$Z2sqr+16($sp)
ld $a3,$Z2sqr+24($sp)
addi $bp,$bp_real,64
addi $rp,$sp,$S1
bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, Z2sqr, in2_z);
ld $bi,64($ap_real)
ld $a0,$Z1sqr+0($sp)
ld $a1,$Z1sqr+8($sp)
ld $a2,$Z1sqr+16($sp)
ld $a3,$Z1sqr+24($sp)
addi $bp,$ap_real,64
addi $rp,$sp,$S2
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
ld $bi,32($ap_real)
ld $a0,$S1+0($sp)
ld $a1,$S1+8($sp)
ld $a2,$S1+16($sp)
ld $a3,$S1+24($sp)
addi $bp,$ap_real,32
addi $rp,$sp,$S1
bl __ecp_nistz256_mul_mont # p256_mul_mont(S1, S1, in1_y);
ld $bi,32($bp_real)
ld $a0,$S2+0($sp)
ld $a1,$S2+8($sp)
ld $a2,$S2+16($sp)
ld $a3,$S2+24($sp)
addi $bp,$bp_real,32
addi $rp,$sp,$S2
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
addi $bp,$sp,$S1
ld $bi,$Z2sqr($sp) # forward load for p256_mul_mont
ld $a0,0($ap_real)
ld $a1,8($ap_real)
ld $a2,16($ap_real)
ld $a3,24($ap_real)
addi $rp,$sp,$R
bl __ecp_nistz256_sub_from # p256_sub(R, S2, S1);
or $acc0,$acc0,$acc1 # see if result is zero
or $acc2,$acc2,$acc3
or $temp,$acc0,$acc2
addi $bp,$sp,$Z2sqr
addi $rp,$sp,$U1
bl __ecp_nistz256_mul_mont # p256_mul_mont(U1, in1_x, Z2sqr);
ld $bi,$Z1sqr($sp)
ld $a0,0($bp_real)
ld $a1,8($bp_real)
ld $a2,16($bp_real)
ld $a3,24($bp_real)
addi $bp,$sp,$Z1sqr
addi $rp,$sp,$U2
bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in2_x, Z1sqr);
addi $bp,$sp,$U1
ld $a0,$R+0($sp) # forward load for p256_sqr_mont
ld $a1,$R+8($sp)
ld $a2,$R+16($sp)
ld $a3,$R+24($sp)
addi $rp,$sp,$H
bl __ecp_nistz256_sub_from # p256_sub(H, U2, U1);
or $acc0,$acc0,$acc1 # see if result is zero
or $acc2,$acc2,$acc3
or. $acc0,$acc0,$acc2
bne .Ladd_proceed # is_equal(U1,U2)?
and. $t0,$in1infty,$in2infty
beq .Ladd_proceed # (in1infty || in2infty)?
cmpldi $temp,0
beq .Ladd_double # is_equal(S1,S2)?
# U1 equals U2, neither z is zero and S1 differs from S2:
# the whole 96-byte result is set to zero.
xor $a0,$a0,$a0
std $a0,0($rp_real)
std $a0,8($rp_real)
std $a0,16($rp_real)
std $a0,24($rp_real)
std $a0,32($rp_real)
std $a0,40($rp_real)
std $a0,48($rp_real)
std $a0,56($rp_real)
std $a0,64($rp_real)
std $a0,72($rp_real)
std $a0,80($rp_real)
std $a0,88($rp_real)
b .Ladd_done
.align 4
.Ladd_double:
# Both inputs are the same point: hand over to the doubling code.
# r16-r19 are restored here because the doubling epilogue only
# restores r20-r31, and the frame is shrunk to the 288 bytes that
# epilogue releases, with the back-link stored at its new bottom.
ld $bp,0($sp) # back-link
mr $ap,$ap_real
mr $rp,$rp_real
ld r16,$FRAME-8*16($sp)
ld r17,$FRAME-8*15($sp)
ld r18,$FRAME-8*14($sp)
ld r19,$FRAME-8*13($sp)
stdu $bp,$FRAME-288($sp) # difference in stack frame sizes
b .Ldouble_shortcut
.align 4
.Ladd_proceed:
addi $rp,$sp,$Rsqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
ld $bi,64($ap_real)
ld $a0,$H+0($sp)
ld $a1,$H+8($sp)
ld $a2,$H+16($sp)
ld $a3,$H+24($sp)
addi $bp,$ap_real,64
addi $rp,$sp,$res_z
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
ld $a0,$H+0($sp)
ld $a1,$H+8($sp)
ld $a2,$H+16($sp)
ld $a3,$H+24($sp)
addi $rp,$sp,$Hsqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
ld $bi,64($bp_real)
ld $a0,$res_z+0($sp)
ld $a1,$res_z+8($sp)
ld $a2,$res_z+16($sp)
ld $a3,$res_z+24($sp)
addi $bp,$bp_real,64
addi $rp,$sp,$res_z
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, res_z, in2_z);
ld $bi,$H($sp)
ld $a0,$Hsqr+0($sp)
ld $a1,$Hsqr+8($sp)
ld $a2,$Hsqr+16($sp)
ld $a3,$Hsqr+24($sp)
addi $bp,$sp,$H
addi $rp,$sp,$Hcub
bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
ld $bi,$Hsqr($sp)
ld $a0,$U1+0($sp)
ld $a1,$U1+8($sp)
ld $a2,$U1+16($sp)
ld $a3,$U1+24($sp)
addi $bp,$sp,$Hsqr
addi $rp,$sp,$U2
bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, U1, Hsqr);
mr $t0,$acc0
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
addi $rp,$sp,$Hsqr
bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
addi $bp,$sp,$Rsqr
addi $rp,$sp,$res_x
bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
addi $bp,$sp,$Hcub
bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
addi $bp,$sp,$U2
ld $bi,$Hcub($sp) # forward load for p256_mul_mont
ld $a0,$S1+0($sp)
ld $a1,$S1+8($sp)
ld $a2,$S1+16($sp)
ld $a3,$S1+24($sp)
addi $rp,$sp,$res_y
bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
addi $bp,$sp,$Hcub
addi $rp,$sp,$S2
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S1, Hcub);
ld $bi,$R($sp)
ld $a0,$res_y+0($sp)
ld $a1,$res_y+8($sp)
ld $a2,$res_y+16($sp)
ld $a3,$res_y+24($sp)
addi $bp,$sp,$R
addi $rp,$sp,$res_y
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
addi $bp,$sp,$S2
bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
ld $t0,0($bp_real) # in2
ld $t1,8($bp_real)
ld $t2,16($bp_real)
ld $t3,24($bp_real)
ld $a0,$res_x+0($sp) # res
ld $a1,$res_x+8($sp)
ld $a2,$res_x+16($sp)
ld $a3,$res_x+24($sp)
___
# Branch-free selection of the output, one 32-byte coordinate per step:
# the computed result is kept when both masks are all-ones, in2 is taken
# when the in1 mask is zero, and in1 is taken when the in2 mask is zero.
# The loop handles x and y and preloads the operands for the next
# coordinate; z is handled by the copy after the loop, which relies on
# $i being 64 at that point.
for($i=0;$i<64;$i+=32) { # conditional moves
$code.=<<___;
ld $acc0,$i+0($ap_real) # in1
ld $acc1,$i+8($ap_real)
ld $acc2,$i+16($ap_real)
ld $acc3,$i+24($ap_real)
andc $t0,$t0,$in1infty
andc $t1,$t1,$in1infty
andc $t2,$t2,$in1infty
andc $t3,$t3,$in1infty
and $a0,$a0,$in1infty
and $a1,$a1,$in1infty
and $a2,$a2,$in1infty
and $a3,$a3,$in1infty
or $t0,$t0,$a0
or $t1,$t1,$a1
or $t2,$t2,$a2
or $t3,$t3,$a3
andc $acc0,$acc0,$in2infty
andc $acc1,$acc1,$in2infty
andc $acc2,$acc2,$in2infty
andc $acc3,$acc3,$in2infty
and $t0,$t0,$in2infty
and $t1,$t1,$in2infty
and $t2,$t2,$in2infty
and $t3,$t3,$in2infty
or $acc0,$acc0,$t0
or $acc1,$acc1,$t1
or $acc2,$acc2,$t2
or $acc3,$acc3,$t3
ld $t0,$i+32($bp_real) # in2
ld $t1,$i+40($bp_real)
ld $t2,$i+48($bp_real)
ld $t3,$i+56($bp_real)
ld $a0,$res_x+$i+32($sp)
ld $a1,$res_x+$i+40($sp)
ld $a2,$res_x+$i+48($sp)
ld $a3,$res_x+$i+56($sp)
std $acc0,$i+0($rp_real)
std $acc1,$i+8($rp_real)
std $acc2,$i+16($rp_real)
std $acc3,$i+24($rp_real)
___
}
# Last coordinate (z), followed by the common epilogue.
$code.=<<___;
ld $acc0,$i+0($ap_real) # in1
ld $acc1,$i+8($ap_real)
ld $acc2,$i+16($ap_real)
ld $acc3,$i+24($ap_real)
andc $t0,$t0,$in1infty
andc $t1,$t1,$in1infty
andc $t2,$t2,$in1infty
andc $t3,$t3,$in1infty
and $a0,$a0,$in1infty
and $a1,$a1,$in1infty
and $a2,$a2,$in1infty
and $a3,$a3,$in1infty
or $t0,$t0,$a0
or $t1,$t1,$a1
or $t2,$t2,$a2
or $t3,$t3,$a3
andc $acc0,$acc0,$in2infty
andc $acc1,$acc1,$in2infty
andc $acc2,$acc2,$in2infty
andc $acc3,$acc3,$in2infty
and $t0,$t0,$in2infty
and $t1,$t1,$in2infty
and $t2,$t2,$in2infty
and $t3,$t3,$in2infty
or $acc0,$acc0,$t0
or $acc1,$acc1,$t1
or $acc2,$acc2,$t2
or $acc3,$acc3,$t3
std $acc0,$i+0($rp_real)
std $acc1,$i+8($rp_real)
std $acc2,$i+16($rp_real)
std $acc3,$i+24($rp_real)
.Ladd_done:
mtlr r0
ld r16,$FRAME-8*16($sp)
ld r17,$FRAME-8*15($sp)
ld r18,$FRAME-8*14($sp)
ld r19,$FRAME-8*13($sp)
ld r20,$FRAME-8*12($sp)
ld r21,$FRAME-8*11($sp)
ld r22,$FRAME-8*10($sp)
ld r23,$FRAME-8*9($sp)
ld r24,$FRAME-8*8($sp)
ld r25,$FRAME-8*7($sp)
ld r26,$FRAME-8*6($sp)
ld r27,$FRAME-8*5($sp)
ld r28,$FRAME-8*4($sp)
ld r29,$FRAME-8*3($sp)
ld r30,$FRAME-8*2($sp)
ld r31,$FRAME-8*1($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,0,0x80,16,3,0
.long 0
.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}
########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
# const P256_POINT_AFFINE *in2);
#
# Generator for the ecp_nistz256_point_add_affine routine. The three
# pointer arguments are copied to $rp_real/$ap_real/$bp_real (r16-r18)
# on entry because $rp and $bp are re-pointed before every call to the
# __ecp_nistz256_* helpers. The link register is parked in r0 (mflr at
# entry, mtlr at exit) and is never written to the frame.
# NOTE(review): that only works if the helpers leave r0 untouched; they
# are defined outside this section - confirm there.
#
# The order of field operations is given by the p256_*() remark next to
# each "bl" in the emitted code.
if (1) {
# Frame, from $sp upwards: 64 bytes, ten 32-byte temporaries (offsets
# produced by the map() below), then 16*8 bytes holding saved r16-r31.
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
$U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
# Z1sqr shares the S2 slot: the S2 product is computed from the Z1sqr
# limbs already loaded into registers, after which Z1sqr is not read
# from the frame again.
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
# NOTE(review): $temp (r21) is not referenced anywhere in this block.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));
# Prologue, mask computation and the field arithmetic.
# $in1infty/$in2infty are built as x|-x shifted arithmetically right by
# 63, where x is the OR of all limbs of in1_z (resp. of in2_x and
# in2_y). They are therefore all-ones when the input is NOT all-zero and
# zero otherwise, which is what the "!in1infty"/"!in2infty" remarks mean.
# The heredoc ends with in2_x in $t0-$t3 and res_x in $a0-$a3, ready for
# the first selection pass below.
$code.=<<___;
.globl ecp_nistz256_point_add_affine
.align 5
ecp_nistz256_point_add_affine:
stdu $sp,-$FRAME($sp)
mflr r0
std r16,$FRAME-8*16($sp)
std r17,$FRAME-8*15($sp)
std r18,$FRAME-8*14($sp)
std r19,$FRAME-8*13($sp)
std r20,$FRAME-8*12($sp)
std r21,$FRAME-8*11($sp)
std r22,$FRAME-8*10($sp)
std r23,$FRAME-8*9($sp)
std r24,$FRAME-8*8($sp)
std r25,$FRAME-8*7($sp)
std r26,$FRAME-8*6($sp)
std r27,$FRAME-8*5($sp)
std r28,$FRAME-8*4($sp)
std r29,$FRAME-8*3($sp)
std r30,$FRAME-8*2($sp)
std r31,$FRAME-8*1($sp)
li $poly1,-1
srdi $poly1,$poly1,32 # 0x00000000ffffffff
li $poly3,1
orc $poly3,$poly3,$poly1 # 0xffffffff00000001
mr $rp_real,$rp
mr $ap_real,$ap
mr $bp_real,$bp
ld $a0,64($ap) # in1_z
ld $a1,72($ap)
ld $a2,80($ap)
ld $a3,88($ap)
or $t0,$a0,$a1
or $t2,$a2,$a3
or $in1infty,$t0,$t2
neg $t0,$in1infty
or $in1infty,$in1infty,$t0
sradi $in1infty,$in1infty,63 # !in1infty
ld $acc0,0($bp) # in2_x
ld $acc1,8($bp)
ld $acc2,16($bp)
ld $acc3,24($bp)
ld $t0,32($bp) # in2_y
ld $t1,40($bp)
ld $t2,48($bp)
ld $t3,56($bp)
or $acc0,$acc0,$acc1
or $acc2,$acc2,$acc3
or $acc0,$acc0,$acc2
or $t0,$t0,$t1
or $t2,$t2,$t3
or $t0,$t0,$t2
or $in2infty,$acc0,$t0
neg $t0,$in2infty
or $in2infty,$in2infty,$t0
sradi $in2infty,$in2infty,63 # !in2infty
addi $rp,$sp,$Z1sqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Z1sqr, in1_z);
mr $a0,$acc0
mr $a1,$acc1
mr $a2,$acc2
mr $a3,$acc3
ld $bi,0($bp_real)
addi $bp,$bp_real,0
addi $rp,$sp,$U2
bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, Z1sqr, in2_x);
addi $bp,$ap_real,0
ld $bi,64($ap_real) # forward load for p256_mul_mont
ld $a0,$Z1sqr+0($sp)
ld $a1,$Z1sqr+8($sp)
ld $a2,$Z1sqr+16($sp)
ld $a3,$Z1sqr+24($sp)
addi $rp,$sp,$H
bl __ecp_nistz256_sub_from # p256_sub(H, U2, in1_x);
addi $bp,$ap_real,64
addi $rp,$sp,$S2
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, Z1sqr, in1_z);
ld $bi,64($ap_real)
ld $a0,$H+0($sp)
ld $a1,$H+8($sp)
ld $a2,$H+16($sp)
ld $a3,$H+24($sp)
addi $bp,$ap_real,64
addi $rp,$sp,$res_z
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_z, H, in1_z);
ld $bi,32($bp_real)
ld $a0,$S2+0($sp)
ld $a1,$S2+8($sp)
ld $a2,$S2+16($sp)
ld $a3,$S2+24($sp)
addi $bp,$bp_real,32
addi $rp,$sp,$S2
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, S2, in2_y);
addi $bp,$ap_real,32
ld $a0,$H+0($sp) # forward load for p256_sqr_mont
ld $a1,$H+8($sp)
ld $a2,$H+16($sp)
ld $a3,$H+24($sp)
addi $rp,$sp,$R
bl __ecp_nistz256_sub_from # p256_sub(R, S2, in1_y);
addi $rp,$sp,$Hsqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Hsqr, H);
ld $a0,$R+0($sp)
ld $a1,$R+8($sp)
ld $a2,$R+16($sp)
ld $a3,$R+24($sp)
addi $rp,$sp,$Rsqr
bl __ecp_nistz256_sqr_mont # p256_sqr_mont(Rsqr, R);
ld $bi,$H($sp)
ld $a0,$Hsqr+0($sp)
ld $a1,$Hsqr+8($sp)
ld $a2,$Hsqr+16($sp)
ld $a3,$Hsqr+24($sp)
addi $bp,$sp,$H
addi $rp,$sp,$Hcub
bl __ecp_nistz256_mul_mont # p256_mul_mont(Hcub, Hsqr, H);
ld $bi,0($ap_real)
ld $a0,$Hsqr+0($sp)
ld $a1,$Hsqr+8($sp)
ld $a2,$Hsqr+16($sp)
ld $a3,$Hsqr+24($sp)
addi $bp,$ap_real,0
addi $rp,$sp,$U2
bl __ecp_nistz256_mul_mont # p256_mul_mont(U2, in1_x, Hsqr);
mr $t0,$acc0
mr $t1,$acc1
mr $t2,$acc2
mr $t3,$acc3
addi $rp,$sp,$Hsqr
bl __ecp_nistz256_add # p256_mul_by_2(Hsqr, U2);
addi $bp,$sp,$Rsqr
addi $rp,$sp,$res_x
bl __ecp_nistz256_sub_morf # p256_sub(res_x, Rsqr, Hsqr);
addi $bp,$sp,$Hcub
bl __ecp_nistz256_sub_from # p256_sub(res_x, res_x, Hcub);
addi $bp,$sp,$U2
ld $bi,32($ap_real) # forward load for p256_mul_mont
ld $a0,$Hcub+0($sp)
ld $a1,$Hcub+8($sp)
ld $a2,$Hcub+16($sp)
ld $a3,$Hcub+24($sp)
addi $rp,$sp,$res_y
bl __ecp_nistz256_sub_morf # p256_sub(res_y, U2, res_x);
addi $bp,$ap_real,32
addi $rp,$sp,$S2
bl __ecp_nistz256_mul_mont # p256_mul_mont(S2, in1_y, Hcub);
ld $bi,$R($sp)
ld $a0,$res_y+0($sp)
ld $a1,$res_y+8($sp)
ld $a2,$res_y+16($sp)
ld $a3,$res_y+24($sp)
addi $bp,$sp,$R
addi $rp,$sp,$res_y
bl __ecp_nistz256_mul_mont # p256_mul_mont(res_y, res_y, R);
addi $bp,$sp,$S2
bl __ecp_nistz256_sub_from # p256_sub(res_y, res_y, S2);
ld $t0,0($bp_real) # in2
ld $t1,8($bp_real)
ld $t2,16($bp_real)
ld $t3,24($bp_real)
ld $a0,$res_x+0($sp) # res
ld $a1,$res_x+8($sp)
ld $a2,$res_x+16($sp)
ld $a3,$res_x+24($sp)
___
# Branch-free selection of the output, 32 bytes per pass, using the two
# masks computed above:
#   t   = in1infty-mask ? res : in2
#   out = in2infty-mask ? t   : in1
# The loop emits the passes for offset 0 (X) and offset 32 (Y). Each
# pass also preloads the in2 and res operands of the NEXT pass before
# storing its own result to $rp_real.
for($i=0;$i<64;$i+=32) { # conditional moves
$code.=<<___;
ld $acc0,$i+0($ap_real) # in1
ld $acc1,$i+8($ap_real)
ld $acc2,$i+16($ap_real)
ld $acc3,$i+24($ap_real)
andc $t0,$t0,$in1infty
andc $t1,$t1,$in1infty
andc $t2,$t2,$in1infty
andc $t3,$t3,$in1infty
and $a0,$a0,$in1infty
and $a1,$a1,$in1infty
and $a2,$a2,$in1infty
and $a3,$a3,$in1infty
or $t0,$t0,$a0
or $t1,$t1,$a1
or $t2,$t2,$a2
or $t3,$t3,$a3
andc $acc0,$acc0,$in2infty
andc $acc1,$acc1,$in2infty
andc $acc2,$acc2,$in2infty
andc $acc3,$acc3,$in2infty
and $t0,$t0,$in2infty
and $t1,$t1,$in2infty
and $t2,$t2,$in2infty
and $t3,$t3,$in2infty
or $acc0,$acc0,$t0
or $acc1,$acc1,$t1
or $acc2,$acc2,$t2
or $acc3,$acc3,$t3
___
# After the X pass: preload in2_y for the Y pass.
$code.=<<___ if ($i==0);
ld $t0,32($bp_real) # in2
ld $t1,40($bp_real)
ld $t2,48($bp_real)
ld $t3,56($bp_real)
___
# After the Y pass: in2 is only ever read at offsets 0..56 in this
# routine, so for the Z pass the "in2" operand is the constant built
# here from 1, ~poly1, -1 and ~poly3 (labelled Lone_mont).
$code.=<<___ if ($i==32);
li $t0,1 # Lone_mont
not $t1,$poly1
li $t2,-1
not $t3,$poly3
___
# Preload the next 32 bytes of res, then store this pass's selection.
$code.=<<___;
ld $a0,$res_x+$i+32($sp)
ld $a1,$res_x+$i+40($sp)
ld $a2,$res_x+$i+48($sp)
ld $a3,$res_x+$i+56($sp)
std $acc0,$i+0($rp_real)
std $acc1,$i+8($rp_real)
std $acc2,$i+16($rp_real)
std $acc3,$i+24($rp_real)
___
}
# $i is 64 once the loop has finished, so this last pass selects and
# stores Z (offsets 64..88) with nothing further to preload. It is
# followed by the epilogue: restore LR from r0, reload r16-r31 and pop
# the frame.
$code.=<<___;
ld $acc0,$i+0($ap_real) # in1
ld $acc1,$i+8($ap_real)
ld $acc2,$i+16($ap_real)
ld $acc3,$i+24($ap_real)
andc $t0,$t0,$in1infty
andc $t1,$t1,$in1infty
andc $t2,$t2,$in1infty
andc $t3,$t3,$in1infty
and $a0,$a0,$in1infty
and $a1,$a1,$in1infty
and $a2,$a2,$in1infty
and $a3,$a3,$in1infty
or $t0,$t0,$a0
or $t1,$t1,$a1
or $t2,$t2,$a2
or $t3,$t3,$a3
andc $acc0,$acc0,$in2infty
andc $acc1,$acc1,$in2infty
andc $acc2,$acc2,$in2infty
andc $acc3,$acc3,$in2infty
and $t0,$t0,$in2infty
and $t1,$t1,$in2infty
and $t2,$t2,$in2infty
and $t3,$t3,$in2infty
or $acc0,$acc0,$t0
or $acc1,$acc1,$t1
or $acc2,$acc2,$t2
or $acc3,$acc3,$t3
std $acc0,$i+0($rp_real)
std $acc1,$i+8($rp_real)
std $acc2,$i+16($rp_real)
std $acc3,$i+24($rp_real)
mtlr r0
ld r16,$FRAME-8*16($sp)
ld r17,$FRAME-8*15($sp)
ld r18,$FRAME-8*14($sp)
ld r19,$FRAME-8*13($sp)
ld r20,$FRAME-8*12($sp)
ld r21,$FRAME-8*11($sp)
ld r22,$FRAME-8*10($sp)
ld r23,$FRAME-8*9($sp)
ld r24,$FRAME-8*8($sp)
ld r25,$FRAME-8*7($sp)
ld r26,$FRAME-8*6($sp)
ld r27,$FRAME-8*5($sp)
ld r28,$FRAME-8*4($sp)
ld r29,$FRAME-8*3($sp)
ld r30,$FRAME-8*2($sp)
ld r31,$FRAME-8*1($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,0,0x80,16,3,0
.long 0
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
# Generator for ecp_nistz256_ord_mul_mont and ecp_nistz256_ord_sqr_mont:
# multiplication and repeated squaring of four-limb (4x64-bit) values
# with reduction by the modulus held in $ord3..$ord0, i.e.
#   ffffffff00000000.ffffffffffffffff.bce6faada7179e84.f3b9cac2fc632551
# NOTE(review): presumably the P-256 group order, as the "ord" naming
# suggests; the value itself is not labelled in this section.
#
# Both routines are leaf functions: they open a 160-byte frame, save
# r18-r31 at offsets 48..152 and never touch the link register.
#
# Every reduction step follows the same pattern: the multiplier
# $t4 = low64($acc0*$ordk) is formed, $t4*modulus is added to the
# accumulator and the accumulator moves down by one limb. The two low
# modulus limbs are handled with mulld/mulhdu; the two high limbs
# (all-ones and ffffffff00000000) are handled with adds plus a shifted
# subtraction, as explained in the diagram inside the first loop.
# "addic $t0,$acc0,-1" only produces a carry (set iff $acc0 is non-zero);
# its register result is thrown away. NOTE(review): this presumably
# stands in for the carry out of the cancelled lowest limb.
if (1) {
# Extra registers: $ordk is the per-step reduction multiplier constant,
# $ord0/$ord1 the two low modulus limbs, $t4 the per-step multiplier.
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
# The two high modulus limbs reuse the $poly1/$poly3 registers; $zr is
# r0 and is loaded with 0 by both routines.
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");
# ord_mul_mont, part 1: prologue, constant set-up, the a[0..3]*b[0] row
# and the first multiplier.
$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
# uint64_t b[4]);
.globl ecp_nistz256_ord_mul_mont
.align 5
ecp_nistz256_ord_mul_mont:
stdu $sp,-160($sp)
std r18,48($sp)
std r19,56($sp)
std r20,64($sp)
std r21,72($sp)
std r22,80($sp)
std r23,88($sp)
std r24,96($sp)
std r25,104($sp)
std r26,112($sp)
std r27,120($sp)
std r28,128($sp)
std r29,136($sp)
std r30,144($sp)
std r31,152($sp)
ld $a0,0($ap)
ld $bi,0($bp)
ld $a1,8($ap)
ld $a2,16($ap)
ld $a3,24($ap)
lis $ordk,0xccd1
lis $ord0,0xf3b9
lis $ord1,0xbce6
ori $ordk,$ordk,0xc8aa
ori $ord0,$ord0,0xcac2
ori $ord1,$ord1,0xfaad
sldi $ordk,$ordk,32
sldi $ord0,$ord0,32
sldi $ord1,$ord1,32
oris $ordk,$ordk,0xee00
oris $ord0,$ord0,0xfc63
oris $ord1,$ord1,0xa717
ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
li $ord2,-1 # 0xffffffffffffffff
sldi $ord3,$ord2,32 # 0xffffffff00000000
li $zr,0
mulld $acc0,$a0,$bi # a[0]*b[0]
mulhdu $t0,$a0,$bi
mulld $acc1,$a1,$bi # a[1]*b[0]
mulhdu $t1,$a1,$bi
mulld $acc2,$a2,$bi # a[2]*b[0]
mulhdu $t2,$a2,$bi
mulld $acc3,$a3,$bi # a[3]*b[0]
mulhdu $acc4,$a3,$bi
mulld $t4,$acc0,$ordk
addc $acc1,$acc1,$t0 # accumulate high parts of multiplication
adde $acc2,$acc2,$t1
adde $acc3,$acc3,$t2
addze $acc4,$acc4
li $acc5,0
___
# ord_mul_mont, part 2: one pass per remaining word b[1..3]. Each pass
# reduces the row accumulated so far and, interleaved with that, adds
# the a[0..3]*b[i] row and computes the multiplier for the next step.
for ($i=1;$i<4;$i++) {
################################################################
# ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
# * abcdefgh
# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
#
# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
# rewrite above as:
#
# xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
ld $bi,8*$i($bp) # b[i]
sldi $t0,$t4,32
subfc $acc2,$t4,$acc2
srdi $t1,$t4,32
subfe $acc3,$t0,$acc3
subfe $acc4,$t1,$acc4
subfe $acc5,$zr,$acc5
addic $t0,$acc0,-1 # discarded
mulhdu $t1,$ord0,$t4
mulld $t2,$ord1,$t4
mulhdu $t3,$ord1,$t4
adde $t2,$t2,$t1
mulld $t0,$a0,$bi
addze $t3,$t3
mulld $t1,$a1,$bi
addc $acc0,$acc1,$t2
mulld $t2,$a2,$bi
adde $acc1,$acc2,$t3
mulld $t3,$a3,$bi
adde $acc2,$acc3,$t4
adde $acc3,$acc4,$t4
addze $acc4,$acc5
addc $acc0,$acc0,$t0 # accumulate low parts
mulhdu $t0,$a0,$bi
adde $acc1,$acc1,$t1
mulhdu $t1,$a1,$bi
adde $acc2,$acc2,$t2
mulhdu $t2,$a2,$bi
adde $acc3,$acc3,$t3
mulhdu $t3,$a3,$bi
addze $acc4,$acc4
mulld $t4,$acc0,$ordk
addc $acc1,$acc1,$t0 # accumulate high parts
adde $acc2,$acc2,$t1
adde $acc3,$acc3,$t2
adde $acc4,$acc4,$t3
addze $acc5,$zr
___
}
# ord_mul_mont, part 3: the last reduction, then the final correction
# (subtract the modulus, add it back under the mask left in $acc4) and
# the epilogue.
#
# The same heredoc continues with the first part of ord_sqr_mont. The
# third argument (rep) arrives in $bp and is moved to CTR right away,
# which frees $bp/$bi for use as $acc6/$acc7 inside the loop. Each trip
# through .Loop_ord_sqr squares $a0-$a3 and leaves the reduced result in
# $a0-$a3 again; bdnz repeats until CTR reaches zero.
# NOTE(review): bdnz decrements before testing, so rep is assumed to be
# at least 1 - confirm with the callers.
$code.=<<___;
sldi $t0,$t4,32 # last reduction
subfc $acc2,$t4,$acc2
srdi $t1,$t4,32
subfe $acc3,$t0,$acc3
subfe $acc4,$t1,$acc4
subfe $acc5,$zr,$acc5
addic $t0,$acc0,-1 # discarded
mulhdu $t1,$ord0,$t4
mulld $t2,$ord1,$t4
mulhdu $t3,$ord1,$t4
adde $t2,$t2,$t1
addze $t3,$t3
addc $acc0,$acc1,$t2
adde $acc1,$acc2,$t3
adde $acc2,$acc3,$t4
adde $acc3,$acc4,$t4
addze $acc4,$acc5
subfc $acc0,$ord0,$acc0 # ret -= modulus
subfe $acc1,$ord1,$acc1
subfe $acc2,$ord2,$acc2
subfe $acc3,$ord3,$acc3
subfe $acc4,$zr,$acc4
and $t0,$ord0,$acc4
and $t1,$ord1,$acc4
addc $acc0,$acc0,$t0 # ret += modulus if borrow
and $t3,$ord3,$acc4
adde $acc1,$acc1,$t1
adde $acc2,$acc2,$acc4
adde $acc3,$acc3,$t3
std $acc0,0($rp)
std $acc1,8($rp)
std $acc2,16($rp)
std $acc3,24($rp)
ld r18,48($sp)
ld r19,56($sp)
ld r20,64($sp)
ld r21,72($sp)
ld r22,80($sp)
ld r23,88($sp)
ld r24,96($sp)
ld r25,104($sp)
ld r26,112($sp)
ld r27,120($sp)
ld r28,128($sp)
ld r29,136($sp)
ld r30,144($sp)
ld r31,152($sp)
addi $sp,$sp,160
blr
.long 0
.byte 0,12,4,0,0x80,14,3,0
.long 0
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
# int rep);
.globl ecp_nistz256_ord_sqr_mont
.align 5
ecp_nistz256_ord_sqr_mont:
stdu $sp,-160($sp)
std r18,48($sp)
std r19,56($sp)
std r20,64($sp)
std r21,72($sp)
std r22,80($sp)
std r23,88($sp)
std r24,96($sp)
std r25,104($sp)
std r26,112($sp)
std r27,120($sp)
std r28,128($sp)
std r29,136($sp)
std r30,144($sp)
std r31,152($sp)
mtctr $bp
ld $a0,0($ap)
ld $a1,8($ap)
ld $a2,16($ap)
ld $a3,24($ap)
lis $ordk,0xccd1
lis $ord0,0xf3b9
lis $ord1,0xbce6
ori $ordk,$ordk,0xc8aa
ori $ord0,$ord0,0xcac2
ori $ord1,$ord1,0xfaad
sldi $ordk,$ordk,32
sldi $ord0,$ord0,32
sldi $ord1,$ord1,32
oris $ordk,$ordk,0xee00
oris $ord0,$ord0,0xfc63
oris $ord1,$ord1,0xa717
ori $ordk,$ordk,0xbc4f # 0xccd1c8aaee00bc4f
ori $ord0,$ord0,0x2551 # 0xf3b9cac2fc632551
ori $ord1,$ord1,0x9e84 # 0xbce6faada7179e84
li $ord2,-1 # 0xffffffffffffffff
sldi $ord3,$ord2,32 # 0xffffffff00000000
li $zr,0
b .Loop_ord_sqr
.align 5
.Loop_ord_sqr:
################################################################
# | | | | | |a1*a0| |
# | | | | |a2*a0| | |
# | |a3*a2|a3*a0| | | |
# | | | |a2*a1| | | |
# | | |a3*a1| | | | |
# *| | | | | | | | 2|
# +|a3*a3|a2*a2|a1*a1|a0*a0|
# |--+--+--+--+--+--+--+--|
# |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
#
# "can't overflow" below mark carrying into high part of
# multiplication result, which can't overflow, because it
# can never be all ones.
mulld $acc1,$a1,$a0 # a[1]*a[0]
mulhdu $t1,$a1,$a0
mulld $acc2,$a2,$a0 # a[2]*a[0]
mulhdu $t2,$a2,$a0
mulld $acc3,$a3,$a0 # a[3]*a[0]
mulhdu $acc4,$a3,$a0
addc $acc2,$acc2,$t1 # accumulate high parts of multiplication
mulld $t0,$a2,$a1 # a[2]*a[1]
mulhdu $t1,$a2,$a1
adde $acc3,$acc3,$t2
mulld $t2,$a3,$a1 # a[3]*a[1]
mulhdu $t3,$a3,$a1
addze $acc4,$acc4 # can't overflow
mulld $acc5,$a3,$a2 # a[3]*a[2]
mulhdu $acc6,$a3,$a2
addc $t1,$t1,$t2 # accumulate high parts of multiplication
mulld $acc0,$a0,$a0 # a[0]*a[0]
addze $t2,$t3 # can't overflow
addc $acc3,$acc3,$t0 # accumulate low parts of multiplication
mulhdu $a0,$a0,$a0
adde $acc4,$acc4,$t1
mulld $t1,$a1,$a1 # a[1]*a[1]
adde $acc5,$acc5,$t2
mulhdu $a1,$a1,$a1
addze $acc6,$acc6 # can't overflow
addc $acc1,$acc1,$acc1 # acc[1-6]*=2
mulld $t2,$a2,$a2 # a[2]*a[2]
adde $acc2,$acc2,$acc2
mulhdu $a2,$a2,$a2
adde $acc3,$acc3,$acc3
mulld $t3,$a3,$a3 # a[3]*a[3]
adde $acc4,$acc4,$acc4
mulhdu $a3,$a3,$a3
adde $acc5,$acc5,$acc5
adde $acc6,$acc6,$acc6
addze $acc7,$zr
addc $acc1,$acc1,$a0 # +a[i]*a[i]
mulld $t4,$acc0,$ordk
adde $acc2,$acc2,$t1
adde $acc3,$acc3,$a1
adde $acc4,$acc4,$t2
adde $acc5,$acc5,$a2
adde $acc6,$acc6,$t3
adde $acc7,$acc7,$a3
___
# ord_sqr_mont: four reduction steps over the low half ($acc0-$acc3) of
# the 512-bit square. Steps 0..2 compute the multiplier for the following
# step into the register currently named $t3; the Perl-level swap at the
# bottom of the loop then renames $t3/$t4 so the next step finds it in
# $t4. Four swaps leave both names bound to their original registers, so
# the code emitted after the loop uses $t3 as usual.
for($i=0; $i<4; $i++) { # reductions
$code.=<<___;
addic $t0,$acc0,-1 # discarded
mulhdu $t1,$ord0,$t4
mulld $t2,$ord1,$t4
mulhdu $t3,$ord1,$t4
adde $t2,$t2,$t1
addze $t3,$t3
addc $acc0,$acc1,$t2
adde $acc1,$acc2,$t3
adde $acc2,$acc3,$t4
adde $acc3,$zr,$t4 # can't overflow
___
$code.=<<___ if ($i<3);
mulld $t3,$acc0,$ordk
___
$code.=<<___;
sldi $t0,$t4,32
subfc $acc1,$t4,$acc1
srdi $t1,$t4,32
subfe $acc2,$t0,$acc2
subfe $acc3,$t1,$acc3 # can't borrow
___
($t3,$t4) = ($t4,$t3);
}
# ord_sqr_mont tail: add the untouched upper half ($acc4-$acc7) to the
# reduced lower half, apply the same subtract/add-back correction as in
# ord_mul_mont with the result landing in $a0-$a3, loop while CTR is
# non-zero, then store the result and restore r18-r31.
$code.=<<___;
addc $acc0,$acc0,$acc4 # accumulate upper half
adde $acc1,$acc1,$acc5
adde $acc2,$acc2,$acc6
adde $acc3,$acc3,$acc7
addze $acc4,$zr
subfc $acc0,$ord0,$acc0 # ret -= modulus
subfe $acc1,$ord1,$acc1
subfe $acc2,$ord2,$acc2
subfe $acc3,$ord3,$acc3
subfe $acc4,$zr,$acc4
and $t0,$ord0,$acc4
and $t1,$ord1,$acc4
addc $a0,$acc0,$t0 # ret += modulus if borrow
and $t3,$ord3,$acc4
adde $a1,$acc1,$t1
adde $a2,$acc2,$acc4
adde $a3,$acc3,$t3
bdnz .Loop_ord_sqr
std $a0,0($rp)
std $a1,8($rp)
std $a2,16($rp)
std $a3,24($rp)
ld r18,48($sp)
ld r19,56($sp)
ld r20,64($sp)
ld r21,72($sp)
ld r22,80($sp)
ld r23,88($sp)
ld r24,96($sp)
ld r25,104($sp)
ld r26,112($sp)
ld r27,120($sp)
ld r28,128($sp)
ld r29,136($sp)
ld r30,144($sp)
ld r31,152($sp)
addi $sp,$sp,160
blr
.long 0
.byte 0,12,4,0,0x80,14,3,0
.long 0
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
# The second brace closes the enclosing register-naming scope that was
# opened near the top of the file.
} }
########################################################################
# scatter-gather subroutines
#
# Generator for four leaf routines that move points into and out of an
# interleaved table. None of them opens a stack frame or saves registers;
# they use only r0 and r3-r12.
#
# Table layout as written by the scatter routines:
#   w5: every 64-bit limb is split into two 32-bit words. The low words
#       of the four limbs go to rows 0-3 and the high words to rows 4-7,
#       rows being 64 bytes apart. X, Y and Z each occupy 64*8 bytes.
#       The column is out + 4*index - 4, i.e. index is 1-based.
#   w7: every 64-bit limb is split into eight bytes, least significant
#       first, one per 64-byte row; each of the eight limbs of the input
#       occupies 64*8 bytes. Stores go to out + index - 1.
#       NOTE(review): the -1 displacement implies a 1-based index here
#       too - confirm against the callers.
#
# Both gather routines build a mask in r0 with neg + sradi 63: all-ones
# for a positive index, zero for index 0. The mask is added to the index
# (turning a non-zero index into index-1) and later AND-ed into every
# assembled limb, so index 0 yields an all-zero output while the loads
# are still performed (from column 0). The gather routines reuse r5
# ($index) as a data register once the address has been formed.
{
# NOTE(review): $mask is never referenced below (the mask lives in r0),
# and the map() yields five names for four variables; the fifth (r7) is
# simply dropped.
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
# int index);
.globl ecp_nistz256_scatter_w5
.align 4
ecp_nistz256_scatter_w5:
slwi $index,$index,2
add $out,$out,$index
ld r8, 0($inp) # X
ld r9, 8($inp)
ld r10,16($inp)
ld r11,24($inp)
stw r8, 64*0-4($out)
srdi r8, r8, 32
stw r9, 64*1-4($out)
srdi r9, r9, 32
stw r10,64*2-4($out)
srdi r10,r10,32
stw r11,64*3-4($out)
srdi r11,r11,32
stw r8, 64*4-4($out)
stw r9, 64*5-4($out)
stw r10,64*6-4($out)
stw r11,64*7-4($out)
addi $out,$out,64*8
ld r8, 32($inp) # Y
ld r9, 40($inp)
ld r10,48($inp)
ld r11,56($inp)
stw r8, 64*0-4($out)
srdi r8, r8, 32
stw r9, 64*1-4($out)
srdi r9, r9, 32
stw r10,64*2-4($out)
srdi r10,r10,32
stw r11,64*3-4($out)
srdi r11,r11,32
stw r8, 64*4-4($out)
stw r9, 64*5-4($out)
stw r10,64*6-4($out)
stw r11,64*7-4($out)
addi $out,$out,64*8
ld r8, 64($inp) # Z
ld r9, 72($inp)
ld r10,80($inp)
ld r11,88($inp)
stw r8, 64*0-4($out)
srdi r8, r8, 32
stw r9, 64*1-4($out)
srdi r9, r9, 32
stw r10,64*2-4($out)
srdi r10,r10,32
stw r11,64*3-4($out)
srdi r11,r11,32
stw r8, 64*4-4($out)
stw r9, 64*5-4($out)
stw r10,64*6-4($out)
stw r11,64*7-4($out)
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
########################################################################
# void ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
# int index);
.globl ecp_nistz256_gather_w5
.align 4
ecp_nistz256_gather_w5:
neg r0,$index
sradi r0,r0,63
add $index,$index,r0
slwi $index,$index,2
add $inp,$inp,$index
lwz r5, 64*0($inp)
lwz r6, 64*1($inp)
lwz r7, 64*2($inp)
lwz r8, 64*3($inp)
lwz r9, 64*4($inp)
lwz r10,64*5($inp)
lwz r11,64*6($inp)
lwz r12,64*7($inp)
addi $inp,$inp,64*8
sldi r9, r9, 32
sldi r10,r10,32
sldi r11,r11,32
sldi r12,r12,32
or r5,r5,r9
or r6,r6,r10
or r7,r7,r11
or r8,r8,r12
and r5,r5,r0
and r6,r6,r0
and r7,r7,r0
and r8,r8,r0
std r5,0($out) # X
std r6,8($out)
std r7,16($out)
std r8,24($out)
lwz r5, 64*0($inp)
lwz r6, 64*1($inp)
lwz r7, 64*2($inp)
lwz r8, 64*3($inp)
lwz r9, 64*4($inp)
lwz r10,64*5($inp)
lwz r11,64*6($inp)
lwz r12,64*7($inp)
addi $inp,$inp,64*8
sldi r9, r9, 32
sldi r10,r10,32
sldi r11,r11,32
sldi r12,r12,32
or r5,r5,r9
or r6,r6,r10
or r7,r7,r11
or r8,r8,r12
and r5,r5,r0
and r6,r6,r0
and r7,r7,r0
and r8,r8,r0
std r5,32($out) # Y
std r6,40($out)
std r7,48($out)
std r8,56($out)
lwz r5, 64*0($inp)
lwz r6, 64*1($inp)
lwz r7, 64*2($inp)
lwz r8, 64*3($inp)
lwz r9, 64*4($inp)
lwz r10,64*5($inp)
lwz r11,64*6($inp)
lwz r12,64*7($inp)
sldi r9, r9, 32
sldi r10,r10,32
sldi r11,r11,32
sldi r12,r12,32
or r5,r5,r9
or r6,r6,r10
or r7,r7,r11
or r8,r8,r12
and r5,r5,r0
and r6,r6,r0
and r7,r7,r0
and r8,r8,r0
std r5,64($out) # Z
std r6,72($out)
std r7,80($out)
std r8,88($out)
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
########################################################################
# void ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
# int index);
.globl ecp_nistz256_scatter_w7
.align 4
ecp_nistz256_scatter_w7:
li r0,8
mtctr r0
add $out,$out,$index
subi $inp,$inp,8
.Loop_scatter_w7:
ldu r0,8($inp)
stb r0,64*0-1($out)
srdi r0,r0,8
stb r0,64*1-1($out)
srdi r0,r0,8
stb r0,64*2-1($out)
srdi r0,r0,8
stb r0,64*3-1($out)
srdi r0,r0,8
stb r0,64*4-1($out)
srdi r0,r0,8
stb r0,64*5-1($out)
srdi r0,r0,8
stb r0,64*6-1($out)
srdi r0,r0,8
stb r0,64*7-1($out)
addi $out,$out,64*8
bdnz .Loop_scatter_w7
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
########################################################################
# void ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
# int index);
.globl ecp_nistz256_gather_w7
.align 4
ecp_nistz256_gather_w7:
li r0,8
mtctr r0
neg r0,$index
sradi r0,r0,63
add $index,$index,r0
add $inp,$inp,$index
subi $out,$out,8
.Loop_gather_w7:
lbz r5, 64*0($inp)
lbz r6, 64*1($inp)
lbz r7, 64*2($inp)
lbz r8, 64*3($inp)
lbz r9, 64*4($inp)
lbz r10,64*5($inp)
lbz r11,64*6($inp)
lbz r12,64*7($inp)
addi $inp,$inp,64*8
sldi r6, r6, 8
sldi r7, r7, 16
sldi r8, r8, 24
sldi r9, r9, 32
sldi r10,r10,40
sldi r11,r11,48
sldi r12,r12,56
or r5,r5,r6
or r7,r7,r8
or r9,r9,r10
or r11,r11,r12
or r5,r5,r7
or r9,r9,r11
or r5,r5,r9
and r5,r5,r0
stdu r5,8($out)
bdnz .Loop_gather_w7
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
# Emit the generated assembly. Every `...` span in $code is replaced by
# the result of evaluating its contents as Perl, and each line is then
# printed to STDOUT, which the top of the file aliases to the OUT pipe
# feeding ppc-xlate.pl.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	print $_,"\n";
}
# Closing the pipe flushes it and waits for the translator. A false
# return means either a write failed ($! is set) or the translator
# exited with a non-zero status ($? is set, $! may be 0). Either way the
# output cannot be trusted, so fail loudly instead of leaving a
# truncated file for the build to pick up.
close STDOUT or die "error closing STDOUT: $! (child status $?)";