#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for PPC64.
#
# August 2016.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816.
#
#			with/without -DECP_NISTZ256_ASM
# POWER7		+260-530%
# POWER8		+220-340%

# First argument is the perlasm flavour; remaining arguments are skipped
# until one looks like a file name ("name.ext"), which becomes the output.
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Locate ppc-xlate.pl next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be started.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $sp="r1";

{
# Symbolic names for the general purpose registers used below:
# r3..r12 and r22..r31, in this order.
my ($rp,$ap,$bp,$bi,$acc0,$acc1,$acc2,$acc3,$poly1,$poly3,
    $acc4,$acc5,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3) =
    map("r$_",(3..12,22..31));

my ($acc6,$acc7)=($bp,$bi);	# used in __ecp_nistz256_sqr_mont

$code.=<<___;
.machine	"any"
.text
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# NOTE(review): the original statement was truncated in this copy of the
# file; the search order (current directory, then the parent of this
# script's directory) is a reconstruction - confirm against upstream.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,'<',"ecp_nistz256_table.c"		or
open TABLE,'<',"${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

# Every TOBN(hi,lo) initializer contributes two 32-bit words to @arr,
# low word first.
while(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

# Start of the pre-computed table object (4096-byte aligned).
$code.=<<___;
.type	ecp_nistz256_precomputed,\@object
.globl	ecp_nistz256_precomputed
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	# One sub-table: 64 points of 16 32-bit words each.
	my @tbl = splice(@arr,0,64*16);
	for my $byte (0..63) {
		my @line;
		for my $pt (0..63) {
			# byte number $byte of point $pt; int() keeps the
			# word index integral without relying on "use integer"
			push @line,($tbl[$pt*16+int($byte/4)]>>(($byte%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}

# Exported field-arithmetic entry points. Each one sets up a 128-byte
# frame, saves the non-volatile registers it uses, loads the operands and
# the two non-trivial modulus words ($poly1, $poly3), and calls the
# matching internal primitive.
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.asciz	"ECP_NISTZ256 for PPC64, CRYPTOGAMS by <appro\@openssl.org>"

# void	ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
#					     const BN_ULONG x2[4]);
.globl	ecp_nistz256_mul_mont
.align	5
ecp_nistz256_mul_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_mul_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

# void	ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_sqr_mont
.align	4
ecp_nistz256_sqr_mont:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r22,48($sp)
	std	r23,56($sp)
	std	r24,64($sp)
	std	r25,72($sp)
	std	r26,80($sp)
	std	r27,88($sp)
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sqr_mont

	mtlr	r0
	ld	r22,48($sp)
	ld	r23,56($sp)
	ld	r24,64($sp)
	ld	r25,72($sp)
	ld	r26,80($sp)
	ld	r27,88($sp)
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

# void	ecp_nistz256_add(BN_ULONG x0[4],const BN_ULONG x1[4],
#					const BN_ULONG x2[4]);
.globl	ecp_nistz256_add
.align	4
ecp_nistz256_add:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$t0,  0($bp)
	ld	$acc1,8($ap)
	ld	$t1,  8($bp)
	ld	$acc2,16($ap)
	ld	$t2,  16($bp)
	ld	$acc3,24($ap)
	ld	$t3,  24($bp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_add,.-ecp_nistz256_add

# void	ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_div_by_2
.align	4
ecp_nistz256_div_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_div_by_2

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

# void	ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_2
.align	4
ecp_nistz256_mul_by_2:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

# void	ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_mul_by_3
.align	4
ecp_nistz256_mul_by_3:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	mr	$t0,$acc0
	std	$acc0,64($sp)
	mr	$t1,$acc1
	std	$acc1,72($sp)
	mr	$t2,$acc2
	std	$acc2,80($sp)
	mr	$t3,$acc3
	std	$acc3,88($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_add	# ret = a+a	// 2*a

	ld	$t0,64($sp)
	ld	$t1,72($sp)
	ld	$t2,80($sp)
	ld	$t3,88($sp)

	bl	__ecp_nistz256_add	# ret += a	// 2*a+a=3*a

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

# void	ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
#				        const BN_ULONG x2[4]);
.globl	ecp_nistz256_sub
.align	4
ecp_nistz256_sub:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	ld	$acc0,0($ap)
	ld	$acc1,8($ap)
	ld	$acc2,16($ap)
	ld	$acc3,24($ap)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,3,0
	.long	0
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

# void	ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl	ecp_nistz256_neg
.align	4
ecp_nistz256_neg:
	stdu	$sp,-128($sp)
	mflr	r0
	std	r28,96($sp)
	std	r29,104($sp)
	std	r30,112($sp)
	std	r31,120($sp)

	mr	$bp,$ap
	li	$acc0,0
	li	$acc1,0
	li	$acc2,0
	li	$acc3,0

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	bl	__ecp_nistz256_sub_from

	mtlr	r0
	ld	r28,96($sp)
	ld	r29,104($sp)
	ld	r30,112($sp)
	ld	r31,120($sp)
	addi	$sp,$sp,128
	blr
	.long	0
	.byte	0,12,4,0,0x80,4,2,0
	.long	0
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

# note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
# to $a0-$a3 and b[0] - to $bi
.type	__ecp_nistz256_mul_mont,\@function
.align	4
__ecp_nistz256_mul_mont:
	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
	ld	$bi,8($bp)		# b[1]

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	addze	$acc4,$t3
	li	$acc5,0
___
# Three rounds of "reduce previous word, multiply by b[i]". $i is
# interpolated into the emitted text to address b[i+1].
for($i=1;$i<4;$i++) {
	################################################################
	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#            ffff0001.00000000.0000ffff.ffffffff
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
	# - 0000abcd.efgh0000.00000000.00000000.abcdefgh
	#
	# or marking redundant operations:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
	# + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
	# - 0000abcd.efgh0000.--------.--------.--------

$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	mulld	$t0,$a0,$bi		# lo(a[0]*b[i])
	mulld	$t1,$a1,$bi		# lo(a[1]*b[i])
	mulld	$t2,$a2,$bi		# lo(a[2]*b[i])
	mulld	$t3,$a3,$bi		# lo(a[3]*b[i])
	addc	$acc0,$acc0,$t0		# accumulate low parts of multiplication
	mulhdu	$t0,$a0,$bi		# hi(a[0]*b[i])
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi		# hi(a[1]*b[i])
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi		# hi(a[2]*b[i])
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi		# hi(a[3]*b[i])
	addze	$acc4,$acc4
___
# The last round has no further b[] word to fetch.
$code.=<<___	if ($i<3);
	ld	$bi,8*($i+1)($bp)	# b[$i+1]
___
$code.=<<___;
	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	li	$acc5,0
	addze	$acc5,$acc5
___
}
$code.=<<___;
	# last reduction
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	adde	$acc3,$acc4,$t3
	addze	$acc4,$acc5

	li	$t2,0
	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

# note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
# to $a0-$a3
.type	__ecp_nistz256_sqr_mont,\@function
.align	4
__ecp_nistz256_sqr_mont:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.

	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	adde	$acc4,$acc4,$t1
	adde	$acc5,$acc5,$t2
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	adde	$acc2,$acc2,$acc2
	adde	$acc3,$acc3,$acc3
	adde	$acc4,$acc4,$acc4
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	li	$acc7,0
	addze	$acc7,$acc7

	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	mulhdu	$a0,$a0,$a0
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	mulhdu	$a1,$a1,$a1
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	mulhdu	$a2,$a2,$a2
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	mulhdu	$a3,$a3,$a3
	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	sldi	$t0,$acc0,32
	adde	$acc2,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
for($i=0;$i<3;$i++) {			# reductions, see commentary in
					# multiplication for details
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	sldi	$t0,$acc0,32
	adde	$acc1,$acc2,$t1
	srdi	$t1,$acc0,32
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow
___
}
$code.=<<___;
	subfc	$t2,$t0,$acc0		# "*0xffff0001"
	subfe	$t3,$t1,$acc0
	addc	$acc0,$acc1,$t0		# +=acc[0]<<96 and omit acc[0]
	adde	$acc1,$acc2,$t1
	adde	$acc2,$acc3,$t2		# +=acc[0]*0xffff0001
	addze	$acc3,$t3		# can't overflow

	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	li	$t2,0
	addze	$acc4,$t2

	addic	$acc0,$acc0,1		# ret -= modulus
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$acc4,$t2,$acc4

	addc	$acc0,$acc0,$acc4	# ret += modulus if borrow
	and	$t1,$poly1,$acc4
	and	$t3,$poly3,$acc4
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

# Note that __ecp_nistz256_add expects both input vectors pre-loaded to
# $acc0-$acc3 and $t0-$t3. This is done because it's used in multiple
# contexts, e.g. in multiplication by 2 and 3...
.type	__ecp_nistz256_add,\@function
.align	4
__ecp_nistz256_add:
	addc	$acc0,$acc0,$t0		# ret = a+b
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$t2
	li	$t2,0
	adde	$acc3,$acc3,$t3
	addze	$t0,$t2

	# if a+b >= modulus, subtract modulus
	#
	# But since comparison implies subtraction, we subtract
	# modulus and then add it back if subtraction borrowed.

	subic	$acc0,$acc0,-1
	subfe	$acc1,$poly1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$poly3,$acc3
	subfe	$t0,$t2,$t0

	addc	$acc0,$acc0,$t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

.type	__ecp_nistz256_sub_from,\@function
.align	4
__ecp_nistz256_sub_from:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$t0,$acc0		# ret = a-b
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if a-b borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret -= modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,\@function
.align	4
__ecp_nistz256_sub_morf:
	ld	$t0,0($bp)
	ld	$t1,8($bp)
	ld	$t2,16($bp)
	ld	$t3,24($bp)
	subfc	$acc0,$acc0,$t0 	# ret = b-a
	subfe	$acc1,$acc1,$t1
	subfe	$acc2,$acc2,$t2
	subfe	$acc3,$acc3,$t3
	subfe	$t0,$t0,$t0		# t0 = borrow ? -1 : 0

	# if b-a borrowed, add modulus

	addc	$acc0,$acc0,$t0		# ret -= modulus & t0
	and	$t1,$poly1,$t0
	and	$t3,$poly3,$t0
	adde	$acc1,$acc1,$t1
	addze	$acc2,$acc2
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_div_by_2,\@function
.align	4
__ecp_nistz256_div_by_2:
	andi.	$t0,$acc0,1
	addic	$acc0,$acc0,-1		# a += modulus
	neg	$t0,$t0
	adde	$acc1,$acc1,$poly1
	not	$t0,$t0
	addze	$acc2,$acc2
	li	$t2,0
	adde	$acc3,$acc3,$poly3
	and	$t1,$poly1,$t0
	addze	$ap,$t2			# ap = carry
	and	$t3,$poly3,$t0

	subfc	$acc0,$t0,$acc0		# a -= modulus if a was even
	subfe	$acc1,$t1,$acc1
	subfe	$acc2,$t2,$acc2
	subfe	$acc3,$t3,$acc3
	subfe	$ap, $t2,$ap

	srdi	$acc0,$acc0,1
	sldi	$t0,$acc1,63
	srdi	$acc1,$acc1,1
	sldi	$t1,$acc2,63
	srdi	$acc2,$acc2,1
	sldi	$t2,$acc3,63
	srdi	$acc3,$acc3,1
	sldi	$t3,$ap,63
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,1,0
	.long	0
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
___
########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
if (1) {
my $FRAME=64+32*4+12*8;
my ($S,$M,$Zsqr,$tmp0)=map(64+32*$_,(0..3));
# above map() describes stack layout with 4 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real) = map("r$_",(20,21));

# .Ldouble_shortcut is also the target of a branch emitted by the
# point_add generator, so the register setup before that label must stay
# in step with what that code establishes before branching.
$code.=<<___;
.globl	ecp_nistz256_point_double
.align	5
ecp_nistz256_point_double:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001
.Ldouble_shortcut:
	ld	$acc0,32($ap)
	ld	$acc1,40($ap)
	ld	$acc2,48($ap)
	ld	$acc3,56($ap)
	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,64($ap)		# forward load for p256_sqr_mont
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_add	# p256_mul_by_2(S, in_y);

	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Zsqr, in_z);

	ld	$t0,0($ap_real)
	ld	$t1,8($ap_real)
	ld	$t2,16($ap_real)
	ld	$t3,24($ap_real)
	mr	$a0,$acc0		# put Zsqr aside for p256_sub
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add	# p256_add(M, Zsqr, in_x);

	addi	$bp,$ap_real,0
	mr	$acc0,$a0		# restore Zsqr
	mr	$acc1,$a1
	mr	$acc2,$a2
	mr	$acc3,$a3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$sp,$Zsqr
	bl	__ecp_nistz256_sub_morf	# p256_sub(Zsqr, in_x, Zsqr);

	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(S, S);

	ld	$bi,32($ap_real)
	ld	$a0,64($ap_real)
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(tmp0, in_z, in_y);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$S+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	addi	$rp,$rp_real,64
	bl	__ecp_nistz256_add	# p256_mul_by_2(res_z, tmp0);

	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(tmp0, S);

	ld	$bi,$Zsqr($sp)		# forward load for p256_mul_mont
	ld	$a0,$M+0($sp)
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_div_by_2	# p256_div_by_2(res_y, tmp0);

	addi	$bp,$sp,$Zsqr
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(M, M, Zsqr);

	mr	$t0,$acc0		# duplicate M
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	mr	$a0,$acc0		# put M aside
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$rp,$sp,$M
	bl	__ecp_nistz256_add
	mr	$t0,$a0			# restore M
	mr	$t1,$a1
	mr	$t2,$a2
	mr	$t3,$a3
	ld	$bi,0($ap_real)		# forward load for p256_mul_mont
	ld	$a0,$S+0($sp)
	ld	$a1,$S+8($sp)
	ld	$a2,$S+16($sp)
	ld	$a3,$S+24($sp)
	bl	__ecp_nistz256_add	# p256_mul_by_3(M, M);

	addi	$bp,$ap_real,0
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, in_x);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	ld	$a0,$M+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$M+8($sp)
	ld	$a2,$M+16($sp)
	ld	$a3,$M+24($sp)
	addi	$rp,$sp,$tmp0
	bl	__ecp_nistz256_add	# p256_mul_by_2(tmp0, S);

	addi	$rp,$rp_real,0
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(res_x, M);

	addi	$bp,$sp,$tmp0
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, tmp0);

	addi	$bp,$sp,$S
	addi	$rp,$sp,$S
	bl	__ecp_nistz256_sub_morf	# p256_sub(S, S, res_x);

	ld	$bi,$M($sp)
	mr	$a0,$acc0		# copy S
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	addi	$bp,$sp,$M
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S, S, M);

	addi	$bp,$rp_real,32
	addi	$rp,$rp_real,32
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, S, res_y);

	mtlr	r0
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,12,2,0
	.long	0
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
#
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
if (1) {
my $FRAME = 64 + 32*12 + 16*8;
my ($res_x,$res_y,$res_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(64+32*$_,(0..11));
# Z1sqr and Z2sqr share stack slots with Hsqr and Rsqr.
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 12 temporary
# 256-bit vectors on top.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

# $in1infty/$in2infty are computed as all-ones when the corresponding
# input's Z coordinate is non-zero and zero otherwise (see the or/neg/
# sradi sequences), and are used as masks by the conditional moves at
# the end.
$code.=<<___;
.globl	ecp_nistz256_point_add
.align	5
ecp_nistz256_point_add:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	ld	$a0,64($bp)		# in2_z
	ld	$a1,72($bp)
	ld	$a2,80($bp)
	ld	$a3,88($bp)
	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in2infty,$t0,$t2
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty
	addi	$rp,$sp,$Z2sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z2sqr, in2_z);

	ld	$a0,64($ap_real)	# in1_z
	ld	$a1,72($ap_real)
	ld	$a2,80($ap_real)
	ld	$a3,88($ap_real)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty
	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	ld	$bi,64($bp_real)
	ld	$a0,$Z2sqr+0($sp)
	ld	$a1,$Z2sqr+8($sp)
	ld	$a2,$Z2sqr+16($sp)
	ld	$a3,$Z2sqr+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, Z2sqr, in2_z);

	ld	$bi,64($ap_real)
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,32($ap_real)
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S1, S1, in1_y);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$sp,$S1
	ld	$bi,$Z2sqr($sp)		# forward load for p256_mul_mont
	ld	$a0,0($ap_real)
	ld	$a1,8($ap_real)
	ld	$a2,16($ap_real)
	ld	$a3,24($ap_real)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, S1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or	$temp,$acc0,$acc2

	addi	$bp,$sp,$Z2sqr
	addi	$rp,$sp,$U1
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U1, in1_x, Z2sqr);

	ld	$bi,$Z1sqr($sp)
	ld	$a0,0($bp_real)
	ld	$a1,8($bp_real)
	ld	$a2,16($bp_real)
	ld	$a3,24($bp_real)
	addi	$bp,$sp,$Z1sqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in2_x, Z1sqr);

	addi	$bp,$sp,$U1
	ld	$a0,$R+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, U1);

	or	$acc0,$acc0,$acc1	# see if result is zero
	or	$acc2,$acc2,$acc3
	or.	$acc0,$acc0,$acc2
	bne	.Ladd_proceed		# is_equal(U1,U2)?

	and.	$t0,$in1infty,$in2infty
	beq	.Ladd_proceed		# (in1infty || in2infty)?

	cmpldi	$temp,0
	beq	.Ladd_double		# is_equal(S1,S2)?

	xor	$a0,$a0,$a0
	std	$a0,0($rp_real)
	std	$a0,8($rp_real)
	std	$a0,16($rp_real)
	std	$a0,24($rp_real)
	std	$a0,32($rp_real)
	std	$a0,40($rp_real)
	std	$a0,48($rp_real)
	std	$a0,56($rp_real)
	std	$a0,64($rp_real)
	std	$a0,72($rp_real)
	std	$a0,80($rp_real)
	std	$a0,88($rp_real)
	b	.Ladd_done

.align	4
.Ladd_double:
	ld	$bp,0($sp)		# back-link
	mr	$ap,$ap_real
	mr	$rp,$rp_real
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	stdu	$bp,$FRAME-288($sp)	# difference in stack frame sizes
	b	.Ldouble_shortcut

.align	4
.Ladd_proceed:
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$bi,64($bp_real)
	ld	$a0,$res_z+0($sp)
	ld	$a1,$res_z+8($sp)
	ld	$a2,$res_z+16($sp)
	ld	$a3,$res_z+24($sp)
	addi	$bp,$bp_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, res_z, in2_z);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,$Hsqr($sp)
	ld	$a0,$U1+0($sp)
	ld	$a1,$U1+8($sp)
	ld	$a2,$U1+16($sp)
	ld	$a3,$U1+24($sp)
	addi	$bp,$sp,$Hsqr
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, U1, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,$Hcub($sp)		# forward load for p256_mul_mont
	ld	$a0,$S1+0($sp)
	ld	$a1,$S1+8($sp)
	ld	$a2,$S1+16($sp)
	ld	$a3,$S1+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$sp,$Hcub
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S1, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
# Select between computed result, in1 and in2, 32 bytes at a time.
# Keep $i a plain (non-lexical) C-style loop variable: the heredoc after
# the loop interpolates its final value, 64, to handle the last 32 bytes.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3

	ld	$t0,$i+32($bp_real)	# in2
	ld	$t1,$i+40($bp_real)
	ld	$t2,$i+48($bp_real)
	ld	$t3,$i+56($bp_real)
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
# Final 32 bytes ($i is 64 here); no further forward loads are needed.
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)

.Ladd_done:
	mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
if (1) {
my $FRAME = 64 + 32*10 + 16*8;
my ($res_x,$res_y,$res_z,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(64+32*$_,(0..9));
# Z1sqr shares its stack slot with S2.
my $Z1sqr = $S2;
# above map() describes stack layout with 10 temporary
# 256-bit vectors on top.
# Register aliases private to ecp_nistz256_point_add_affine (r16..r21):
#   $rp_real/$ap_real/$bp_real  copies of the three pointer arguments, made
#                               with "mr" right after the prologue because
#                               $rp and $bp are re-pointed before the
#                               helper calls below;
#   $in1infty/$in2infty         all-ones when the tested words are not all
#                               zero (x | -x, then arithmetic shift by 63):
#                               in1 is tested through the OR of its four
#                               words at offsets 64..88, in2 through the OR
#                               of its eight words at offsets 0..56;
#   $temp                       not referenced by the code emitted below.
my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("r$_",(16..21));

# Emit the function.  A $FRAME-byte stack frame is allocated, r16-r31 are
# saved in its top 16 doublewords, $poly1/$poly3 are set to the constants
# shown in the trailing comments, and then the helpers named in the
# "p256_*" trailing comments are called in order, with intermediate values
# kept in the stack slots addressed through $U2, $S2, $H, ... .
$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.align	5
ecp_nistz256_point_add_affine:
	stdu	$sp,-$FRAME($sp)
	mflr	r0
	std	r16,$FRAME-8*16($sp)
	std	r17,$FRAME-8*15($sp)
	std	r18,$FRAME-8*14($sp)
	std	r19,$FRAME-8*13($sp)
	std	r20,$FRAME-8*12($sp)
	std	r21,$FRAME-8*11($sp)
	std	r22,$FRAME-8*10($sp)
	std	r23,$FRAME-8*9($sp)
	std	r24,$FRAME-8*8($sp)
	std	r25,$FRAME-8*7($sp)
	std	r26,$FRAME-8*6($sp)
	std	r27,$FRAME-8*5($sp)
	std	r28,$FRAME-8*4($sp)
	std	r29,$FRAME-8*3($sp)
	std	r30,$FRAME-8*2($sp)
	std	r31,$FRAME-8*1($sp)

	li	$poly1,-1
	srdi	$poly1,$poly1,32	# 0x00000000ffffffff
	li	$poly3,1
	orc	$poly3,$poly3,$poly1	# 0xffffffff00000001

	mr	$rp_real,$rp
	mr	$ap_real,$ap
	mr	$bp_real,$bp

	ld	$a0,64($ap)		# in1_z
	ld	$a1,72($ap)
	ld	$a2,80($ap)
	ld	$a3,88($ap)
	or	$t0,$a0,$a1
	or	$t2,$a2,$a3
	or	$in1infty,$t0,$t2
	neg	$t0,$in1infty
	or	$in1infty,$in1infty,$t0
	sradi	$in1infty,$in1infty,63	# !in1infty

	ld	$acc0,0($bp)		# in2_x
	ld	$acc1,8($bp)
	ld	$acc2,16($bp)
	ld	$acc3,24($bp)
	ld	$t0,32($bp)		# in2_y
	ld	$t1,40($bp)
	ld	$t2,48($bp)
	ld	$t3,56($bp)
	or	$acc0,$acc0,$acc1
	or	$acc2,$acc2,$acc3
	or	$acc0,$acc0,$acc2
	or	$t0,$t0,$t1
	or	$t2,$t2,$t3
	or	$t0,$t0,$t2
	or	$in2infty,$acc0,$t0
	neg	$t0,$in2infty
	or	$in2infty,$in2infty,$t0
	sradi	$in2infty,$in2infty,63	# !in2infty

	addi	$rp,$sp,$Z1sqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Z1sqr, in1_z);

	mr	$a0,$acc0
	mr	$a1,$acc1
	mr	$a2,$acc2
	mr	$a3,$acc3
	ld	$bi,0($bp_real)
	addi	$bp,$bp_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, Z1sqr, in2_x);

	addi	$bp,$ap_real,0
	ld	$bi,64($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Z1sqr+0($sp)
	ld	$a1,$Z1sqr+8($sp)
	ld	$a2,$Z1sqr+16($sp)
	ld	$a3,$Z1sqr+24($sp)
	addi	$rp,$sp,$H
	bl	__ecp_nistz256_sub_from	# p256_sub(H, U2, in1_x);

	addi	$bp,$ap_real,64
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, Z1sqr, in1_z);

	ld	$bi,64($ap_real)
	ld	$a0,$H+0($sp)
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$bp,$ap_real,64
	addi	$rp,$sp,$res_z
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_z, H, in1_z);

	ld	$bi,32($bp_real)
	ld	$a0,$S2+0($sp)
	ld	$a1,$S2+8($sp)
	ld	$a2,$S2+16($sp)
	ld	$a3,$S2+24($sp)
	addi	$bp,$bp_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, S2, in2_y);

	addi	$bp,$ap_real,32
	ld	$a0,$H+0($sp)		# forward load for p256_sqr_mont
	ld	$a1,$H+8($sp)
	ld	$a2,$H+16($sp)
	ld	$a3,$H+24($sp)
	addi	$rp,$sp,$R
	bl	__ecp_nistz256_sub_from	# p256_sub(R, S2, in1_y);

	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Hsqr, H);

	ld	$a0,$R+0($sp)
	ld	$a1,$R+8($sp)
	ld	$a2,$R+16($sp)
	ld	$a3,$R+24($sp)
	addi	$rp,$sp,$Rsqr
	bl	__ecp_nistz256_sqr_mont	# p256_sqr_mont(Rsqr, R);

	ld	$bi,$H($sp)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$sp,$H
	addi	$rp,$sp,$Hcub
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(Hcub, Hsqr, H);

	ld	$bi,0($ap_real)
	ld	$a0,$Hsqr+0($sp)
	ld	$a1,$Hsqr+8($sp)
	ld	$a2,$Hsqr+16($sp)
	ld	$a3,$Hsqr+24($sp)
	addi	$bp,$ap_real,0
	addi	$rp,$sp,$U2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(U2, in1_x, Hsqr);

	mr	$t0,$acc0
	mr	$t1,$acc1
	mr	$t2,$acc2
	mr	$t3,$acc3
	addi	$rp,$sp,$Hsqr
	bl	__ecp_nistz256_add	# p256_mul_by_2(Hsqr, U2);

	addi	$bp,$sp,$Rsqr
	addi	$rp,$sp,$res_x
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_x, Rsqr, Hsqr);

	addi	$bp,$sp,$Hcub
	bl	__ecp_nistz256_sub_from	# p256_sub(res_x, res_x, Hcub);

	addi	$bp,$sp,$U2
	ld	$bi,32($ap_real)	# forward load for p256_mul_mont
	ld	$a0,$Hcub+0($sp)
	ld	$a1,$Hcub+8($sp)
	ld	$a2,$Hcub+16($sp)
	ld	$a3,$Hcub+24($sp)
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_sub_morf	# p256_sub(res_y, U2, res_x);

	addi	$bp,$ap_real,32
	addi	$rp,$sp,$S2
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(S2, in1_y, Hcub);

	ld	$bi,$R($sp)
	ld	$a0,$res_y+0($sp)
	ld	$a1,$res_y+8($sp)
	ld	$a2,$res_y+16($sp)
	ld	$a3,$res_y+24($sp)
	addi	$bp,$sp,$R
	addi	$rp,$sp,$res_y
	bl	__ecp_nistz256_mul_mont	# p256_mul_mont(res_y, res_y, R);

	addi	$bp,$sp,$S2
	bl	__ecp_nistz256_sub_from	# p256_sub(res_y, res_y, S2);

	ld	$t0,0($bp_real)		# in2
	ld	$t1,8($bp_real)
	ld	$t2,16($bp_real)
	ld	$t3,24($bp_real)
	ld	$a0,$res_x+0($sp)	# res
	ld	$a1,$res_x+8($sp)
	ld	$a2,$res_x+16($sp)
	ld	$a3,$res_x+24($sp)
___
# The branch-free selection below is emitted for $i = 0 and $i = 32, i.e.
# for the first two 32-byte coordinates of the result.  In each pass
#   $t*   = $in1infty ? res  ($a*) : in2 ($t*)
#   $acc* = $in2infty ? $t*        : in1 ($acc*)
# with each mask being all-ones or all-zero, and $acc* is what gets stored
# to $rp_real at offset $i.
for($i=0;$i<64;$i+=32) {		# conditional moves
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
___
# After the first pass: preload the next four in2 words (offsets 32..56)
# into $t* for the second pass.
$code.=<<___	if ($i==0);
	ld	$t0,32($bp_real)	# in2
	ld	$t1,40($bp_real)
	ld	$t2,48($bp_real)
	ld	$t3,56($bp_real)
___
# After the second pass nothing more is loaded from in2; the constant
# tagged "Lone_mont" (1, ~$poly1, -1, ~$poly3) is materialised in $t*
# instead.  NOTE(review): presumably this is 1 in Montgomery form standing
# in for the third coordinate of the affine input -- confirm.
$code.=<<___	if ($i==32);
	li	$t0,1			# Lone_mont
	not	$t1,$poly1
	li	$t2,-1
	not	$t3,$poly3
___
# Preload the next 32-byte result slot from the stack (the slot following
# the one just selected) and store the words selected in this pass.
$code.=<<___;
	ld	$a0,$res_x+$i+32($sp)
	ld	$a1,$res_x+$i+40($sp)
	ld	$a2,$res_x+$i+48($sp)
	ld	$a3,$res_x+$i+56($sp)
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
___
}
# $i is 64 here (the loop's exit value), so this last copy of the
# selection sequence handles offsets 64..88 and has no preload step.
$code.=<<___;
	ld	$acc0,$i+0($ap_real)	# in1
	ld	$acc1,$i+8($ap_real)
	ld	$acc2,$i+16($ap_real)
	ld	$acc3,$i+24($ap_real)
	andc	$t0,$t0,$in1infty
	andc	$t1,$t1,$in1infty
	andc	$t2,$t2,$in1infty
	andc	$t3,$t3,$in1infty
	and	$a0,$a0,$in1infty
	and	$a1,$a1,$in1infty
	and	$a2,$a2,$in1infty
	and	$a3,$a3,$in1infty
	or	$t0,$t0,$a0
	or	$t1,$t1,$a1
	or	$t2,$t2,$a2
	or	$t3,$t3,$a3
	andc	$acc0,$acc0,$in2infty
	andc	$acc1,$acc1,$in2infty
	andc	$acc2,$acc2,$in2infty
	andc	$acc3,$acc3,$in2infty
	and	$t0,$t0,$in2infty
	and	$t1,$t1,$in2infty
	and	$t2,$t2,$in2infty
	and	$t3,$t3,$in2infty
	or	$acc0,$acc0,$t0
	or	$acc1,$acc1,$t1
	or	$acc2,$acc2,$t2
	or	$acc3,$acc3,$t3
	std	$acc0,$i+0($rp_real)
	std	$acc1,$i+8($rp_real)
	std	$acc2,$i+16($rp_real)
	std	$acc3,$i+24($rp_real)
mtlr	r0
	ld	r16,$FRAME-8*16($sp)
	ld	r17,$FRAME-8*15($sp)
	ld	r18,$FRAME-8*14($sp)
	ld	r19,$FRAME-8*13($sp)
	ld	r20,$FRAME-8*12($sp)
	ld	r21,$FRAME-8*11($sp)
	ld	r22,$FRAME-8*10($sp)
	ld	r23,$FRAME-8*9($sp)
	ld	r24,$FRAME-8*8($sp)
	ld	r25,$FRAME-8*7($sp)
	ld	r26,$FRAME-8*6($sp)
	ld	r27,$FRAME-8*5($sp)
	ld	r28,$FRAME-8*4($sp)
	ld	r29,$FRAME-8*3($sp)
	ld	r30,$FRAME-8*2($sp)
	ld	r31,$FRAME-8*1($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,16,3,0
	.long	0
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}
# Multiplication and squaring with the four modulus words $ord0..$ord3
# (the values subtracted at the "ret -= modulus" step of each routine).
if (1) {
# $ordk, $ord0, $ord1 and the extra temporary $t4 live in r18..r21.
# $ord2/$ord3 reuse the registers of $poly1/$poly3, and $zr is r0, which
# both routines load with 0 ("li $zr,0") and use as the zero operand of
# carry/borrow propagating instructions.
my ($ordk,$ord0,$ord1,$t4) = map("r$_",(18..21));
my ($ord2,$ord3,$zr) = ($poly1,$poly3,"r0");

# ecp_nistz256_ord_mul_mont: allocates a 160-byte frame, saves r18-r31,
# loads a[0..3] and b[0], builds the 64-bit constants shown in the trailing
# comments out of 16-bit immediates, then computes a[]*b[0].
# $t4 = $acc0*$ordk is the multiplier consumed by the reduction step that
# follows.  NOTE(review): $ordk is presumably the Montgomery constant
# -1/modulus mod 2^64 -- confirm against the C reference.
$code.=<<___;
########################################################################
# void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
#                                uint64_t b[4]);
.globl	ecp_nistz256_ord_mul_mont
.align	5
ecp_nistz256_ord_mul_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	ld	$a0,0($ap)
	ld	$bi,0($bp)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi

	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi

	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi

	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$acc4,$a3,$bi

	mulld	$t4,$acc0,$ordk

	addc	$acc1,$acc1,$t0		# accumulate high parts of multiplication
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	addze	$acc4,$acc4
	li	$acc5,0
___
# Emitted three times ($i = 1..3): one reduction step driven by the $t4
# computed in the previous round, interleaved with the multiplication of
# a[] by b[$i] and the computation of the next $t4.
for ($i=1;$i<4;$i++) {
	################################################################
	#            ffff0000.ffffffff.yyyyyyyy.zzzzzzzz
	# *                                     abcdefgh
	# + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx
	# - 0000abcd.efgh0000.abcdefgh.00000000.00000000
	# + abcdefgh.abcdefgh.yzayzbyz.cyzdyzey.zfyzgyzh
$code.=<<___;
	ld	$bi,8*$i($bp)		# b[i]

	sldi	$t0,$t4,32
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	mulld	$t0,$a0,$bi
	addze	$t3,$t3
	mulld	$t1,$a1,$bi

	addc	$acc0,$acc1,$t2
	mulld	$t2,$a2,$bi
	adde	$acc1,$acc2,$t3
	mulld	$t3,$a3,$bi
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	addc	$acc0,$acc0,$t0		# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$a1,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$a2,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$a3,$bi
	addze	$acc4,$acc4
	mulld	$t4,$acc0,$ordk
	addc	$acc1,$acc1,$t0		# accumulate high parts
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3
	addze	$acc5,$zr
___
}
# Tail of ecp_nistz256_ord_mul_mont: the last reduction step, a subtraction
# of the modulus that is undone through the borrow mask left in $acc4,
# the store of the four result words and the epilogue.  The same heredoc
# then opens ecp_nistz256_ord_sqr_mont, whose third argument is moved to
# the count register ("mtctr $bp") as the iteration count of .Loop_ord_sqr.
$code.=<<___;
	sldi	$t0,$t4,32		# last reduction
	subfc	$acc2,$t4,$acc2
	srdi	$t1,$t4,32
	subfe	$acc3,$t0,$acc3
	subfe	$acc4,$t1,$acc4
	subfe	$acc5,$zr,$acc5

	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$acc4,$t4
	addze	$acc4,$acc5

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$acc0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$acc1,$acc1,$t1
	adde	$acc2,$acc2,$acc4
	adde	$acc3,$acc3,$t3

	std	$acc0,0($rp)
	std	$acc1,8($rp)
	std	$acc2,16($rp)
	std	$acc3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

################################################################################
# void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
#                                int rep);
.globl	ecp_nistz256_ord_sqr_mont
.align	5
ecp_nistz256_ord_sqr_mont:
	stdu	$sp,-160($sp)
	std	r18,48($sp)
	std	r19,56($sp)
	std	r20,64($sp)
	std	r21,72($sp)
	std	r22,80($sp)
	std	r23,88($sp)
	std	r24,96($sp)
	std	r25,104($sp)
	std	r26,112($sp)
	std	r27,120($sp)
	std	r28,128($sp)
	std	r29,136($sp)
	std	r30,144($sp)
	std	r31,152($sp)

	mtctr	$bp

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	lis	$ordk,0xccd1
	lis	$ord0,0xf3b9
	lis	$ord1,0xbce6
	ori	$ordk,$ordk,0xc8aa
	ori	$ord0,$ord0,0xcac2
	ori	$ord1,$ord1,0xfaad
	sldi	$ordk,$ordk,32
	sldi	$ord0,$ord0,32
	sldi	$ord1,$ord1,32
	oris	$ordk,$ordk,0xee00
	oris	$ord0,$ord0,0xfc63
	oris	$ord1,$ord1,0xa717
	ori	$ordk,$ordk,0xbc4f	# 0xccd1c8aaee00bc4f
	ori	$ord0,$ord0,0x2551	# 0xf3b9cac2fc632551
	ori	$ord1,$ord1,0x9e84	# 0xbce6faada7179e84
	li	$ord2,-1		# 0xffffffffffffffff
	sldi	$ord3,$ord2,32		# 0xffffffff00000000
	li	$zr,0
	b	.Loop_ord_sqr

.align	5
.Loop_ord_sqr:
	################################################################
	#  |  |  |  |  |  |a1*a0|  |
	#  |  |  |  |  |a2*a0|  |  |
	#  |  |a3*a2|a3*a0|  |  |  |
	#  |  |  |  |a2*a1|  |  |  |
	#  |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
	#  "can't overflow" below mark carrying into high part of
	#  multiplication result, which can't overflow, because it
	#  can never be all ones.
mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zr

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	mulld	$t4,$acc0,$ordk
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3
___
# Four reduction steps are emitted.  For $i < 3 the multiplier for the
# next step is computed into $t3 while $t4 is still needed by the shifts
# that follow, hence the Perl-level swap of the two names at the bottom of
# the loop.  After four swaps $t3/$t4 name their original registers again.
for($i=0; $i<4; $i++) {			# reductions
$code.=<<___;
	addic	$t0,$acc0,-1		# discarded
	mulhdu	$t1,$ord0,$t4
	mulld	$t2,$ord1,$t4
	mulhdu	$t3,$ord1,$t4

	adde	$t2,$t2,$t1
	addze	$t3,$t3

	addc	$acc0,$acc1,$t2
	adde	$acc1,$acc2,$t3
	adde	$acc2,$acc3,$t4
	adde	$acc3,$zr,$t4		# can't overflow
___
$code.=<<___	if ($i<3);
	mulld	$t3,$acc0,$ordk
___
$code.=<<___;
	sldi	$t0,$t4,32
	subfc	$acc1,$t4,$acc1
	srdi	$t1,$t4,32
	subfe	$acc2,$t0,$acc2
	subfe	$acc3,$t1,$acc3		# can't borrow
___
	($t3,$t4) = ($t4,$t3);
}
# Add the upper half of the square, subtract the modulus and add it back
# under the borrow mask in $acc4.  The result is written to $a0..$a3 so
# that "bdnz" can feed it straight into the next squaring; it is stored to
# $rp only after the loop has run the number of times set by "mtctr".
$code.=<<___;
	addc	$acc0,$acc0,$acc4	# accumulate upper half
	adde	$acc1,$acc1,$acc5
	adde	$acc2,$acc2,$acc6
	adde	$acc3,$acc3,$acc7
	addze	$acc4,$zr

	subfc	$acc0,$ord0,$acc0	# ret -= modulus
	subfe	$acc1,$ord1,$acc1
	subfe	$acc2,$ord2,$acc2
	subfe	$acc3,$ord3,$acc3
	subfe	$acc4,$zr,$acc4

	and	$t0,$ord0,$acc4
	and	$t1,$ord1,$acc4
	addc	$a0,$acc0,$t0		# ret += modulus if borrow
	and	$t3,$ord3,$acc4
	adde	$a1,$acc1,$t1
	adde	$a2,$acc2,$acc4
	adde	$a3,$acc3,$t3

	bdnz	.Loop_ord_sqr

	std	$a0,0($rp)
	std	$a1,8($rp)
	std	$a2,16($rp)
	std	$a3,24($rp)

	ld	r18,48($sp)
	ld	r19,56($sp)
	ld	r20,64($sp)
	ld	r21,72($sp)
	ld	r22,80($sp)
	ld	r23,88($sp)
	ld	r24,96($sp)
	ld	r25,104($sp)
	ld	r26,112($sp)
	ld	r27,120($sp)
	ld	r28,128($sp)
	ld	r29,136($sp)
	ld	r30,144($sp)
	ld	r31,152($sp)
	addi	$sp,$sp,160
	blr
	.long	0
	.byte	0,12,4,0,0x80,14,3,0
	.long	0
.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
___
}	# end of the if (1) holding the two ord_* routines
}	# end of the block that declared \$rp, \$ap, \$bp, ... near the top

########################################################################
# scatter-gather subroutines
{
# $out, $inp and $index are r3, r4 and r5.  map() yields five names
# (r3..r7) for four variables, so $mask is r6 and r7 is dropped; $mask is
# not referenced by the code below, which uses r0 and r5..r12 as scratch
# registers directly.
#
# Table layout, as implemented by the code below:
#  - scatter_w5 splits every 64-bit word of the three coordinates into two
#    32-bit halves: the low halves of four words go to rows 0..3 and the
#    high halves to rows 4..7 of a 64-byte-stride table, at byte offset
#    4*index-4; each coordinate takes eight rows.
#  - scatter_w7 runs eight iterations, each storing the eight bytes of one
#    64-bit word at 64-byte stride, starting at byte offset index.
#  - gather_w5 first builds r0 = all-ones if index is positive (as a
#    64-bit signed value), else 0 ("neg" + "sradi 63").  The mask is
#    added to index, i.e. index is decremented when the mask is set, and
#    ANDed into every word before it is stored, so index 0 produces
#    all-zero output.
my ($out,$inp,$index,$mask)=map("r$_",(3..7));
$code.=<<___;
########################################################################
# void	ecp_nistz256_scatter_w5(void *out, const P256_POINT *inp,
#				int index);
.globl	ecp_nistz256_scatter_w5
.align	4
ecp_nistz256_scatter_w5:
	slwi	$index,$index,2
	add	$out,$out,$index

	ld	r8, 0($inp)		# X
	ld	r9, 8($inp)
	ld	r10,16($inp)
	ld	r11,24($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 32($inp)		# Y
	ld	r9, 40($inp)
	ld	r10,48($inp)
	ld	r11,56($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)
	addi	$out,$out,64*8

	ld	r8, 64($inp)		# Z
	ld	r9, 72($inp)
	ld	r10,80($inp)
	ld	r11,88($inp)

	stw	r8, 64*0-4($out)
	srdi	r8, r8, 32
	stw	r9, 64*1-4($out)
	srdi	r9, r9, 32
	stw	r10,64*2-4($out)
	srdi	r10,r10,32
	stw	r11,64*3-4($out)
	srdi	r11,r11,32
	stw	r8, 64*4-4($out)
	stw	r9, 64*5-4($out)
	stw	r10,64*6-4($out)
	stw	r11,64*7-4($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

########################################################################
# void	ecp_nistz256_gather_w5(P256_POINT *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w5
.align	4
ecp_nistz256_gather_w5:
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	slwi	$index,$index,2
	add	$inp,$inp,$index

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,0($out)		# X
	std	r6,8($out)
	std	r7,16($out)
	std	r8,24($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	addi	$inp,$inp,64*8
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,32($out)		# Y
	std	r6,40($out)
	std	r7,48($out)
	std	r8,56($out)

	lwz	r5, 64*0($inp)
	lwz	r6, 64*1($inp)
	lwz	r7, 64*2($inp)
	lwz	r8, 64*3($inp)
	lwz	r9, 64*4($inp)
	lwz	r10,64*5($inp)
	lwz	r11,64*6($inp)
	lwz	r12,64*7($inp)
	sldi	r9, r9, 32
	sldi	r10,r10,32
	sldi	r11,r11,32
	sldi	r12,r12,32
	or	r5,r5,r9
	or	r6,r6,r10
	or	r7,r7,r11
	or	r8,r8,r12
	and	r5,r5,r0
	and	r6,r6,r0
	and	r7,r7,r0
	and	r8,r8,r0
	std	r5,64($out)		# Z
	std	r6,72($out)
	std	r7,80($out)
	std	r8,88($out)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

########################################################################
# void	ecp_nistz256_scatter_w7(void *out, const P256_POINT_AFFINE *inp,
#				int index);
.globl	ecp_nistz256_scatter_w7
.align	4
ecp_nistz256_scatter_w7:
	li	r0,8
	mtctr	r0
	add	$out,$out,$index
	subi	$inp,$inp,8

.Loop_scatter_w7:
	ldu	r0,8($inp)
	stb	r0,64*0($out)
	srdi	r0,r0,8
stb	r0,64*1($out)
	srdi	r0,r0,8
	stb	r0,64*2($out)
	srdi	r0,r0,8
	stb	r0,64*3($out)
	srdi	r0,r0,8
	stb	r0,64*4($out)
	srdi	r0,r0,8
	stb	r0,64*5($out)
	srdi	r0,r0,8
	stb	r0,64*6($out)
	srdi	r0,r0,8
	stb	r0,64*7($out)
	addi	$out,$out,64*8
	bdnz	.Loop_scatter_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

########################################################################
# void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *out, const void *inp,
#				int index);
.globl	ecp_nistz256_gather_w7
.align	4
ecp_nistz256_gather_w7:
	li	r0,8
	mtctr	r0
	neg	r0,$index
	sradi	r0,r0,63

	add	$index,$index,r0
	add	$inp,$inp,$index
	subi	$out,$out,8

.Loop_gather_w7:
	lbz	r5, 64*0($inp)
	lbz	r6, 64*1($inp)
	lbz	r7, 64*2($inp)
	lbz	r8, 64*3($inp)
	lbz	r9, 64*4($inp)
	lbz	r10,64*5($inp)
	lbz	r11,64*6($inp)
	lbz	r12,64*7($inp)
	addi	$inp,$inp,64*8

	sldi	r6, r6, 8
	sldi	r7, r7, 16
	sldi	r8, r8, 24
	sldi	r9, r9, 32
	sldi	r10,r10,40
	sldi	r11,r11,48
	sldi	r12,r12,56

	or	r5,r5,r6
	or	r7,r7,r8
	or	r9,r9,r10
	or	r11,r11,r12
	or	r5,r5,r7
	or	r9,r9,r11
	or	r5,r5,r9
	and	r5,r5,r0
	stdu	r5,8($out)
	bdnz	.Loop_gather_w7

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}

# Post-process the accumulated assembly one line at a time: any text
# between backticks is evaluated as a Perl expression and replaced by its
# value, then the line is written to STDOUT, which the top of this file
# aliased to the pipe feeding ppc-xlate.pl.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	print $_,"\n";
}

# Closing the pipe flushes the remaining output and waits for the child.
# close() returns false if the flush fails or if ppc-xlate.pl exits with a
# non-zero status; without this check such a failure would leave a
# truncated or empty output file while this script still exited 0.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush