openssl/crypto/poly1305/asm/poly1305-ppc.pl
Andy Polyakov 41013cd63c PPC assembly pack: correct POWER9 results.
As it turns out originally published results were skewed by "turbo"
mode. VM apparently remains oblivious to dynamic frequency scaling,
and reports that processor operates at "base" frequency at all times.
While actual frequency gets increased under load.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6406)
2018-06-03 21:20:06 +02:00

645 lines
13 KiB
Raku
Executable file

#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# -m32 -m64
#
# Freescale e300 14.8/+80% -
# PPC74x0 7.60/+60% -
# PPC970 7.00/+114% 3.51/+205%
# POWER7 3.75/+260% 1.93/+100%
# POWER8 - 2.03/+200%
# POWER9 - 2.00/+150%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is a limit even for ideally optimized (and even vectorized) code.
# And this limit is estimated to be higher than above -m64 results. Or
# in other words floating-point implementation can be meaningful to
# consider only in 32-bit application context. We probably have to
# recognize that 32-bit builds are getting less popular on high-end
# systems and therefore tend to target embedded ones, which might not
# even have FPU...
#
# On a side note, Power ISA 2.07 enables vector base 2^26 implementation,
# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
# First command-line argument: the perlasm "flavour" (e.g. linux64le).
# It selects the word size, the link-register save offset and the
# width-specific mnemonics interpolated into the generated assembly.
$flavour = shift;

if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $LRSAVE = 2*$SIZE_T;
    $UCMP   = "cmpld";
    $STU    = "stdu";
    $POP    = "ld";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $LRSAVE = $SIZE_T;
    $UCMP   = "cmplw";
    $STU    = "stwu";
    $POP    = "lwz";
    $PUSH   = "stw";
} else {
    die "nonsense $flavour";
}

# Define endianness based on flavour
# i.e.: linux64le
# The value is $SIZE_T rather than 1; it is only tested for truth below.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Directory part of this script's path (empty when $0 has no separator).
# Taking $1 only on a successful match avoids picking up a stale capture.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script, then in ../../perlasm/.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything printed on STDOUT through ppc-xlate.pl; the next
# command-line argument is handed to it verbatim.  The parentheses and the
# low-precedence "or" make sure a failed open actually reaches die: with
# "||" the test applied to the (always true) command string instead.
open(STDOUT, "| $^X $xlate $flavour " . shift)
    or die "can't call $xlate: $!";

# Stack frame size used by the routines that save non-volatile registers.
$FRAME=24*$SIZE_T;
$sp="r1";

# r3-r6 carry the routines' arguments.  poly1305_emit reuses the second and
# third of them under the names $mac and $nonce.
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($mac,$nonce)=($inp,$len);
my $mask = "r0";

# Start of the generated assembly text.
$code=<<___;
.machine "any"
.text
___
# Flavours containing "64" get the base 2^64 code path; everything else
# falls through to the base 2^32 branch further down.
if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation
# Register names for the 64-bit routines: $h0-$h2 and $d0-$d2 map to r7-r12,
# $r0, $r1, $s1, $t0 and $t1 map to r27-r31.
my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
# poly1305_init_int entry: zero the three 64-bit hash words at offsets
# 0/8/16 of $ctx, then skip to Lno_key when the key pointer ($inp) is zero.
$code.=<<___;
.globl .poly1305_init_int
.align 4
.poly1305_init_int:
xor r0,r0,r0
std r0,0($ctx) # zero hash value
std r0,8($ctx)
std r0,16($ctx)
$UCMP $inp,r0
beq- Lno_key
___
# Little-endian flavours: read the 16 key bytes as two native doublewords.
$code.=<<___ if ($LITTLE_ENDIAN);
ld $d0,0($inp) # load key material
ld $d1,8($inp)
___
# Big-endian flavours: read four byte-reversed words (lwbrx) at offsets
# 0/4/8/12 and merge each pair into one 64-bit value with insrdi.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $h0,4
lwbrx $d0,0,$inp # load key material
li $d1,8
lwbrx $h0,$h0,$inp
li $h1,12
lwbrx $d1,$d1,$inp
lwbrx $h1,$h1,$inp
insrdi $d0,$h0,32,0
insrdi $d1,$h1,32,0
___
# Tail of poly1305_init_int followed by the prologue of poly1305_blocks.
#
# init tail: build the masks 0x0ffffffc0ffffffc / 0x0ffffffc0fffffff, apply
# them to the two key doublewords, store the result at 32($ctx) and 40($ctx),
# and return 0 in r3 (also the Lno_key exit).
#
# blocks prologue: $len is shifted right by 4 (number of 16-byte blocks);
# a zero count leaves through Labort before any frame is set up.  Otherwise
# a $FRAME-byte frame is allocated, r27-r31 and LR are saved, key and hash
# are loaded from $ctx, s1 = r1 + (r1>>2) is precomputed, CTR is loaded with
# the block count and $mask is set to 3 for the reduction step in the loop.
$code.=<<___;
lis $h1,0xfff # 0x0fff0000
ori $h1,$h1,0xfffc # 0x0ffffffc
insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc
ori $h0,$h1,3 # 0x0ffffffc0fffffff
and $d0,$d0,$h0
and $d1,$d1,$h1
std $d0,32($ctx) # store key
std $d1,40($ctx)
Lno_key:
xor r3,r3,r3
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.size .poly1305_init_int,.-.poly1305_init_int
.globl .poly1305_blocks
.align 4
.poly1305_blocks:
srdi. $len,$len,4
beq- Labort
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
ld $r0,32($ctx) # load key
ld $r1,40($ctx)
ld $h0,0($ctx) # load hash value
ld $h1,8($ctx)
ld $h2,16($ctx)
srdi $s1,$r1,2
mtctr $len
add $s1,$s1,$r1 # s1 = r1 + r1>>2
li $mask,3
b Loop
.align 4
Loop:
___
# Per-iteration input load, little-endian: two native doubleword loads.
$code.=<<___ if ($LITTLE_ENDIAN);
ld $t0,0($inp) # load input
ld $t1,8($inp)
___
# Per-iteration input load, big-endian: four byte-reversed word loads
# merged pairwise into $t0 and $t1 ($d0/$d1 are used as scratch here).
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d0,4
lwbrx $t0,0,$inp # load input
li $t1,8
lwbrx $d0,$d0,$inp
li $d1,12
lwbrx $t1,$t1,$inp
lwbrx $d1,$d1,$inp
insrdi $t0,$d0,32,0
insrdi $t1,$d1,32,0
___
# Loop body and epilogue of poly1305_blocks, then the start of poly1305_emit.
#
# Loop body: advance $inp by 16, add the loaded block and $padbit into
# h0:h1:h2 with carry, then multiply by the key using mulld/mulhdu pairs on
# r0, r1 and s1, accumulating into d0:d1:d2.  The "final reduction step"
# splits d2 with $mask (3): the low two bits become the new h2, and
# (d2 & ~3) + ((d2 & ~3) >> 2) is added back into h0 with the carry rippled
# through h1 and h2.  bdnz repeats for the block count held in CTR.
#
# Epilogue: store h0-h2 back to $ctx, restore r27-r31, release the frame.
#
# poly1305_emit (first part): load the hash and two nonce doublewords,
# compute hash+5 with carry into d0:d1:d2, derive a select mask from the
# negated value of d2>>2, and pick either the original or the +5 value for
# h0/h1 with andc/and/or.
$code.=<<___;
addi $inp,$inp,16
addc $h0,$h0,$t0 # accumulate input
adde $h1,$h1,$t1
mulld $d0,$h0,$r0 # h0*r0
mulhdu $d1,$h0,$r0
adde $h2,$h2,$padbit
mulld $t0,$h1,$s1 # h1*5*r1
mulhdu $t1,$h1,$s1
addc $d0,$d0,$t0
adde $d1,$d1,$t1
mulld $t0,$h0,$r1 # h0*r1
mulhdu $d2,$h0,$r1
addc $d1,$d1,$t0
addze $d2,$d2
mulld $t0,$h1,$r0 # h1*r0
mulhdu $t1,$h1,$r0
addc $d1,$d1,$t0
adde $d2,$d2,$t1
mulld $t0,$h2,$s1 # h2*5*r1
mulld $t1,$h2,$r0 # h2*r0
addc $d1,$d1,$t0
adde $d2,$d2,$t1
andc $t0,$d2,$mask # final reduction step
and $h2,$d2,$mask
srdi $t1,$t0,2
add $t0,$t0,$t1
addc $h0,$d0,$t0
addze $h1,$d1
addze $h2,$h2
bdnz Loop
std $h0,0($ctx) # store hash value
std $h1,8($ctx)
std $h2,16($ctx)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
Labort:
blr
.long 0
.byte 0,12,4,1,0x80,5,4,0
.size .poly1305_blocks,.-.poly1305_blocks
.globl .poly1305_emit
.align 4
.poly1305_emit:
ld $h0,0($ctx) # load hash
ld $h1,8($ctx)
ld $h2,16($ctx)
ld $padbit,0($nonce) # load nonce
ld $nonce,8($nonce)
addic $d0,$h0,5 # compare to modulus
addze $d1,$h1
addze $d2,$h2
srdi $mask,$d2,2 # did it carry/borrow?
neg $mask,$mask
andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h1,$h1,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
or $h1,$h1,$d1
___
# Big-endian only: rotate each nonce doubleword by 32 bits so its two
# 32-bit halves swap places before the addition.
$code.=<<___ if (!$LITTLE_ENDIAN);
rotldi $padbit,$padbit,32 # flip nonce words
rotldi $nonce,$nonce,32
___
# Add the 128-bit nonce to h0:h1 with carry between the halves.
$code.=<<___;
addc $h0,$h0,$padbit # accumulate nonce
adde $h1,$h1,$nonce
___
# Little-endian: the tag is written as two native doublewords.
$code.=<<___ if ($LITTLE_ENDIAN);
std $h0,0($mac) # write result
std $h1,8($mac)
___
# Big-endian: the tag is written as four byte-reversed words (stwbrx) at
# offsets 0/4/8/12; extrdi extracts the upper half of each doubleword.
$code.=<<___ if (!$LITTLE_ENDIAN);
extrdi r0,$h0,32,0
li $d0,4
stwbrx $h0,0,$mac # write result
extrdi $h0,$h1,32,0
li $d1,8
stwbrx r0,$d0,$mac
li $d2,12
stwbrx $h1,$d1,$mac
stwbrx $h0,$d2,$mac
___
# Return from poly1305_emit and close the routine with its .size directive.
$code.=<<___;
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.size .poly1305_emit,.-.poly1305_emit
___
} else {
###############################################################################
# base 2^32 implementation
# Register names for the 32-bit routines, assigned in order from r7-r12 and
# r14-r31 (r13 is skipped): hash words $h0-$h4, key words $r0-$r3,
# precomputed $s1-$s3, temporaries $t0-$t3, high product halves $D0-$D3 and
# low product halves $d0-$d3.
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
$t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
) = map("r$_",(7..12,14..31));
# poly1305_init_int entry: zero the five 32-bit hash words at offsets
# 0..16 of $ctx, then skip to Lno_key when the key pointer ($inp) is zero.
$code.=<<___;
.globl .poly1305_init_int
.align 4
.poly1305_init_int:
xor r0,r0,r0
stw r0,0($ctx) # zero hash value
stw r0,4($ctx)
stw r0,8($ctx)
stw r0,12($ctx)
stw r0,16($ctx)
$UCMP $inp,r0
beq- Lno_key
___
# Little-endian key load for the 32-bit poly1305_init_int: the four key
# words are read in native byte order.  The mnemonic must be "lwz" (load
# word and zero), as used by every other 32-bit load in this file; a bare
# "lw" is not a PowerPC instruction and the output would fail to assemble.
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $h0,0($inp) # load key material
lwz $h1,4($inp)
lwz $h2,8($inp)
lwz $h3,12($inp)
___
# Big-endian key load for the 32-bit poly1305_init_int: four byte-reversed
# word loads (lwbrx) at offsets 0/4/8/12; each destination register first
# holds its own offset.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $h1,4
lwbrx $h0,0,$inp # load key material
li $h2,8
lwbrx $h1,$h1,$inp
li $h3,12
lwbrx $h2,$h2,$inp
lwbrx $h3,$h3,$inp
___
# Tail of poly1305_init_int followed by the prologue of poly1305_blocks.
#
# init tail: $mask becomes 0xf0000000 and $r0 becomes 0x0ffffffc; the top
# four bits are cleared in the first key word, the other three words are
# ANDed with 0x0ffffffc, and the result is stored at 32..44($ctx).  The
# routine returns 0 in r3 (also the Lno_key exit).
#
# blocks prologue: $len is shifted right by 4 (number of 16-byte blocks);
# a zero count leaves through Labort before any frame is set up.  Otherwise
# a $FRAME-byte frame is allocated, r14-r31 and LR are saved, key words
# r0-r3 and hash words h0-h4 are loaded from $ctx, si = ri + (ri>>2) is
# precomputed for i = 1..3, CTR is loaded with the block count and $mask is
# set to 3 for the reduction step in the loop.
$code.=<<___;
lis $mask,0xf000 # 0xf0000000
li $r0,-4
andc $r0,$r0,$mask # 0x0ffffffc
andc $h0,$h0,$mask
and $h1,$h1,$r0
and $h2,$h2,$r0
and $h3,$h3,$r0
stw $h0,32($ctx) # store key
stw $h1,36($ctx)
stw $h2,40($ctx)
stw $h3,44($ctx)
Lno_key:
xor r3,r3,r3
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.size .poly1305_init_int,.-.poly1305_init_int
.globl .poly1305_blocks
.align 4
.poly1305_blocks:
srwi. $len,$len,4
beq- Labort
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $r0,32($ctx) # load key
lwz $r1,36($ctx)
lwz $r2,40($ctx)
lwz $r3,44($ctx)
lwz $h0,0($ctx) # load hash value
lwz $h1,4($ctx)
lwz $h2,8($ctx)
lwz $h3,12($ctx)
lwz $h4,16($ctx)
srwi $s1,$r1,2
srwi $s2,$r2,2
srwi $s3,$r3,2
add $s1,$s1,$r1 # si = ri + ri>>2
add $s2,$s2,$r2
add $s3,$s3,$r3
mtctr $len
li $mask,3
b Loop
.align 4
Loop:
___
# Per-iteration input load, little-endian: four native word loads into
# $d0-$d3.
$code.=<<___ if ($LITTLE_ENDIAN);
lwz $d0,0($inp) # load input
lwz $d1,4($inp)
lwz $d2,8($inp)
lwz $d3,12($inp)
___
# Per-iteration input load, big-endian: four byte-reversed word loads
# (lwbrx); each destination register first holds its own offset.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d1,4
lwbrx $d0,0,$inp # load input
li $d2,8
lwbrx $d1,$d1,$inp
li $d3,12
lwbrx $d2,$d2,$inp
lwbrx $d3,$d3,$inp
___
# Loop body and epilogue of the 32-bit poly1305_blocks, then the first part
# of the 32-bit poly1305_emit.
#
# Loop body: advance $inp by 16 and add the loaded block and $padbit into
# h0..h4 with a carry chain (interleaved with the first multiplications).
# Each product h[i]*r[j] or h[i]*s[j] is formed as a low/high pair with
# mullw/mulhwu and accumulated into $d0-$d3 (low words) and $D0-$D3 (high
# words); h4 contributes low products only.  The columns are then combined
# into h1..h4, and the "final reduction step" keeps the low two bits of h4
# and adds (h4 & ~3) + ((h4 & ~3) >> 2) into h0 with the carry rippled
# through h1..h4.  bdnz repeats for the block count held in CTR.
#
# Epilogue: store h0-h4 back to $ctx, restore r14-r31, release the frame.
#
# poly1305_emit (first part): allocate a $FRAME-byte frame, save r28-r31 and
# LR, load h0-h4, compute hash+5 with carry into d0..d3, derive a select
# mask from the negated value of (h4 + carry) >> 2, and pick either the
# original or the +5 value for each word with andc/and/or.  The four nonce
# words are loaded in between and finally added with carry.
$code.=<<___;
addi $inp,$inp,16
addc $h0,$h0,$d0 # accumulate input
adde $h1,$h1,$d1
adde $h2,$h2,$d2
mullw $d0,$h0,$r0 # h0*r0
mulhwu $D0,$h0,$r0
mullw $d1,$h0,$r1 # h0*r1
mulhwu $D1,$h0,$r1
mullw $d2,$h0,$r2 # h0*r2
mulhwu $D2,$h0,$r2
adde $h3,$h3,$d3
adde $h4,$h4,$padbit
mullw $d3,$h0,$r3 # h0*r3
mulhwu $D3,$h0,$r3
mullw $t0,$h1,$s3 # h1*s3
mulhwu $t1,$h1,$s3
mullw $t2,$h1,$r0 # h1*r0
mulhwu $t3,$h1,$r0
addc $d0,$d0,$t0
adde $D0,$D0,$t1
mullw $t0,$h1,$r1 # h1*r1
mulhwu $t1,$h1,$r1
addc $d1,$d1,$t2
adde $D1,$D1,$t3
mullw $t2,$h1,$r2 # h1*r2
mulhwu $t3,$h1,$r2
addc $d2,$d2,$t0
adde $D2,$D2,$t1
mullw $t0,$h2,$s2 # h2*s2
mulhwu $t1,$h2,$s2
addc $d3,$d3,$t2
adde $D3,$D3,$t3
mullw $t2,$h2,$s3 # h2*s3
mulhwu $t3,$h2,$s3
addc $d0,$d0,$t0
adde $D0,$D0,$t1
mullw $t0,$h2,$r0 # h2*r0
mulhwu $t1,$h2,$r0
addc $d1,$d1,$t2
adde $D1,$D1,$t3
mullw $t2,$h2,$r1 # h2*r1
mulhwu $t3,$h2,$r1
addc $d2,$d2,$t0
adde $D2,$D2,$t1
mullw $t0,$h3,$s1 # h3*s1
mulhwu $t1,$h3,$s1
addc $d3,$d3,$t2
adde $D3,$D3,$t3
mullw $t2,$h3,$s2 # h3*s2
mulhwu $t3,$h3,$s2
addc $d0,$d0,$t0
adde $D0,$D0,$t1
mullw $t0,$h3,$s3 # h3*s3
mulhwu $t1,$h3,$s3
addc $d1,$d1,$t2
adde $D1,$D1,$t3
mullw $t2,$h3,$r0 # h3*r0
mulhwu $t3,$h3,$r0
addc $d2,$d2,$t0
adde $D2,$D2,$t1
mullw $t0,$h4,$s1 # h4*s1
addc $d3,$d3,$t2
adde $D3,$D3,$t3
addc $d1,$d1,$t0
mullw $t1,$h4,$s2 # h4*s2
addze $D1,$D1
addc $d2,$d2,$t1
addze $D2,$D2
mullw $t2,$h4,$s3 # h4*s3
addc $d3,$d3,$t2
addze $D3,$D3
mullw $h4,$h4,$r0 # h4*r0
addc $h1,$d1,$D0
adde $h2,$d2,$D1
adde $h3,$d3,$D2
adde $h4,$h4,$D3
andc $D0,$h4,$mask # final reduction step
and $h4,$h4,$mask
srwi $D1,$D0,2
add $D0,$D0,$D1
addc $h0,$d0,$D0
addze $h1,$h1
addze $h2,$h2
addze $h3,$h3
addze $h4,$h4
bdnz Loop
stw $h0,0($ctx) # store hash value
stw $h1,4($ctx)
stw $h2,8($ctx)
stw $h3,12($ctx)
stw $h4,16($ctx)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
Labort:
blr
.long 0
.byte 0,12,4,1,0x80,18,4,0
.size .poly1305_blocks,.-.poly1305_blocks
.globl .poly1305_emit
.align 4
.poly1305_emit:
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $h0,0($ctx) # load hash
lwz $h1,4($ctx)
lwz $h2,8($ctx)
lwz $h3,12($ctx)
lwz $h4,16($ctx)
addic $d0,$h0,5 # compare to modulus
addze $d1,$h1
addze $d2,$h2
addze $d3,$h3
addze $mask,$h4
srwi $mask,$mask,2 # did it carry/borrow?
neg $mask,$mask
andc $h0,$h0,$mask
and $d0,$d0,$mask
andc $h1,$h1,$mask
and $d1,$d1,$mask
or $h0,$h0,$d0
lwz $d0,0($nonce) # load nonce
andc $h2,$h2,$mask
and $d2,$d2,$mask
or $h1,$h1,$d1
lwz $d1,4($nonce)
andc $h3,$h3,$mask
and $d3,$d3,$mask
or $h2,$h2,$d2
lwz $d2,8($nonce)
or $h3,$h3,$d3
lwz $d3,12($nonce)
addc $h0,$h0,$d0 # accumulate nonce
adde $h1,$h1,$d1
adde $h2,$h2,$d2
adde $h3,$h3,$d3
___
# 32-bit poly1305_emit, little-endian: the tag is written to $mac as four
# native words.
$code.=<<___ if ($LITTLE_ENDIAN);
stw $h0,0($mac) # write result
stw $h1,4($mac)
stw $h2,8($mac)
stw $h3,12($mac)
___
# 32-bit poly1305_emit, big-endian: the tag is written as four byte-reversed
# words (stwbrx) at offsets 0/4/8/12, with $d1-$d3 holding the offsets.
$code.=<<___ if (!$LITTLE_ENDIAN);
li $d1,4
stwbrx $h0,0,$mac # write result
li $d2,8
stwbrx $h1,$d1,$mac
li $d3,12
stwbrx $h2,$d2,$mac
stwbrx $h3,$d3,$mac
___
# poly1305_emit epilogue: restore r28-r31, release the frame and return.
$code.=<<___;
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,4,3,0
.size .poly1305_emit,.-.poly1305_emit
___
}
# Identification string appended to the generated assembly.
$code.=<<___;
.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every backtick-quoted span in the assembly text with the result of
# evaluating it as a Perl expression (this is how the frame offsets written
# in terms of $FRAME, $SIZE_T and $LRSAVE become numbers).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;

# STDOUT is a pipe into ppc-xlate.pl.  Buffered write errors and a failing
# translator only surface when the pipe is closed, so the result of close
# must be checked; otherwise a truncated or missing output file would go
# unnoticed by the build.
close STDOUT or die "error closing STDOUT: $!";