openssl/crypto/poly1305/asm/poly1305-ppc.pl

#! /usr/bin/env perl
# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
#			-m32		-m64
#
# Freescale e300	14.8/+80%	-
# PPC74x0		7.60/+60%	-
# PPC970		7.00/+114%	3.51/+205%
# POWER7		3.75/+260%	1.93/+100%
# POWER8		-		2.03/+200%
# POWER9		-		2.00/+150%
#
# Do we need floating-point implementation for PPC? Results presented
# in poly1305_ieee754.c are tricky to compare to, because they are for
# compiler-generated code. On the other hand it's known that floating-
# point performance can be dominated by FPU latency, which means that
# there is a limit even for ideally optimized (and even vectorized) code.
# And this limit is estimated to be higher than above -m64 results. Or
# in other words floating-point implementation can be meaningful to
# consider only in 32-bit application context. We probably have to
# recognize that 32-bit builds are getting less popular on high-end
# systems and therefore tend to target embedded ones, which might not
# even have FPU...
#
# On a side note, Power ISA 2.07 enables a vector base 2^26 implementation,
# and POWER8 might have capacity to break 1.0 cycle per byte barrier...

# First command-line argument: the perlasm "flavour" (e.g. linux64le).
$flavour = shift;

# Word size in bytes plus the width-specific mnemonics interpolated into
# the assembly templates.  $LRSAVE is the offset added to $FRAME when the
# link register is stored.  /64/ is tried before /32/; anything else is
# rejected.
if ($flavour =~ /64/) {
	($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (8, "cmpld", "stdu", "ld", "std");
	$LRSAVE = 2*$SIZE_T;
}
elsif ($flavour =~ /32/) {
	($SIZE_T, $UCMP, $STU, $POP, $PUSH) = (4, "cmplw", "stwu", "lwz", "stw");
	$LRSAVE = $SIZE_T;
}
else {
	die "nonsense $flavour";
}

# Endianness follows the flavour name: non-zero (the word size) when it
# ends in "le", e.g. linux64le, and 0 otherwise.
$LITTLE_ENDIAN = 0;
$LITTLE_ENDIAN = $SIZE_T if ($flavour =~ /le$/);

# Find ppc-xlate.pl: first next to this script, then in ../../perlasm
# relative to it.  $dir is the directory part of $0, trailing separator
# included.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Send everything printed to STDOUT through the translator; the next
# command-line argument is appended to the translator's command line.
# The failure check has to use low-precedence "or": with "||" the die was
# attached to the (always true) command string, so a failed open went
# unreported.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";

# Stack frame size in bytes: 24 machine words.
$FRAME = $SIZE_T * 24;

# Register roles shared by the 64-bit and 32-bit implementations.
$sp = "r1";
my ($ctx, $inp, $len, $padbit) = ("r3", "r4", "r5", "r6");
my $mac   = $inp;	# same register as $inp; output pointer in poly1305_emit
my $nonce = $len;	# same register as $len; nonce pointer in poly1305_emit
my $mask  = "r0";

# Opening directives of the generated assembly.
$code = ".machine\t\"any\"\n" . ".text\n";
							if ($flavour =~ /64/) {
###############################################################################
# base 2^64 implementation
#
# Context layout used below: hash value as three 64-bit limbs at
# 0/8/16($ctx), clamped key as two 64-bit limbs at 32/40($ctx).
#
# Register allocation: h0-h2 and d0-d2 are r7-r12; r0, r1, s1, t0 and t1
# are r27-r31, which poly1305_blocks saves and restores.  Beware that the
# Perl variable $r0 is a key limb kept in r27, whereas a literal "r0" in
# the templates is the real register r0 (also reachable as $mask).

my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));

# poly1305_init_int, part 1: zero the three hash limbs, then branch to
# Lno_key (skipping key setup) when the key pointer in $inp is zero.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
# Little-endian: the 16 key bytes are read as two native doublewords.
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
# Big-endian: read four byte-reversed words with lwbrx, then insrdi puts
# the second word of each pair into the upper half of $d0 / $d1.  $h0 and
# $h1 are scratch here: first an offset, then the loaded word.
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
# poly1305_init_int, part 2: build the clamp masks 0x0ffffffc0fffffff
# (low limb) and 0x0ffffffc0ffffffc (high limb), apply them to the key,
# store the result at 32/40($ctx) and return 0 in r3.
#
# poly1305_blocks prologue: turn the byte count in $len into a count of
# 16-byte blocks and return at once (Labort) if that is zero; otherwise
# allocate a $FRAME-byte stack frame, save r27-r31 and the link register,
# load key and hash limbs, precompute s1 = r1 + (r1>>2), move the block
# count to CTR and set $mask to 3.
# The backtick-quoted offsets are evaluated as Perl expressions just
# before the code is printed.
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
	li	$mask,3
	b	Loop

.align	4
Loop:
___
# Top of the loop, little-endian: one 16-byte block as two doublewords.
$code.=<<___	if ($LITTLE_ENDIAN);
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
# Top of the loop, big-endian: four byte-reversed word loads merged into
# $t0 / $t1 with insrdi; $d0 and $d1 are scratch at this point.
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
# Rest of the loop body, the poly1305_blocks epilogue and the first part
# of poly1305_emit.
#
# Loop body:
#  - advance $inp by 16, add the loaded block to h0:h1 and add $padbit
#    plus the carry to h2;
#  - accumulate the products into d0:d1:d2:
#      d0:d1  = h0*r0 + h1*s1
#      d1:d2 += h0*r1 + h1*r0 + low(h2*s1), with low(h2*r0) added into d2;
#  - "final reduction step": keep the low two bits of d2 as the new h2
#    and add (d2 & ~3) + (d2 >> 2) to d0, propagating the carry through
#    h1 and h2.
# After the last block the hash limbs are stored back, r27-r31 are
# reloaded and the frame is released.
#
# poly1305_emit sets up no stack frame.  It loads the hash and the two
# nonce doublewords (into $padbit and $nonce, overwriting the nonce
# pointer), computes h+5 in d0:d1:d2, and derives $mask = -(d2 >> 2),
# which is used to take d0:d1 instead of h0:h1 when it is all ones.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

.globl	.poly1305_emit
.align	4
.poly1305_emit:
	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)
	ld	$padbit,0($nonce)	# load nonce
	ld	$nonce,8($nonce)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2

	srdi	$mask,$d2,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h1,$h1,$d1
___
# Big-endian only: rotating by 32 swaps the two word halves of each nonce
# doubleword that was fetched with ld.
$code.=<<___	if (!$LITTLE_ENDIAN);
	rotldi	$padbit,$padbit,32	# flip nonce words
	rotldi	$nonce,$nonce,32
___
# Add the 128-bit nonce to the selected value; a carry out of the upper
# doubleword is discarded.
$code.=<<___;
	addc	$h0,$h0,$padbit		# accumulate nonce
	adde	$h1,$h1,$nonce
___
# Little-endian: the result is written as two native doublewords.
$code.=<<___	if ($LITTLE_ENDIAN);
	std	$h0,0($mac)		# write result
	std	$h1,8($mac)
___
# Big-endian: the result is written as four byte-reversed words; extrdi
# extracts the upper word of each doubleword (into r0 and, once $h0 has
# been stored, into $h0).
$code.=<<___	if (!$LITTLE_ENDIAN);
	extrdi	r0,$h0,32,0
	li	$d0,4
	stwbrx	$h0,0,$mac		# write result
	extrdi	$h0,$h1,32,0
	li	$d1,8
	stwbrx	r0,$d0,$mac
	li	$d2,12
	stwbrx	$h1,$d1,$mac
	stwbrx	$h0,$d2,$mac
___
# Return and traceback table for poly1305_emit.
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
							} else {
###############################################################################
# base 2^32 implementation
#
# Context layout used below: hash value as five 32-bit limbs at
# 0..16($ctx), clamped key as four 32-bit limbs at 32..44($ctx).
#
# Register allocation (r13 is skipped): h0-h4 = r7-r11, $r0 = r12,
# $r1-$r3 = r14-r16, s1-s3 = r17-r19, t0-t3 = r20-r23, D0-D3 = r24-r27,
# d0-d3 = r28-r31.  As in the 64-bit branch, the Perl variable $r0 is a
# key limb (here r12), not the real register r0, which is $mask.

my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..12,14..31));

# poly1305_init_int, part 1: zero the five hash limbs, then branch to
# Lno_key (skipping key setup) when the key pointer in $inp is zero.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
# Little-endian: the four key words are read with plain word loads.  The
# mnemonic must be lwz (Load Word and Zero), as used for every other word
# load in this file; "lw" is not a PowerPC instruction and would not
# assemble.
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
# Big-endian: the four key words are read byte-reversed with lwbrx; each
# destination register first holds the offset used for its own load.
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
# poly1305_init_int, part 2: clamp the key (top four bits cleared in the
# first word; the other three words ANDed with 0x0ffffffc), store it at
# 32..44($ctx) and return 0 in r3.
#
# poly1305_blocks prologue: turn the byte count in $len into a count of
# 16-byte blocks and return at once (Labort) if that is zero; otherwise
# allocate a $FRAME-byte stack frame, save r14-r31 and the link register,
# load key and hash limbs, precompute si = ri + (ri>>2) for i = 1..3,
# move the block count to CTR and set $mask to 3.
# The backtick-quoted offsets are evaluated as Perl expressions just
# before the code is printed.
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
# Top of the loop, little-endian: one 16-byte block as four native words.
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
# Top of the loop, big-endian: four byte-reversed word loads; each
# destination register first holds the offset used for its own load.
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
# Rest of the loop body, the poly1305_blocks epilogue and the first part
# of poly1305_emit.
#
# Loop body:
#  - advance $inp by 16 and add the loaded block to h0..h3, with $padbit
#    plus the carry going into h4.  The addc/adde chain is interleaved
#    with the first multiplications (the lines indented by an extra
#    space belong to the pending add chain);
#  - accumulate 64-bit column sums, low word in dN and high word in DN:
#      d0:D0 = h0*r0 + h1*s3 + h2*s2 + h3*s1
#      d1:D1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + low(h4*s1)
#      d2:D2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + low(h4*s2)
#      d3:D3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + low(h4*s3)
#      h4    = low(h4*r0)
#  - fold the high words into the next column (h1 = d1+D0, h2 = d2+D1,
#    h3 = d3+D2, h4 += D3, with carries);
#  - "final reduction step": keep the low two bits of h4 and add
#    (h4 & ~3) + (h4 >> 2) to d0, propagating the carry through h1..h4.
# After the last block the hash limbs are stored back, r14-r31 are
# reloaded and the frame is released.
#
# poly1305_emit allocates a frame and saves r28-r31 (d0-d3) and the link
# register.  It computes h+5 in d0..d3 with the top limb in $mask, turns
# that into $mask = -($mask >> 2), and uses it to take d0..d3 instead of
# h0..h3 when it is all ones.  The nonce words are loaded into d0..d3 as
# each of them becomes free, then added to the selected value.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	 adde	$h3,$h3,$d3
	 adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h4,$s1		# h4*s1
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	 addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

.globl	.poly1305_emit
.align	4
.poly1305_emit:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)		# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0		# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
# Little-endian: the 16-byte result is written as four native words.
$code.=<<___	if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)		# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
# Big-endian: the result is written as four byte-reversed words with
# stwbrx; d1-d3 only hold the store offsets here.
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac		# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
# poly1305_emit epilogue: reload r28-r31, release the frame and return.
# The saved link register is not reloaded.
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
							}
# Identification string appended to the module for either word size.
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `expr` in the accumulated text with the value of the Perl
# expression (the frame offsets written as `$FRAME-$SIZE_T*N` above).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT is a pipe to ppc-xlate.pl, so close() is where buffered write
# errors and a non-zero exit status of the translator become visible.
# Fail loudly instead of silently leaving truncated or missing output.
close STDOUT or die "error closing STDOUT: $! (child status $?)";
Add OpenSSL copyright to .pl files Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-05-21 12:23:39 +00:00			`#! /usr/bin/env perl`
Update copyright year Reviewed-by: Richard Levitte <levitte@openssl.org> (Merged from https://github.com/openssl/openssl/pull/6371) 2018-05-29 12:07:08 +00:00			`# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.`
Add OpenSSL copyright to .pl files Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-05-21 12:23:39 +00:00			`#`
			`# Licensed under the OpenSSL license (the "License"). You may not use`
			`# this file except in compliance with the License. You can obtain a copy`
			`# in the file LICENSE in the source distribution or at`
			`# https://www.openssl.org/source/license.html`

PPC assembly pack: add ChaCha20 and Poly1305 modules. Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-02-10 10:51:23 +00:00			`#`
			`# ====================================================================`
			`# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL`
			`# project. The module is, however, dual licensed under OpenSSL and`
			`# CRYPTOGAMS licenses depending on where you obtain it. For further`
			`# details see http://www.openssl.org/~appro/cryptogams/.`
			`# ====================================================================`
			`#`
			`# This module implements Poly1305 hash for PowerPC.`
			`#`
			`# June 2015`
			`#`
			`# Numbers are cycles per processed byte with poly1305_blocks alone,`
			`# and improvement coefficients relative to gcc-generated code.`
			`#`
			`# -m32 -m64`
			`#`
			`# Freescale e300 14.8/+80% -`
crypto/poly1305: don't break carry chains. RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-03-29 08:02:45 +00:00			`# PPC74x0 7.60/+60% -`
			`# PPC970 7.00/+114% 3.51/+205%`
			`# POWER7 3.75/+260% 1.93/+100%`
			`# POWER8 - 2.03/+200%`
PPC assembly pack: correct POWER9 results. As it turns out originally published results were skewed by "turbo" mode. VM apparently remains oblivious to dynamic frequency scaling, and reports that processor operates at "base" frequency at all times. While actual frequency gets increased under load. Reviewed-by: Rich Salz <rsalz@openssl.org> (Merged from https://github.com/openssl/openssl/pull/6406) 2018-06-02 12:03:27 +00:00			`# POWER9 - 2.00/+150%`
PPC assembly pack: add ChaCha20 and Poly1305 modules. Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-02-10 10:51:23 +00:00			`#`
			`# Do we need floating-point implementation for PPC? Results presented`
			`# in poly1305_ieee754.c are tricky to compare to, because they are for`
			`# compiler-generated code. On the other hand it's known that floating-`
			`# point performance can be dominated by FPU latency, which means that`
			`# there is limit even for ideally optimized (and even vectorized) code.`
			`# And this limit is estimated to be higher than above -m64 results. Or`
			`# in other words floating-point implementation can be meaningful to`
			`# consider only in 32-bit application context. We probably have to`
			`# recognize that 32-bit builds are getting less popular on high-end`
			`# systems and therefore tend to target embedded ones, which might not`
			`# even have FPU...`
			`#`
			`# On side note, Power ISA 2.07 enables vector base 2^26 implementation,`
			`# and POWER8 might have capacity to break 1.0 cycle per byte barrier...`

			`$flavour = shift;`

			`if ($flavour =~ /64/) {`
			`$SIZE_T =8;`
			`$LRSAVE =2*$SIZE_T;`
			`$UCMP ="cmpld";`
			`$STU ="stdu";`
			`$POP ="ld";`
			`$PUSH ="std";`
			`} elsif ($flavour =~ /32/) {`
			`$SIZE_T =4;`
			`$LRSAVE =$SIZE_T;`
			`$UCMP ="cmplw";`
			`$STU ="stwu";`
			`$POP ="lwz";`
			`$PUSH ="stw";`
			`} else { die "nonsense $flavour"; }`

spelling fixes, just comments and readme. Reviewed-by: Matt Caswell <matt@openssl.org> Reviewed-by: Rich Salz <rsalz@openssl.org> (Merged from https://github.com/openssl/openssl/pull/1413) 2016-08-05 17:56:58 +00:00			`# Define endianness based on flavour`
PPC assembly pack: add ChaCha20 and Poly1305 modules. Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-02-10 10:51:23 +00:00			`# i.e.: linux64le`
			`$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;`

			`$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;`
			`( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or`
			`( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or`
			`die "can't locate ppc-xlate.pl";`

			`open STDOUT,"\| $^X $xlate $flavour ".shift \|\| die "can't call $xlate: $!";`

			`$FRAME=24*$SIZE_T;`

			`$sp="r1";`
			`my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));`
			`my ($mac,$nonce)=($inp,$len);`
			`my $mask = "r0";`

			`$code=<<___;`
			`.machine "any"`
			`.text`
			`___`
			`if ($flavour =~ /64/) {`
			`###############################################################################`
			`# base 2^64 implementation`

			`my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));`

			`$code.=<<___;`
			`.globl .poly1305_init_int`
			`.align 4`
			`.poly1305_init_int:`
			`xor r0,r0,r0`
			`std r0,0($ctx) # zero hash value`
			`std r0,8($ctx)`
			`std r0,16($ctx)`

			`$UCMP $inp,r0`
			`beq- Lno_key`
			`___`
			`$code.=<<___ if ($LITTLE_ENDIAN);`
			`ld $d0,0($inp) # load key material`
			`ld $d1,8($inp)`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`li $h0,4`
			`lwbrx $d0,0,$inp # load key material`
			`li $d1,8`
			`lwbrx $h0,$h0,$inp`
			`li $h1,12`
			`lwbrx $d1,$d1,$inp`
			`lwbrx $h1,$h1,$inp`
			`insrdi $d0,$h0,32,0`
			`insrdi $d1,$h1,32,0`
			`___`
			`$code.=<<___;`
			`lis $h1,0xfff # 0x0fff0000`
			`ori $h1,$h1,0xfffc # 0x0ffffffc`
			`insrdi $h1,$h1,32,0 # 0x0ffffffc0ffffffc`
			`ori $h0,$h1,3 # 0x0ffffffc0fffffff`

			`and $d0,$d0,$h0`
			`and $d1,$d1,$h1`

			`std $d0,32($ctx) # store key`
			`std $d1,40($ctx)`

			`Lno_key:`
			`xor r3,r3,r3`
			`blr`
			`.long 0`
			`.byte 0,12,0x14,0,0,0,2,0`
			`.size .poly1305_init_int,.-.poly1305_init_int`

			`.globl .poly1305_blocks`
			`.align 4`
			`.poly1305_blocks:`
			`srdi. $len,$len,4`
			`beq- Labort`

			`$STU $sp,-$FRAME($sp)`
			`mflr r0`
			$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
			$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
			$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
			$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
			$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
			$PUSH r0,`$FRAME+$LRSAVE`($sp)

			`ld $r0,32($ctx) # load key`
			`ld $r1,40($ctx)`

			`ld $h0,0($ctx) # load hash value`
			`ld $h1,8($ctx)`
			`ld $h2,16($ctx)`

			`srdi $s1,$r1,2`
			`mtctr $len`
			`add $s1,$s1,$r1 # s1 = r1 + r1>>2`
			`li $mask,3`
			`b Loop`

			`.align 4`
			`Loop:`
			`___`
			`$code.=<<___ if ($LITTLE_ENDIAN);`
			`ld $t0,0($inp) # load input`
			`ld $t1,8($inp)`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`li $d0,4`
			`lwbrx $t0,0,$inp # load input`
			`li $t1,8`
			`lwbrx $d0,$d0,$inp`
			`li $d1,12`
			`lwbrx $t1,$t1,$inp`
			`lwbrx $d1,$d1,$inp`
			`insrdi $t0,$d0,32,0`
			`insrdi $t1,$d1,32,0`
			`___`
			`$code.=<<___;`
			`addi $inp,$inp,16`

			`addc $h0,$h0,$t0 # accumulate input`
			`adde $h1,$h1,$t1`

			`mulld $d0,$h0,$r0 # h0*r0`
			`mulhdu $d1,$h0,$r0`
			`adde $h2,$h2,$padbit`

			`mulld $t0,$h1,$s1 # h15r1`
			`mulhdu $t1,$h1,$s1`
			`addc $d0,$d0,$t0`
			`adde $d1,$d1,$t1`

			`mulld $t0,$h0,$r1 # h0*r1`
			`mulhdu $d2,$h0,$r1`
			`addc $d1,$d1,$t0`
			`addze $d2,$d2`

			`mulld $t0,$h1,$r0 # h1*r0`
			`mulhdu $t1,$h1,$r0`
			`addc $d1,$d1,$t0`
			`adde $d2,$d2,$t1`

			`mulld $t0,$h2,$s1 # h25r1`
			`mulld $t1,$h2,$r0 # h2*r0`
			`addc $d1,$d1,$t0`
			`adde $d2,$d2,$t1`

			`andc $t0,$d2,$mask # final reduction step`
			`and $h2,$d2,$mask`
			`srdi $t1,$t0,2`
			`add $t0,$t0,$t1`
			`addc $h0,$d0,$t0`
			`addze $h1,$d1`
crypto/poly1305: don't break carry chains. RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-03-29 08:02:45 +00:00			`addze $h2,$h2`
PPC assembly pack: add ChaCha20 and Poly1305 modules. Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-02-10 10:51:23 +00:00
			`bdnz Loop`

			`std $h0,0($ctx) # store hash value`
			`std $h1,8($ctx)`
			`std $h2,16($ctx)`

			$POP r27,`$FRAME-$SIZE_T*5`($sp)
			$POP r28,`$FRAME-$SIZE_T*4`($sp)
			$POP r29,`$FRAME-$SIZE_T*3`($sp)
			$POP r30,`$FRAME-$SIZE_T*2`($sp)
			$POP r31,`$FRAME-$SIZE_T*1`($sp)
			`addi $sp,$sp,$FRAME`
			`Labort:`
			`blr`
			`.long 0`
			`.byte 0,12,4,1,0x80,5,4,0`
			`.size .poly1305_blocks,.-.poly1305_blocks`

			`.globl .poly1305_emit`
			`.align 4`
			`.poly1305_emit:`
			`ld $h0,0($ctx) # load hash`
			`ld $h1,8($ctx)`
			`ld $h2,16($ctx)`
			`ld $padbit,0($nonce) # load nonce`
			`ld $nonce,8($nonce)`

			`addic $d0,$h0,5 # compare to modulus`
			`addze $d1,$h1`
			`addze $d2,$h2`

			`srdi $mask,$d2,2 # did it carry/borrow?`
			`neg $mask,$mask`

			`andc $h0,$h0,$mask`
			`and $d0,$d0,$mask`
			`andc $h1,$h1,$mask`
			`and $d1,$d1,$mask`
			`or $h0,$h0,$d0`
			`or $h1,$h1,$d1`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`rotldi $padbit,$padbit,32 # flip nonce words`
			`rotldi $nonce,$nonce,32`
			`___`
			`$code.=<<___;`
			`addc $h0,$h0,$padbit # accumulate nonce`
			`adde $h1,$h1,$nonce`
			`___`
			`$code.=<<___ if ($LITTLE_ENDIAN);`
			`std $h0,0($mac) # write result`
			`std $h1,8($mac)`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`extrdi r0,$h0,32,0`
			`li $d0,4`
			`stwbrx $h0,0,$mac # write result`
			`extrdi $h0,$h1,32,0`
			`li $d1,8`
			`stwbrx r0,$d0,$mac`
			`li $d2,12`
			`stwbrx $h1,$d1,$mac`
			`stwbrx $h0,$d2,$mac`
			`___`
			`$code.=<<___;`
			`blr`
			`.long 0`
			`.byte 0,12,0x14,0,0,0,3,0`
			`.size .poly1305_emit,.-.poly1305_emit`
			`___`
			`} else {`
			`###############################################################################`
			`# base 2^32 implementation`

			`my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,`
			`$t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3`
			`) = map("r$_",(7..12,14..31));`

			`$code.=<<___;`
			`.globl .poly1305_init_int`
			`.align 4`
			`.poly1305_init_int:`
			`xor r0,r0,r0`
			`stw r0,0($ctx) # zero hash value`
			`stw r0,4($ctx)`
			`stw r0,8($ctx)`
			`stw r0,12($ctx)`
			`stw r0,16($ctx)`

			`$UCMP $inp,r0`
			`beq- Lno_key`
			`___`
			`$code.=<<___ if ($LITTLE_ENDIAN);`
			`lw $h0,0($inp) # load key material`
			`lw $h1,4($inp)`
			`lw $h2,8($inp)`
			`lw $h3,12($inp)`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`li $h1,4`
			`lwbrx $h0,0,$inp # load key material`
			`li $h2,8`
			`lwbrx $h1,$h1,$inp`
			`li $h3,12`
			`lwbrx $h2,$h2,$inp`
			`lwbrx $h3,$h3,$inp`
			`___`
			`$code.=<<___;`
			`lis $mask,0xf000 # 0xf0000000`
			`li $r0,-4`
			`andc $r0,$r0,$mask # 0x0ffffffc`

			`andc $h0,$h0,$mask`
			`and $h1,$h1,$r0`
			`and $h2,$h2,$r0`
			`and $h3,$h3,$r0`

			`stw $h0,32($ctx) # store key`
			`stw $h1,36($ctx)`
			`stw $h2,40($ctx)`
			`stw $h3,44($ctx)`

			`Lno_key:`
			`xor r3,r3,r3`
			`blr`
			`.long 0`
			`.byte 0,12,0x14,0,0,0,2,0`
			`.size .poly1305_init_int,.-.poly1305_init_int`

			`.globl .poly1305_blocks`
			`.align 4`
			`.poly1305_blocks:`
			`srwi. $len,$len,4`
			`beq- Labort`

			`$STU $sp,-$FRAME($sp)`
			`mflr r0`
			$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
			$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
			$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
			$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
			$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
			$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
			$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
			$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
			$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
			$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
			$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
			$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
			$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
			$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
			$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
			$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
			$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
			$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
			$PUSH r0,`$FRAME+$LRSAVE`($sp)

			`lwz $r0,32($ctx) # load key`
			`lwz $r1,36($ctx)`
			`lwz $r2,40($ctx)`
			`lwz $r3,44($ctx)`

			`lwz $h0,0($ctx) # load hash value`
			`lwz $h1,4($ctx)`
			`lwz $h2,8($ctx)`
			`lwz $h3,12($ctx)`
			`lwz $h4,16($ctx)`

			`srwi $s1,$r1,2`
			`srwi $s2,$r2,2`
			`srwi $s3,$r3,2`
			`add $s1,$s1,$r1 # si = ri + ri>>2`
			`add $s2,$s2,$r2`
			`add $s3,$s3,$r3`
			`mtctr $len`
			`li $mask,3`
			`b Loop`

			`.align 4`
			`Loop:`
			`___`
			`$code.=<<___ if ($LITTLE_ENDIAN);`
			`lwz $d0,0($inp) # load input`
			`lwz $d1,4($inp)`
			`lwz $d2,8($inp)`
			`lwz $d3,12($inp)`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`li $d1,4`
			`lwbrx $d0,0,$inp # load input`
			`li $d2,8`
			`lwbrx $d1,$d1,$inp`
			`li $d3,12`
			`lwbrx $d2,$d2,$inp`
			`lwbrx $d3,$d3,$inp`
			`___`
			`$code.=<<___;`
			`addi $inp,$inp,16`

			`addc $h0,$h0,$d0 # accumulate input`
			`adde $h1,$h1,$d1`
			`adde $h2,$h2,$d2`

			`mullw $d0,$h0,$r0 # h0*r0`
			`mulhwu $D0,$h0,$r0`

			`mullw $d1,$h0,$r1 # h0*r1`
			`mulhwu $D1,$h0,$r1`

			`mullw $d2,$h0,$r2 # h0*r2`
			`mulhwu $D2,$h0,$r2`

			`adde $h3,$h3,$d3`
			`adde $h4,$h4,$padbit`

			`mullw $d3,$h0,$r3 # h0*r3`
			`mulhwu $D3,$h0,$r3`

			`mullw $t0,$h1,$s3 # h1*s3`
			`mulhwu $t1,$h1,$s3`

			`mullw $t2,$h1,$r0 # h1*r0`
			`mulhwu $t3,$h1,$r0`
			`addc $d0,$d0,$t0`
			`adde $D0,$D0,$t1`

			`mullw $t0,$h1,$r1 # h1*r1`
			`mulhwu $t1,$h1,$r1`
			`addc $d1,$d1,$t2`
			`adde $D1,$D1,$t3`

			`mullw $t2,$h1,$r2 # h1*r2`
			`mulhwu $t3,$h1,$r2`
			`addc $d2,$d2,$t0`
			`adde $D2,$D2,$t1`

			`mullw $t0,$h2,$s2 # h2*s2`
			`mulhwu $t1,$h2,$s2`
			`addc $d3,$d3,$t2`
			`adde $D3,$D3,$t3`

			`mullw $t2,$h2,$s3 # h2*s3`
			`mulhwu $t3,$h2,$s3`
			`addc $d0,$d0,$t0`
			`adde $D0,$D0,$t1`

			`mullw $t0,$h2,$r0 # h2*r0`
			`mulhwu $t1,$h2,$r0`
			`addc $d1,$d1,$t2`
			`adde $D1,$D1,$t3`

			`mullw $t2,$h2,$r1 # h2*r1`
			`mulhwu $t3,$h2,$r1`
			`addc $d2,$d2,$t0`
			`adde $D2,$D2,$t1`

			`mullw $t0,$h3,$s1 # h3*s1`
			`mulhwu $t1,$h3,$s1`
			`addc $d3,$d3,$t2`
			`adde $D3,$D3,$t3`

			`mullw $t2,$h3,$s2 # h3*s2`
			`mulhwu $t3,$h3,$s2`
			`addc $d0,$d0,$t0`
			`adde $D0,$D0,$t1`

			`mullw $t0,$h3,$s3 # h3*s3`
			`mulhwu $t1,$h3,$s3`
			`addc $d1,$d1,$t2`
			`adde $D1,$D1,$t3`

			`mullw $t2,$h3,$r0 # h3*r0`
			`mulhwu $t3,$h3,$r0`
			`addc $d2,$d2,$t0`
			`adde $D2,$D2,$t1`

			`mullw $t0,$h4,$s1 # h4*s1`
			`addc $d3,$d3,$t2`
			`adde $D3,$D3,$t3`
			`addc $d1,$d1,$t0`

			`mullw $t1,$h4,$s2 # h4*s2`
			`addze $D1,$D1`
			`addc $d2,$d2,$t1`
			`addze $D2,$D2`

			`mullw $t2,$h4,$s3 # h4*s3`
			`addc $d3,$d3,$t2`
			`addze $D3,$D3`

			`mullw $h4,$h4,$r0 # h4*r0`

			`addc $h1,$d1,$D0`
			`adde $h2,$d2,$D1`
			`adde $h3,$d3,$D2`
			`adde $h4,$h4,$D3`

			`andc $D0,$h4,$mask # final reduction step`
			`and $h4,$h4,$mask`
			`srwi $D1,$D0,2`
			`add $D0,$D0,$D1`
			`addc $h0,$d0,$D0`
			`addze $h1,$h1`
			`addze $h2,$h2`
			`addze $h3,$h3`
crypto/poly1305: don't break carry chains. RT#4483 [poly1305-armv4.pl: remove redundant #ifdef __thumb2__] [poly1305-ppc*.pl: presumably more accurate benchmark results] Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-03-29 08:02:45 +00:00			`addze $h4,$h4`
PPC assembly pack: add ChaCha20 and Poly1305 modules. Reviewed-by: Richard Levitte <levitte@openssl.org> 2016-02-10 10:51:23 +00:00
			`bdnz Loop`

			`stw $h0,0($ctx) # store hash value`
			`stw $h1,4($ctx)`
			`stw $h2,8($ctx)`
			`stw $h3,12($ctx)`
			`stw $h4,16($ctx)`

			$POP r14,`$FRAME-$SIZE_T*18`($sp)
			$POP r15,`$FRAME-$SIZE_T*17`($sp)
			$POP r16,`$FRAME-$SIZE_T*16`($sp)
			$POP r17,`$FRAME-$SIZE_T*15`($sp)
			$POP r18,`$FRAME-$SIZE_T*14`($sp)
			$POP r19,`$FRAME-$SIZE_T*13`($sp)
			$POP r20,`$FRAME-$SIZE_T*12`($sp)
			$POP r21,`$FRAME-$SIZE_T*11`($sp)
			$POP r22,`$FRAME-$SIZE_T*10`($sp)
			$POP r23,`$FRAME-$SIZE_T*9`($sp)
			$POP r24,`$FRAME-$SIZE_T*8`($sp)
			$POP r25,`$FRAME-$SIZE_T*7`($sp)
			$POP r26,`$FRAME-$SIZE_T*6`($sp)
			$POP r27,`$FRAME-$SIZE_T*5`($sp)
			$POP r28,`$FRAME-$SIZE_T*4`($sp)
			$POP r29,`$FRAME-$SIZE_T*3`($sp)
			$POP r30,`$FRAME-$SIZE_T*2`($sp)
			$POP r31,`$FRAME-$SIZE_T*1`($sp)
			`addi $sp,$sp,$FRAME`
			`Labort:`
			`blr`
			`.long 0`
			`.byte 0,12,4,1,0x80,18,4,0`
			`.size .poly1305_blocks,.-.poly1305_blocks`

			`.globl .poly1305_emit`
			`.align 4`
			`.poly1305_emit:`
			`$STU $sp,-$FRAME($sp)`
			`mflr r0`
			$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
			$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
			$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
			$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
			$PUSH r0,`$FRAME+$LRSAVE`($sp)

			`lwz $h0,0($ctx) # load hash`
			`lwz $h1,4($ctx)`
			`lwz $h2,8($ctx)`
			`lwz $h3,12($ctx)`
			`lwz $h4,16($ctx)`

			`addic $d0,$h0,5 # compare to modulus`
			`addze $d1,$h1`
			`addze $d2,$h2`
			`addze $d3,$h3`
			`addze $mask,$h4`

			`srwi $mask,$mask,2 # did it carry/borrow?`
			`neg $mask,$mask`

			`andc $h0,$h0,$mask`
			`and $d0,$d0,$mask`
			`andc $h1,$h1,$mask`
			`and $d1,$d1,$mask`
			`or $h0,$h0,$d0`
			`lwz $d0,0($nonce) # load nonce`
			`andc $h2,$h2,$mask`
			`and $d2,$d2,$mask`
			`or $h1,$h1,$d1`
			`lwz $d1,4($nonce)`
			`andc $h3,$h3,$mask`
			`and $d3,$d3,$mask`
			`or $h2,$h2,$d2`
			`lwz $d2,8($nonce)`
			`or $h3,$h3,$d3`
			`lwz $d3,12($nonce)`

			`addc $h0,$h0,$d0 # accumulate nonce`
			`adde $h1,$h1,$d1`
			`adde $h2,$h2,$d2`
			`adde $h3,$h3,$d3`
			`___`
			`$code.=<<___ if ($LITTLE_ENDIAN);`
			`stw $h0,0($mac) # write result`
			`stw $h1,4($mac)`
			`stw $h2,8($mac)`
			`stw $h3,12($mac)`
			`___`
			`$code.=<<___ if (!$LITTLE_ENDIAN);`
			`li $d1,4`
			`stwbrx $h0,0,$mac # write result`
			`li $d2,8`
			`stwbrx $h1,$d1,$mac`
			`li $d3,12`
			`stwbrx $h2,$d2,$mac`
			`stwbrx $h3,$d3,$mac`
			`___`
			`$code.=<<___;`
			$POP r28,`$FRAME-$SIZE_T*4`($sp)
			$POP r29,`$FRAME-$SIZE_T*3`($sp)
			$POP r30,`$FRAME-$SIZE_T*2`($sp)
			$POP r31,`$FRAME-$SIZE_T*1`($sp)
			`addi $sp,$sp,$FRAME`
			`blr`
			`.long 0`
			`.byte 0,12,4,1,0x80,4,3,0`
			`.size .poly1305_emit,.-.poly1305_emit`
			`___`
			`}`
			`$code.=<<___;`
			`.asciz "Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>"`
			`___`

			$code =~ s/\`([^\`]*)\`/eval $1/gem;
			`print $code;`
			`close STDOUT;`