0049eb46e4
Reviewed-by: Richard Levitte <levitte@openssl.org>
1097 lines
23 KiB
Raku
Executable file
1097 lines
23 KiB
Raku
Executable file
#!/usr/bin/env perl
|
||
#
|
||
# ====================================================================
|
||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
# ====================================================================
|
||
#
|
||
# This module implements Poly1305 hash for SPARCv9, vanilla, as well
|
||
# as VIS3 and FMA extensions.
|
||
#
|
||
# May, August 2015
|
||
#
|
||
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
||
#
|
||
# IALU(*) FMA
|
||
#
|
||
# UltraSPARC III 11.9(**)
|
||
# SPARC T3 7.85
|
||
# SPARC T4 1.67(***) 6.55
|
||
# SPARC64 X 5.54 3.64
|
||
#
|
||
# (*) Comparison to compiler-generated code is really problematic,
|
||
# because latter's performance varies too much depending on too
|
||
# many variables. For example, one can measure from 5x to 15x
|
||
# improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
|
||
# unfair comparison, because compiler doesn't use VIS3, but
|
||
# given same initial conditions coefficient varies from 3x to 9x.
|
||
# (**) Pre-III performance should be even worse; floating-point
|
||
# performance for UltraSPARC I-IV on the other hand is reported
|
||
# to be 4.25 for hand-coded assembly, but they are just too old
|
||
# to care about.
|
||
# (***) Multi-process benchmark saturates at ~12.5x single-process
|
||
# result on 8-core processor, or ~21GBps per 2.85GHz socket.
|
||
|
||
# Symbolic names for the SPARC integer registers used by the generated
# poly1305_init / poly1305_blocks / poly1305_blocks_vis3 / poly1305_emit
# code. The names are interpolated into the assembly heredocs below.
#   %i0..%i5      : $ctx, $inp, $len, $padbit, then $shl/$shr, which the
#                   code uses as shift counts when realigning unaligned
#                   input
#   %l0..%l7      : $r0..$r3, $s1..$s3 and $h4
#   %o0..%o5, %o7 : $h0..$h3 and temporaries $t0..$t2; %o6 is skipped
#                   (presumably because it is the stack pointer %sp)
#   %g1..%g4      : $d0..$d3
my ($ctx,$inp,$len,$padbit,$shl,$shr) = map("%i$_",(0..5));
|
||
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4) = map("%l$_",(0..7));
|
||
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2) = map("%o$_",(0..5,7));
|
||
my ($d0,$d1,$d2,$d3) = map("%g$_",(1..4));
|
||
|
||
$code.=<<___;
|
||
#include "sparc_arch.h"
|
||
|
||
#ifdef __arch64__
|
||
.register %g2,#scratch
|
||
.register %g3,#scratch
|
||
# define STPTR stx
|
||
# define SIZE_T 8
|
||
#else
|
||
# define STPTR st
|
||
# define SIZE_T 4
|
||
#endif
|
||
#define LOCALS (STACK_BIAS+STACK_FRAME)
|
||
|
||
.section ".text",#alloc,#execinstr
|
||
|
||
#ifdef __PIC__
|
||
SPARC_PIC_THUNK(%g1)
|
||
#endif
|
||
|
||
.globl poly1305_init
|
||
.align 32
|
||
poly1305_init:
|
||
save %sp,-STACK_FRAME-16,%sp
|
||
nop
|
||
|
||
SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
|
||
ld [%g1],%g1
|
||
|
||
and %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU|SPARCV9_VIS3,%g1
|
||
cmp %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU
|
||
be .Lpoly1305_init_fma
|
||
nop
|
||
|
||
stx %g0,[$ctx+0]
|
||
stx %g0,[$ctx+8] ! zero hash value
|
||
brz,pn $inp,.Lno_key
|
||
stx %g0,[$ctx+16]
|
||
|
||
and $inp,7,$shr ! alignment factor
|
||
andn $inp,7,$inp
|
||
sll $shr,3,$shr ! *8
|
||
neg $shr,$shl
|
||
|
||
sethi %hi(0x0ffffffc),$t0
|
||
set 8,$h1
|
||
or $t0,%lo(0x0ffffffc),$t0
|
||
set 16,$h2
|
||
sllx $t0,32,$t1
|
||
or $t0,$t1,$t1 ! 0x0ffffffc0ffffffc
|
||
or $t1,3,$t0 ! 0x0ffffffc0fffffff
|
||
|
||
ldxa [$inp+%g0]0x88,$h0 ! load little-endian key
|
||
brz,pt $shr,.Lkey_aligned
|
||
ldxa [$inp+$h1]0x88,$h1
|
||
|
||
ldxa [$inp+$h2]0x88,$h2
|
||
srlx $h0,$shr,$h0
|
||
sllx $h1,$shl,$t2
|
||
srlx $h1,$shr,$h1
|
||
or $t2,$h0,$h0
|
||
sllx $h2,$shl,$h2
|
||
or $h2,$h1,$h1
|
||
|
||
.Lkey_aligned:
|
||
and $t0,$h0,$h0
|
||
and $t1,$h1,$h1
|
||
stx $h0,[$ctx+32+0] ! store key
|
||
stx $h1,[$ctx+32+8]
|
||
|
||
andcc %g1,SPARCV9_VIS3,%g0
|
||
be .Lno_key
|
||
nop
|
||
|
||
1: call .+8
|
||
add %o7,poly1305_blocks_vis3-1b,%o7
|
||
|
||
add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
|
||
STPTR %o7,[%i2]
|
||
STPTR %o5,[%i2+SIZE_T]
|
||
|
||
ret
|
||
restore %g0,1,%o0 ! return 1
|
||
|
||
.Lno_key:
|
||
ret
|
||
restore %g0,%g0,%o0 ! return 0
|
||
.size poly1305_init,.-poly1305_init
|
||
|
||
.globl poly1305_blocks
|
||
.align 32
|
||
poly1305_blocks:
|
||
save %sp,-STACK_FRAME,%sp
|
||
andn $len,15,$len
|
||
|
||
brz,pn $len,.Lno_data
|
||
nop
|
||
|
||
ld [$ctx+32+0],$r1 ! load key
|
||
ld [$ctx+32+4],$r0
|
||
ld [$ctx+32+8],$r3
|
||
ld [$ctx+32+12],$r2
|
||
|
||
ld [$ctx+0],$h1 ! load hash value
|
||
ld [$ctx+4],$h0
|
||
ld [$ctx+8],$h3
|
||
ld [$ctx+12],$h2
|
||
ld [$ctx+16],$h4
|
||
|
||
and $inp,7,$shr ! alignment factor
|
||
andn $inp,7,$inp
|
||
set 8,$d1
|
||
sll $shr,3,$shr ! *8
|
||
set 16,$d2
|
||
neg $shr,$shl
|
||
|
||
srl $r1,2,$s1
|
||
srl $r2,2,$s2
|
||
add $r1,$s1,$s1
|
||
srl $r3,2,$s3
|
||
add $r2,$s2,$s2
|
||
add $r3,$s3,$s3
|
||
|
||
.Loop:
|
||
ldxa [$inp+%g0]0x88,$d0 ! load little-endian input
|
||
brz,pt $shr,.Linp_aligned
|
||
ldxa [$inp+$d1]0x88,$d1
|
||
|
||
ldxa [$inp+$d2]0x88,$d2
|
||
srlx $d0,$shr,$d0
|
||
sllx $d1,$shl,$t1
|
||
srlx $d1,$shr,$d1
|
||
or $t1,$d0,$d0
|
||
sllx $d2,$shl,$d2
|
||
or $d2,$d1,$d1
|
||
|
||
.Linp_aligned:
|
||
srlx $d0,32,$t0
|
||
addcc $d0,$h0,$h0 ! accumulate input
|
||
srlx $d1,32,$t1
|
||
addccc $t0,$h1,$h1
|
||
addccc $d1,$h2,$h2
|
||
addccc $t1,$h3,$h3
|
||
addc $padbit,$h4,$h4
|
||
|
||
umul $r0,$h0,$d0
|
||
umul $r1,$h0,$d1
|
||
umul $r2,$h0,$d2
|
||
umul $r3,$h0,$d3
|
||
sub $len,16,$len
|
||
add $inp,16,$inp
|
||
|
||
umul $s3,$h1,$t0
|
||
umul $r0,$h1,$t1
|
||
umul $r1,$h1,$t2
|
||
add $t0,$d0,$d0
|
||
add $t1,$d1,$d1
|
||
umul $r2,$h1,$t0
|
||
add $t2,$d2,$d2
|
||
add $t0,$d3,$d3
|
||
|
||
umul $s2,$h2,$t1
|
||
umul $s3,$h2,$t2
|
||
umul $r0,$h2,$t0
|
||
add $t1,$d0,$d0
|
||
add $t2,$d1,$d1
|
||
umul $r1,$h2,$t1
|
||
add $t0,$d2,$d2
|
||
add $t1,$d3,$d3
|
||
|
||
umul $s1,$h3,$t2
|
||
umul $s2,$h3,$t0
|
||
umul $s3,$h3,$t1
|
||
add $t2,$d0,$d0
|
||
add $t0,$d1,$d1
|
||
umul $r0,$h3,$t2
|
||
add $t1,$d2,$d2
|
||
add $t2,$d3,$d3
|
||
|
||
umul $s1,$h4,$t0
|
||
umul $s2,$h4,$t1
|
||
umul $s3,$h4,$t2
|
||
umul $r0,$h4,$h4
|
||
add $t0,$d1,$d1
|
||
add $t1,$d2,$d2
|
||
srlx $d0,32,$h1
|
||
add $t2,$d3,$d3
|
||
srlx $d1,32,$h2
|
||
|
||
addcc $d1,$h1,$h1
|
||
srlx $d2,32,$h3
|
||
set 8,$d1
|
||
addccc $d2,$h2,$h2
|
||
srlx $d3,32,$t0
|
||
set 16,$d2
|
||
addccc $d3,$h3,$h3
|
||
addc $t0,$h4,$h4
|
||
|
||
srl $h4,2,$t0 ! final reduction step
|
||
andn $h4,3,$t1
|
||
and $h4,3,$h4
|
||
add $t1,$t0,$t0
|
||
|
||
addcc $t0,$d0,$h0
|
||
addccc %g0,$h1,$h1
|
||
addccc %g0,$h2,$h2
|
||
brnz,pt $len,.Loop
|
||
addc %g0,$h3,$h3
|
||
|
||
st $h1,[$ctx+0] ! store hash value
|
||
st $h0,[$ctx+4]
|
||
st $h3,[$ctx+8]
|
||
st $h2,[$ctx+12]
|
||
st $h4,[$ctx+16]
|
||
|
||
.Lno_data:
|
||
ret
|
||
restore
|
||
.size poly1305_blocks,.-poly1305_blocks
|
||
___
|
||
########################################################################
|
||
# VIS3 has umulxhi and addxc...
|
||
{
|
||
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
|
||
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
|
||
|
||
$code.=<<___;
|
||
.align 32
|
||
poly1305_blocks_vis3:
|
||
save %sp,-STACK_FRAME,%sp
|
||
andn $len,15,$len
|
||
|
||
brz,pn $len,.Lno_data
|
||
nop
|
||
|
||
ldx [$ctx+32+0],$R0 ! load key
|
||
ldx [$ctx+32+8],$R1
|
||
|
||
ldx [$ctx+0],$H0 ! load hash value
|
||
ldx [$ctx+8],$H1
|
||
ld [$ctx+16],$H2
|
||
|
||
and $inp,7,$shr ! alignment factor
|
||
andn $inp,7,$inp
|
||
set 8,$r1
|
||
sll $shr,3,$shr ! *8
|
||
set 16,$r2
|
||
neg $shr,$shl
|
||
|
||
srlx $R1,2,$S1
|
||
add $R1,$S1,$S1
|
||
|
||
.Loop_vis3:
|
||
ldxa [$inp+%g0]0x88,$D0 ! load little-endian input
|
||
brz,pt $shr,.Linp_aligned_vis3
|
||
ldxa [$inp+$r1]0x88,$D1
|
||
|
||
ldxa [$inp+$r2]0x88,$D2
|
||
srlx $D0,$shr,$D0
|
||
sllx $D1,$shl,$T1
|
||
srlx $D1,$shr,$D1
|
||
or $T1,$D0,$D0
|
||
sllx $D2,$shl,$D2
|
||
or $D2,$D1,$D1
|
||
|
||
.Linp_aligned_vis3:
|
||
addcc $D0,$H0,$H0 ! accumulate input
|
||
sub $len,16,$len
|
||
addxccc $D1,$H1,$H1
|
||
add $inp,16,$inp
|
||
|
||
mulx $R0,$H0,$D0 ! r0*h0
|
||
addxc $padbit,$H2,$H2
|
||
umulxhi $R0,$H0,$D1
|
||
mulx $S1,$H1,$T0 ! s1*h1
|
||
umulxhi $S1,$H1,$T1
|
||
addcc $T0,$D0,$D0
|
||
mulx $R1,$H0,$T0 ! r1*h0
|
||
addxc $T1,$D1,$D1
|
||
umulxhi $R1,$H0,$D2
|
||
addcc $T0,$D1,$D1
|
||
mulx $R0,$H1,$T0 ! r0*h1
|
||
addxc %g0,$D2,$D2
|
||
umulxhi $R0,$H1,$T1
|
||
addcc $T0,$D1,$D1
|
||
mulx $S1,$H2,$T0 ! s1*h2
|
||
addxc $T1,$D2,$D2
|
||
mulx $R0,$H2,$T1 ! r0*h2
|
||
addcc $T0,$D1,$D1
|
||
addxc $T1,$D2,$D2
|
||
|
||
srlx $D2,2,$T0 ! final reduction step
|
||
andn $D2,3,$T1
|
||
and $D2,3,$H2
|
||
add $T1,$T0,$T0
|
||
|
||
addcc $T0,$D0,$H0
|
||
brnz,pt $len,.Loop_vis3
|
||
addxc %g0,$D1,$H1
|
||
|
||
stx $H0,[$ctx+0] ! store hash value
|
||
stx $H1,[$ctx+8]
|
||
st $H2,[$ctx+16]
|
||
|
||
ret
|
||
restore
|
||
.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
|
||
___
|
||
}
|
||
my ($mac,$nonce) = ($inp,$len);
|
||
|
||
$code.=<<___;
|
||
.globl poly1305_emit
|
||
.align 32
|
||
poly1305_emit:
|
||
save %sp,-STACK_FRAME,%sp
|
||
|
||
ld [$ctx+0],$h1 ! load hash value
|
||
ld [$ctx+4],$h0
|
||
ld [$ctx+8],$h3
|
||
ld [$ctx+12],$h2
|
||
ld [$ctx+16],$h4
|
||
|
||
addcc $h0,5,$r0 ! compare to modulus
|
||
addccc $h1,0,$r1
|
||
addccc $h2,0,$r2
|
||
addccc $h3,0,$r3
|
||
addc $h4,0,$h4
|
||
andcc $h4,4,%g0 ! did it carry/borrow?
|
||
|
||
movnz %icc,$r0,$h0
|
||
ld [$nonce+0],$r0 ! load nonce
|
||
movnz %icc,$r1,$h1
|
||
ld [$nonce+4],$r1
|
||
movnz %icc,$r2,$h2
|
||
ld [$nonce+8],$r2
|
||
movnz %icc,$r3,$h3
|
||
ld [$nonce+12],$r3
|
||
|
||
addcc $r0,$h0,$h0 ! accumulate nonce
|
||
addccc $r1,$h1,$h1
|
||
addccc $r2,$h2,$h2
|
||
addc $r3,$h3,$h3
|
||
|
||
srl $h0,8,$r0
|
||
stb $h0,[$mac+0] ! store little-endian result
|
||
srl $h0,16,$r1
|
||
stb $r0,[$mac+1]
|
||
srl $h0,24,$r2
|
||
stb $r1,[$mac+2]
|
||
stb $r2,[$mac+3]
|
||
|
||
srl $h1,8,$r0
|
||
stb $h1,[$mac+4]
|
||
srl $h1,16,$r1
|
||
stb $r0,[$mac+5]
|
||
srl $h1,24,$r2
|
||
stb $r1,[$mac+6]
|
||
stb $r2,[$mac+7]
|
||
|
||
srl $h2,8,$r0
|
||
stb $h2,[$mac+8]
|
||
srl $h2,16,$r1
|
||
stb $r0,[$mac+9]
|
||
srl $h2,24,$r2
|
||
stb $r1,[$mac+10]
|
||
stb $r2,[$mac+11]
|
||
|
||
srl $h3,8,$r0
|
||
stb $h3,[$mac+12]
|
||
srl $h3,16,$r1
|
||
stb $r0,[$mac+13]
|
||
srl $h3,24,$r2
|
||
stb $r1,[$mac+14]
|
||
stb $r2,[$mac+15]
|
||
|
||
ret
|
||
restore
|
||
.size poly1305_emit,.-poly1305_emit
|
||
___
|
||
|
||
{
|
||
# Register names for the FMA code path (poly1305_init_fma and
# poly1305_blocks_fma). These lexicals shadow the outer integer map for
# the rest of this scope.
#   %i0..%i3 : $ctx, $inp, $len, $padbit
#   %o0..%o4 : $in0..$in4, integer staging registers for loaded input
#   %l0..%l3 : $i1, $step, $shr, $shl (the map yields eight names, only
#              the first four are kept); $i2 is a second name for $step
# The long list below assigns the 32 even-numbered double-precision
# registers %f0, %f2, ... %f62 in declaration order.
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
|
||
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
|
||
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
|
||
my $i2=$step;
|
||
|
||
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
|
||
$two0,$two32,$two64,$two96,$two130,$five_two130,
|
||
$r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
|
||
$s2lo,$s2hi,$s3lo,$s3hi,
|
||
$c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
|
||
# borrowings: the names below are aliases that reuse the $c* registers
# declared above, so each shares a physical register with a $c* value.
# Note that $y2/$y3 map to $c3hi/$c3lo, i.e. in the opposite order to
# $x2/$x3 ($c3lo/$c3hi).
|
||
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
|
||
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
|
||
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
|
||
|
||
$code.=<<___;
|
||
.align 32
|
||
poly1305_init_fma:
|
||
save %sp,-STACK_FRAME-16,%sp
|
||
nop
|
||
|
||
.Lpoly1305_init_fma:
|
||
1: call .+8
|
||
add %o7,.Lconsts_fma-1b,%o7
|
||
|
||
ldd [%o7+8*0],$two0 ! load constants
|
||
ldd [%o7+8*1],$two32
|
||
ldd [%o7+8*2],$two64
|
||
ldd [%o7+8*3],$two96
|
||
ldd [%o7+8*5],$five_two130
|
||
|
||
std $two0,[$ctx+8*0] ! initial hash value, biased 0
|
||
std $two32,[$ctx+8*1]
|
||
std $two64,[$ctx+8*2]
|
||
std $two96,[$ctx+8*3]
|
||
|
||
brz,pn $inp,.Lno_key_fma
|
||
nop
|
||
|
||
stx %fsr,[%sp+LOCALS] ! save original %fsr
|
||
ldx [%o7+8*6],%fsr ! load new %fsr
|
||
|
||
std $two0,[$ctx+8*4] ! key "template"
|
||
std $two32,[$ctx+8*5]
|
||
std $two64,[$ctx+8*6]
|
||
std $two96,[$ctx+8*7]
|
||
|
||
and $inp,7,$shr
|
||
andn $inp,7,$inp ! align pointer
|
||
mov 8,$i1
|
||
sll $shr,3,$shr
|
||
mov 16,$i2
|
||
neg $shr,$shl
|
||
|
||
ldxa [$inp+%g0]0x88,$in0 ! load little-endian key
|
||
ldxa [$inp+$i1]0x88,$in2
|
||
|
||
brz $shr,.Lkey_aligned_fma
|
||
sethi %hi(0xf0000000),$i1 ! 0xf0000000
|
||
|
||
ldxa [$inp+$i2]0x88,$in4
|
||
|
||
srlx $in0,$shr,$in0 ! align data
|
||
sllx $in2,$shl,$in1
|
||
srlx $in2,$shr,$in2
|
||
or $in1,$in0,$in0
|
||
sllx $in4,$shl,$in3
|
||
or $in3,$in2,$in2
|
||
|
||
.Lkey_aligned_fma:
|
||
or $i1,3,$i2 ! 0xf0000003
|
||
srlx $in0,32,$in1
|
||
andn $in0,$i1,$in0 ! &=0x0fffffff
|
||
andn $in1,$i2,$in1 ! &=0x0ffffffc
|
||
srlx $in2,32,$in3
|
||
andn $in2,$i2,$in2
|
||
andn $in3,$i2,$in3
|
||
|
||
st $in0,[$ctx+`8*4+4`] ! fill "template"
|
||
st $in1,[$ctx+`8*5+4`]
|
||
st $in2,[$ctx+`8*6+4`]
|
||
st $in3,[$ctx+`8*7+4`]
|
||
|
||
ldd [$ctx+8*4],$h0lo ! load [biased] key
|
||
ldd [$ctx+8*5],$h1lo
|
||
ldd [$ctx+8*6],$h2lo
|
||
ldd [$ctx+8*7],$h3lo
|
||
|
||
fsubd $h0lo,$two0, $h0lo ! r0
|
||
ldd [%o7+8*7],$two0 ! more constants
|
||
fsubd $h1lo,$two32,$h1lo ! r1
|
||
ldd [%o7+8*8],$two32
|
||
fsubd $h2lo,$two64,$h2lo ! r2
|
||
ldd [%o7+8*9],$two64
|
||
fsubd $h3lo,$two96,$h3lo ! r3
|
||
ldd [%o7+8*10],$two96
|
||
|
||
fmuld $five_two130,$h1lo,$s1lo ! s1
|
||
fmuld $five_two130,$h2lo,$s2lo ! s2
|
||
fmuld $five_two130,$h3lo,$s3lo ! s3
|
||
|
||
faddd $h0lo,$two0, $h0hi
|
||
faddd $h1lo,$two32,$h1hi
|
||
faddd $h2lo,$two64,$h2hi
|
||
faddd $h3lo,$two96,$h3hi
|
||
|
||
fsubd $h0hi,$two0, $h0hi
|
||
ldd [%o7+8*11],$two0 ! more constants
|
||
fsubd $h1hi,$two32,$h1hi
|
||
ldd [%o7+8*12],$two32
|
||
fsubd $h2hi,$two64,$h2hi
|
||
ldd [%o7+8*13],$two64
|
||
fsubd $h3hi,$two96,$h3hi
|
||
|
||
fsubd $h0lo,$h0hi,$h0lo
|
||
std $h0hi,[$ctx+8*5] ! r0hi
|
||
fsubd $h1lo,$h1hi,$h1lo
|
||
std $h1hi,[$ctx+8*7] ! r1hi
|
||
fsubd $h2lo,$h2hi,$h2lo
|
||
std $h2hi,[$ctx+8*9] ! r2hi
|
||
fsubd $h3lo,$h3hi,$h3lo
|
||
std $h3hi,[$ctx+8*11] ! r3hi
|
||
|
||
faddd $s1lo,$two0, $s1hi
|
||
faddd $s2lo,$two32,$s2hi
|
||
faddd $s3lo,$two64,$s3hi
|
||
|
||
fsubd $s1hi,$two0, $s1hi
|
||
fsubd $s2hi,$two32,$s2hi
|
||
fsubd $s3hi,$two64,$s3hi
|
||
|
||
fsubd $s1lo,$s1hi,$s1lo
|
||
fsubd $s2lo,$s2hi,$s2lo
|
||
fsubd $s3lo,$s3hi,$s3lo
|
||
|
||
ldx [%sp+LOCALS],%fsr ! restore %fsr
|
||
|
||
std $h0lo,[$ctx+8*4] ! r0lo
|
||
std $h1lo,[$ctx+8*6] ! r1lo
|
||
std $h2lo,[$ctx+8*8] ! r2lo
|
||
std $h3lo,[$ctx+8*10] ! r3lo
|
||
|
||
std $s1hi,[$ctx+8*13]
|
||
std $s2hi,[$ctx+8*15]
|
||
std $s3hi,[$ctx+8*17]
|
||
|
||
std $s1lo,[$ctx+8*12]
|
||
std $s2lo,[$ctx+8*14]
|
||
std $s3lo,[$ctx+8*16]
|
||
|
||
add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
|
||
add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
|
||
STPTR %o0,[%i2]
|
||
STPTR %o1,[%i2+SIZE_T]
|
||
|
||
ret
|
||
restore %g0,1,%o0 ! return 1
|
||
|
||
.Lno_key_fma:
|
||
ret
|
||
restore %g0,%g0,%o0 ! return 0
|
||
.size poly1305_init_fma,.-poly1305_init_fma
|
||
|
||
.align 32
|
||
poly1305_blocks_fma:
|
||
save %sp,-STACK_FRAME-48,%sp
|
||
srlx $len,4,$len
|
||
|
||
brz,pn $len,.Labort
|
||
sub $len,1,$len
|
||
|
||
1: call .+8
|
||
add %o7,.Lconsts_fma-1b,%o7
|
||
|
||
ldd [%o7+8*0],$two0 ! load constants
|
||
ldd [%o7+8*1],$two32
|
||
ldd [%o7+8*2],$two64
|
||
ldd [%o7+8*3],$two96
|
||
ldd [%o7+8*4],$two130
|
||
ldd [%o7+8*5],$five_two130
|
||
|
||
ldd [$ctx+8*0],$h0lo ! load [biased] hash value
|
||
ldd [$ctx+8*1],$h1lo
|
||
ldd [$ctx+8*2],$h2lo
|
||
ldd [$ctx+8*3],$h3lo
|
||
|
||
std $two0,[%sp+LOCALS+8*0] ! input "template"
|
||
sethi %hi((1023+52+96)<<20),$in3
|
||
std $two32,[%sp+LOCALS+8*1]
|
||
or $padbit,$in3,$in3
|
||
std $two64,[%sp+LOCALS+8*2]
|
||
st $in3,[%sp+LOCALS+8*3]
|
||
|
||
and $inp,7,$shr
|
||
andn $inp,7,$inp ! align pointer
|
||
mov 8,$i1
|
||
sll $shr,3,$shr
|
||
mov 16,$step
|
||
neg $shr,$shl
|
||
|
||
ldxa [$inp+%g0]0x88,$in0 ! load little-endian input
|
||
brz $shr,.Linp_aligned_fma
|
||
ldxa [$inp+$i1]0x88,$in2
|
||
|
||
ldxa [$inp+$step]0x88,$in4
|
||
add $inp,8,$inp
|
||
|
||
srlx $in0,$shr,$in0 ! align data
|
||
sllx $in2,$shl,$in1
|
||
srlx $in2,$shr,$in2
|
||
or $in1,$in0,$in0
|
||
sllx $in4,$shl,$in3
|
||
srlx $in4,$shr,$in4 ! pre-shift
|
||
or $in3,$in2,$in2
|
||
|
||
.Linp_aligned_fma:
|
||
srlx $in0,32,$in1
|
||
movrz $len,0,$step
|
||
srlx $in2,32,$in3
|
||
add $step,$inp,$inp ! conditional advance
|
||
|
||
st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
|
||
st $in1,[%sp+LOCALS+8*1+4]
|
||
st $in2,[%sp+LOCALS+8*2+4]
|
||
st $in3,[%sp+LOCALS+8*3+4]
|
||
|
||
ldd [$ctx+8*4],$r0lo ! load key
|
||
ldd [$ctx+8*5],$r0hi
|
||
ldd [$ctx+8*6],$r1lo
|
||
ldd [$ctx+8*7],$r1hi
|
||
ldd [$ctx+8*8],$r2lo
|
||
ldd [$ctx+8*9],$r2hi
|
||
ldd [$ctx+8*10],$r3lo
|
||
ldd [$ctx+8*11],$r3hi
|
||
ldd [$ctx+8*12],$s1lo
|
||
ldd [$ctx+8*13],$s1hi
|
||
ldd [$ctx+8*14],$s2lo
|
||
ldd [$ctx+8*15],$s2hi
|
||
ldd [$ctx+8*16],$s3lo
|
||
ldd [$ctx+8*17],$s3hi
|
||
|
||
stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
|
||
ldx [%o7+8*6],%fsr ! load new %fsr
|
||
|
||
subcc $len,1,$len
|
||
movrz $len,0,$step
|
||
|
||
ldd [%sp+LOCALS+8*0],$x0 ! load biased input
|
||
ldd [%sp+LOCALS+8*1],$x1
|
||
ldd [%sp+LOCALS+8*2],$x2
|
||
ldd [%sp+LOCALS+8*3],$x3
|
||
|
||
fsubd $h0lo,$two0, $h0lo ! de-bias hash value
|
||
fsubd $h1lo,$two32,$h1lo
|
||
ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
|
||
fsubd $h2lo,$two64,$h2lo
|
||
fsubd $h3lo,$two96,$h3lo
|
||
ldxa [$inp+$i1]0x88,$in2
|
||
|
||
fsubd $x0,$two0, $x0 ! de-bias input
|
||
fsubd $x1,$two32,$x1
|
||
fsubd $x2,$two64,$x2
|
||
fsubd $x3,$two96,$x3
|
||
|
||
brz $shr,.Linp_aligned_fma2
|
||
add $step,$inp,$inp ! conditional advance
|
||
|
||
sllx $in0,$shl,$in1 ! align data
|
||
srlx $in0,$shr,$in3
|
||
or $in1,$in4,$in0
|
||
sllx $in2,$shl,$in1
|
||
srlx $in2,$shr,$in4 ! pre-shift
|
||
or $in3,$in1,$in2
|
||
.Linp_aligned_fma2:
|
||
srlx $in0,32,$in1
|
||
srlx $in2,32,$in3
|
||
|
||
faddd $h0lo,$x0,$x0 ! accumulate input
|
||
stw $in0,[%sp+LOCALS+8*0+4]
|
||
faddd $h1lo,$x1,$x1
|
||
stw $in1,[%sp+LOCALS+8*1+4]
|
||
faddd $h2lo,$x2,$x2
|
||
stw $in2,[%sp+LOCALS+8*2+4]
|
||
faddd $h3lo,$x3,$x3
|
||
stw $in3,[%sp+LOCALS+8*3+4]
|
||
|
||
b .Lentry_fma
|
||
nop
|
||
|
||
.align 16
|
||
.Loop_fma:
|
||
ldxa [$inp+%g0]0x88,$in0 ! modulo-scheduled input load
|
||
ldxa [$inp+$i1]0x88,$in2
|
||
movrz $len,0,$step
|
||
|
||
faddd $y0,$h0lo,$h0lo ! accumulate input
|
||
faddd $y1,$h0hi,$h0hi
|
||
faddd $y2,$h2lo,$h2lo
|
||
faddd $y3,$h2hi,$h2hi
|
||
|
||
brz,pn $shr,.Linp_aligned_fma3
|
||
add $step,$inp,$inp ! conditional advance
|
||
|
||
sllx $in0,$shl,$in1 ! align data
|
||
srlx $in0,$shr,$in3
|
||
or $in1,$in4,$in0
|
||
sllx $in2,$shl,$in1
|
||
srlx $in2,$shr,$in4 ! pre-shift
|
||
or $in3,$in1,$in2
|
||
|
||
.Linp_aligned_fma3:
|
||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
|
||
faddd $two64,$h1lo,$c1lo
|
||
srlx $in0,32,$in1
|
||
faddd $two64,$h1hi,$c1hi
|
||
srlx $in2,32,$in3
|
||
faddd $two130,$h3lo,$c3lo
|
||
st $in0,[%sp+LOCALS+8*0+4] ! fill "template"
|
||
faddd $two130,$h3hi,$c3hi
|
||
st $in1,[%sp+LOCALS+8*1+4]
|
||
faddd $two32,$h0lo,$c0lo
|
||
st $in2,[%sp+LOCALS+8*2+4]
|
||
faddd $two32,$h0hi,$c0hi
|
||
st $in3,[%sp+LOCALS+8*3+4]
|
||
faddd $two96,$h2lo,$c2lo
|
||
faddd $two96,$h2hi,$c2hi
|
||
|
||
fsubd $c1lo,$two64,$c1lo
|
||
fsubd $c1hi,$two64,$c1hi
|
||
fsubd $c3lo,$two130,$c3lo
|
||
fsubd $c3hi,$two130,$c3hi
|
||
fsubd $c0lo,$two32,$c0lo
|
||
fsubd $c0hi,$two32,$c0hi
|
||
fsubd $c2lo,$two96,$c2lo
|
||
fsubd $c2hi,$two96,$c2hi
|
||
|
||
fsubd $h1lo,$c1lo,$h1lo
|
||
fsubd $h1hi,$c1hi,$h1hi
|
||
fsubd $h3lo,$c3lo,$h3lo
|
||
fsubd $h3hi,$c3hi,$h3hi
|
||
fsubd $h2lo,$c2lo,$h2lo
|
||
fsubd $h2hi,$c2hi,$h2hi
|
||
fsubd $h0lo,$c0lo,$h0lo
|
||
fsubd $h0hi,$c0hi,$h0hi
|
||
|
||
faddd $h1lo,$c0lo,$h1lo
|
||
faddd $h1hi,$c0hi,$h1hi
|
||
faddd $h3lo,$c2lo,$h3lo
|
||
faddd $h3hi,$c2hi,$h3hi
|
||
faddd $h2lo,$c1lo,$h2lo
|
||
faddd $h2hi,$c1hi,$h2hi
|
||
fmaddd $five_two130,$c3lo,$h0lo,$h0lo
|
||
fmaddd $five_two130,$c3hi,$h0hi,$h0hi
|
||
|
||
faddd $h1lo,$h1hi,$x1
|
||
ldd [$ctx+8*12],$s1lo ! reload constants
|
||
faddd $h3lo,$h3hi,$x3
|
||
ldd [$ctx+8*13],$s1hi
|
||
faddd $h2lo,$h2hi,$x2
|
||
ldd [$ctx+8*10],$r3lo
|
||
faddd $h0lo,$h0hi,$x0
|
||
ldd [$ctx+8*11],$r3hi
|
||
|
||
.Lentry_fma:
|
||
fmuld $x1,$s3lo,$h0lo
|
||
fmuld $x1,$s3hi,$h0hi
|
||
fmuld $x1,$r1lo,$h2lo
|
||
fmuld $x1,$r1hi,$h2hi
|
||
fmuld $x1,$r0lo,$h1lo
|
||
fmuld $x1,$r0hi,$h1hi
|
||
fmuld $x1,$r2lo,$h3lo
|
||
fmuld $x1,$r2hi,$h3hi
|
||
|
||
fmaddd $x3,$s1lo,$h0lo,$h0lo
|
||
fmaddd $x3,$s1hi,$h0hi,$h0hi
|
||
fmaddd $x3,$s3lo,$h2lo,$h2lo
|
||
fmaddd $x3,$s3hi,$h2hi,$h2hi
|
||
fmaddd $x3,$s2lo,$h1lo,$h1lo
|
||
fmaddd $x3,$s2hi,$h1hi,$h1hi
|
||
fmaddd $x3,$r0lo,$h3lo,$h3lo
|
||
fmaddd $x3,$r0hi,$h3hi,$h3hi
|
||
|
||
fmaddd $x2,$s2lo,$h0lo,$h0lo
|
||
fmaddd $x2,$s2hi,$h0hi,$h0hi
|
||
fmaddd $x2,$r0lo,$h2lo,$h2lo
|
||
fmaddd $x2,$r0hi,$h2hi,$h2hi
|
||
fmaddd $x2,$s3lo,$h1lo,$h1lo
|
||
ldd [%sp+LOCALS+8*0],$y0 ! load [biased] input
|
||
fmaddd $x2,$s3hi,$h1hi,$h1hi
|
||
ldd [%sp+LOCALS+8*1],$y1
|
||
fmaddd $x2,$r1lo,$h3lo,$h3lo
|
||
ldd [%sp+LOCALS+8*2],$y2
|
||
fmaddd $x2,$r1hi,$h3hi,$h3hi
|
||
ldd [%sp+LOCALS+8*3],$y3
|
||
|
||
fmaddd $x0,$r0lo,$h0lo,$h0lo
|
||
fsubd $y0,$two0, $y0 ! de-bias input
|
||
fmaddd $x0,$r0hi,$h0hi,$h0hi
|
||
fsubd $y1,$two32,$y1
|
||
fmaddd $x0,$r2lo,$h2lo,$h2lo
|
||
fsubd $y2,$two64,$y2
|
||
fmaddd $x0,$r2hi,$h2hi,$h2hi
|
||
fsubd $y3,$two96,$y3
|
||
fmaddd $x0,$r1lo,$h1lo,$h1lo
|
||
fmaddd $x0,$r1hi,$h1hi,$h1hi
|
||
fmaddd $x0,$r3lo,$h3lo,$h3lo
|
||
fmaddd $x0,$r3hi,$h3hi,$h3hi
|
||
|
||
bcc SIZE_T_CC,.Loop_fma
|
||
subcc $len,1,$len
|
||
|
||
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
|
||
faddd $h0lo,$two32,$c0lo
|
||
faddd $h0hi,$two32,$c0hi
|
||
faddd $h2lo,$two96,$c2lo
|
||
faddd $h2hi,$two96,$c2hi
|
||
faddd $h1lo,$two64,$c1lo
|
||
faddd $h1hi,$two64,$c1hi
|
||
faddd $h3lo,$two130,$c3lo
|
||
faddd $h3hi,$two130,$c3hi
|
||
|
||
fsubd $c0lo,$two32,$c0lo
|
||
fsubd $c0hi,$two32,$c0hi
|
||
fsubd $c2lo,$two96,$c2lo
|
||
fsubd $c2hi,$two96,$c2hi
|
||
fsubd $c1lo,$two64,$c1lo
|
||
fsubd $c1hi,$two64,$c1hi
|
||
fsubd $c3lo,$two130,$c3lo
|
||
fsubd $c3hi,$two130,$c3hi
|
||
|
||
fsubd $h1lo,$c1lo,$h1lo
|
||
fsubd $h1hi,$c1hi,$h1hi
|
||
fsubd $h3lo,$c3lo,$h3lo
|
||
fsubd $h3hi,$c3hi,$h3hi
|
||
fsubd $h2lo,$c2lo,$h2lo
|
||
fsubd $h2hi,$c2hi,$h2hi
|
||
fsubd $h0lo,$c0lo,$h0lo
|
||
fsubd $h0hi,$c0hi,$h0hi
|
||
|
||
faddd $h1lo,$c0lo,$h1lo
|
||
faddd $h1hi,$c0hi,$h1hi
|
||
faddd $h3lo,$c2lo,$h3lo
|
||
faddd $h3hi,$c2hi,$h3hi
|
||
faddd $h2lo,$c1lo,$h2lo
|
||
faddd $h2hi,$c1hi,$h2hi
|
||
fmaddd $five_two130,$c3lo,$h0lo,$h0lo
|
||
fmaddd $five_two130,$c3hi,$h0hi,$h0hi
|
||
|
||
faddd $h1lo,$h1hi,$x1
|
||
faddd $h3lo,$h3hi,$x3
|
||
faddd $h2lo,$h2hi,$x2
|
||
faddd $h0lo,$h0hi,$x0
|
||
|
||
faddd $x1,$two32,$x1 ! bias
|
||
faddd $x3,$two96,$x3
|
||
faddd $x2,$two64,$x2
|
||
faddd $x0,$two0, $x0
|
||
|
||
ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
|
||
|
||
std $x1,[$ctx+8*1] ! store [biased] hash value
|
||
std $x3,[$ctx+8*3]
|
||
std $x2,[$ctx+8*2]
|
||
std $x0,[$ctx+8*0]
|
||
|
||
.Labort:
|
||
ret
|
||
restore
|
||
.size poly1305_blocks_fma,.-poly1305_blocks_fma
|
||
___
|
||
{
|
||
my ($mac,$nonce)=($inp,$len);
|
||
|
||
my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
|
||
) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
|
||
|
||
$code.=<<___;
|
||
.align 32
|
||
poly1305_emit_fma:
|
||
save %sp,-STACK_FRAME,%sp
|
||
|
||
ld [$ctx+8*0+0],$d0 ! load hash
|
||
ld [$ctx+8*0+4],$h0
|
||
ld [$ctx+8*1+0],$d1
|
||
ld [$ctx+8*1+4],$h1
|
||
ld [$ctx+8*2+0],$d2
|
||
ld [$ctx+8*2+4],$h2
|
||
ld [$ctx+8*3+0],$d3
|
||
ld [$ctx+8*3+4],$h3
|
||
|
||
sethi %hi(0xfff00000),$mask
|
||
andn $d0,$mask,$d0 ! mask exponent
|
||
andn $d1,$mask,$d1
|
||
andn $d2,$mask,$d2
|
||
andn $d3,$mask,$d3 ! can be partially reduced...
|
||
mov 3,$mask
|
||
|
||
srl $d3,2,$padbit ! ... so reduce
|
||
and $d3,$mask,$h4
|
||
andn $d3,$mask,$d3
|
||
add $padbit,$d3,$d3
|
||
|
||
addcc $d3,$h0,$h0
|
||
addccc $d0,$h1,$h1
|
||
addccc $d1,$h2,$h2
|
||
addccc $d2,$h3,$h3
|
||
addc %g0,$h4,$h4
|
||
|
||
addcc $h0,5,$d0 ! compare to modulus
|
||
addccc $h1,0,$d1
|
||
addccc $h2,0,$d2
|
||
addccc $h3,0,$d3
|
||
addc $h4,0,$mask
|
||
|
||
srl $mask,2,$mask ! did it carry/borrow?
|
||
neg $mask,$mask
|
||
sra $mask,31,$mask ! mask
|
||
|
||
andn $h0,$mask,$h0
|
||
and $d0,$mask,$d0
|
||
andn $h1,$mask,$h1
|
||
and $d1,$mask,$d1
|
||
or $d0,$h0,$h0
|
||
ld [$nonce+0],$d0 ! load nonce
|
||
andn $h2,$mask,$h2
|
||
and $d2,$mask,$d2
|
||
or $d1,$h1,$h1
|
||
ld [$nonce+4],$d1
|
||
andn $h3,$mask,$h3
|
||
and $d3,$mask,$d3
|
||
or $d2,$h2,$h2
|
||
ld [$nonce+8],$d2
|
||
or $d3,$h3,$h3
|
||
ld [$nonce+12],$d3
|
||
|
||
addcc $d0,$h0,$h0 ! accumulate nonce
|
||
addccc $d1,$h1,$h1
|
||
addccc $d2,$h2,$h2
|
||
addc $d3,$h3,$h3
|
||
|
||
stb $h0,[$mac+0] ! write little-endian result
|
||
srl $h0,8,$h0
|
||
stb $h1,[$mac+4]
|
||
srl $h1,8,$h1
|
||
stb $h2,[$mac+8]
|
||
srl $h2,8,$h2
|
||
stb $h3,[$mac+12]
|
||
srl $h3,8,$h3
|
||
|
||
stb $h0,[$mac+1]
|
||
srl $h0,8,$h0
|
||
stb $h1,[$mac+5]
|
||
srl $h1,8,$h1
|
||
stb $h2,[$mac+9]
|
||
srl $h2,8,$h2
|
||
stb $h3,[$mac+13]
|
||
srl $h3,8,$h3
|
||
|
||
stb $h0,[$mac+2]
|
||
srl $h0,8,$h0
|
||
stb $h1,[$mac+6]
|
||
srl $h1,8,$h1
|
||
stb $h2,[$mac+10]
|
||
srl $h2,8,$h2
|
||
stb $h3,[$mac+14]
|
||
srl $h3,8,$h3
|
||
|
||
stb $h0,[$mac+3]
|
||
stb $h1,[$mac+7]
|
||
stb $h2,[$mac+11]
|
||
stb $h3,[$mac+15]
|
||
|
||
ret
|
||
restore
|
||
.size poly1305_emit_fma,.-poly1305_emit_fma
|
||
___
|
||
}
|
||
|
||
# Constant pool for the FMA code path. The generated code addresses it
# as [%o7+8*N], so each entry below occupies one 8-byte slot and the
# slot order must not change:
#   N=0..3   2^(52+0), 2^(52+32), 2^(52+64), 2^(52+96)
#            - loaded as $two0, $two32, $two64, $two96
#   N=4      2^(52+130)  - loaded as $two130 by poly1305_blocks_fma
#   N=5      5/2^130     - loaded as $five_two130
#   N=6      value loaded into %fsr (see the inline comment on that line)
#   N=7..10  2^(52+16+32*k),    k=0..3 - loaded by poly1305_init_fma
#   N=11..13 2^(52+16+32*k-96), k=0..2 - loaded by poly1305_init_fma
# The table is followed by the module identification string.
$code.=<<___;
|
||
.align 64
|
||
.Lconsts_fma:
|
||
.word 0x43300000,0x00000000 ! 2^(52+0)
|
||
.word 0x45300000,0x00000000 ! 2^(52+32)
|
||
.word 0x47300000,0x00000000 ! 2^(52+64)
|
||
.word 0x49300000,0x00000000 ! 2^(52+96)
|
||
.word 0x4b500000,0x00000000 ! 2^(52+130)
|
||
|
||
.word 0x37f40000,0x00000000 ! 5/2^130
|
||
.word 0,1<<30 ! fsr: truncate, no exceptions
|
||
|
||
.word 0x44300000,0x00000000 ! 2^(52+16+0)
|
||
.word 0x46300000,0x00000000 ! 2^(52+16+32)
|
||
.word 0x48300000,0x00000000 ! 2^(52+16+64)
|
||
.word 0x4a300000,0x00000000 ! 2^(52+16+96)
|
||
.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
|
||
.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
|
||
.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
|
||
.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
|
||
.align 4
|
||
___
|
||
}
|
||
|
||
# Purpose of these subroutines is to explicitly encode VIS instructions,
|
||
# so that one can compile the module without having to specify VIS
|
||
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
|
||
# The idea is to keep open the option of producing a "universal" binary and let
|
||
# programmer detect if current CPU is VIS capable at run-time.
|
||
# Encode a three-operand VIS3 integer instruction as a raw ".word"
# directive, so the output assembles without VIS3 support enabled in the
# assembler.
#
# Arguments: ($mnemonic, $rs1, $rs2, $rd), where the registers are
#            written as %g0-%g7, %o0-%o7, %l0-%l7 or %i0-%i7.
# Returns:   ".word\t0x<encoding> !<original instruction>" when the
#            mnemonic is one of addxc/addxccc/umulxhi and all three
#            operands are valid integer registers; otherwise the
#            instruction text "<mnemonic>\t<rs1>,<rs2>,<rd>" unchanged,
#            leaving it to the assembler to accept or reject it.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;

    # Register-window letter -> base of its 5-bit register number.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    # opf field values for the VIS3 instructions this module emits.
    my %visopf = ( "addxc"   => 0x011,
                   "addxccc" => 0x013,
                   "umulxhi" => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    my @regno;
    foreach my $reg ($rs1,$rs2,$rd) {
        # Anchored and limited to 0-7: an unanchored [0-9] match would
        # turn e.g. %g9 into the encoding of a different register.
        return $ref unless $reg =~ /\A%([goli])([0-7])\z/;
        push @regno, $bias{$1}+$2;
    }
    my ($n_rs1,$n_rs2,$n_rd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$n_rd<<25|$n_rs1<<14|$opf<<5|$n_rs2,
                   $ref;
}
|
||
|
||
# Encode a four-operand fused multiply-add instruction as a raw ".word"
# directive, so the output assembles without FMA support enabled in the
# assembler.
#
# Arguments: ($mnemonic, $rs1, $rs2, $rs3, $rd), registers written as
#            %fN.
# Returns:   ".word\t0x<encoding> !<original instruction>" when the
#            mnemonic is one of fmadds/fmaddd/fmsubs/fmsubd and every
#            operand is an encodable %f register; otherwise the
#            instruction text unchanged, leaving it to the assembler to
#            accept or reject it.
sub unfma {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;

    my %fmaopf = ( "fmadds" => 0x1,
                   "fmaddd" => 0x2,
                   "fmsubs" => 0x5,
                   "fmsubd" => 0x6 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic};
    return $ref unless $opf;

    my @regno;
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
        # Anchored so that e.g. %f100 is not silently read as %f10.
        return $ref unless $reg =~ /\A%f([0-9]{1,2})\z/;
        my $n = $1 + 0;
        if ($n >= 32) {
            # Upper registers must be even-numbered and no higher than
            # %f62; anything else would wrap onto another register's
            # encoding.
            return $ref if ($n & 1) || $n > 62;
            # re-encode for upper double register addressing
            $n = ($n | $n>>5) & 31;
        }
        push @regno, $n;
    }
    my ($n_rs1,$n_rs2,$n_rs3,$n_rd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b80000|$n_rd<<25|$n_rs1<<14|$n_rs3<<9|$opf<<5|$n_rs2,
                   $ref;
}
|
||
|
||
# Post-process the accumulated assembly text line by line and write it
# to STDOUT:
#   1. evaluate every `...` span as a Perl expression and substitute the
#      result (used for computed offsets in the templates);
#   2. replace VIS3 integer instructions, or failing that FMA
#      instructions, with their hand-encoded ".word" form.
for my $line (split("\n",$code)) {
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    $line =~ s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                unvis3($1,$2,$3,$4)
              /ge or
    $line =~ s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
                unfma($1,$2,$3,$4,$5)
              /ge;

    print $line,"\n";
}

# Buffered write errors (full disk, closed pipe) only surface at close;
# fail loudly rather than leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
|