Remove trailing whitespace from some files.

The prevailing style seems to be to avoid trailing whitespace, but a
few lines have it. This is mostly in the perlasm files, though a few C
files picked it up after the reformat. This is the result of:

  find . -name '*.pl' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.c' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'
  find . -name '*.h' | xargs sed -E -i '' -e 's/( |'$'\t'')*$//'

bn_prime.h was then excluded, since it is a generated file.

Note that mkerr.pl has some changes in a heredoc for its help output,
but the other lines there lack trailing whitespace too.

Reviewed-by: Kurt Roeckx <kurt@openssl.org>
Reviewed-by: Matt Caswell <matt@openssl.org>
Authored by David Benjamin on 2016-10-10 12:01:24 -04:00; committed by Matt Caswell
parent 11542af65a
commit 609b0852e4
95 changed files with 390 additions and 390 deletions
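
For reference, a roughly equivalent cleanup using GNU sed (the commands
in the message use BSD sed's "-i ''" form), plus a check that nothing
was missed. This is a sketch, not the invocation from the commit, and
the bn_prime.h path is an assumption:

  # GNU sed takes -i without the empty suffix argument;
  # [[:blank:]] matches spaces and tabs.
  find . -name '*.pl' -o -name '*.c' -o -name '*.h' \
    | xargs sed -E -i -e 's/[[:blank:]]+$//'

  # Restore the generated file, as the commit did for bn_prime.h:
  git checkout -- crypto/bn/bn_prime.h

  # Verify no trailing whitespace remains:
  grep -rnE '[[:blank:]]+$' --include='*.pl' --include='*.c' --include='*.h' . \
    || echo clean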


@@ -146,7 +146,7 @@ OPTIONS cms_options[] = {
"Do not load certificates from the default certificates directory"},
{"content", OPT_CONTENT, '<',
"Supply or override content for detached signature"},
{"print", OPT_PRINT, '-',
{"print", OPT_PRINT, '-',
"For the -cmsout operation print out all fields of the CMS structure"},
{"secretkey", OPT_SECRETKEY, 's'},
{"secretkeyid", OPT_SECRETKEYID, 's'},


@@ -89,7 +89,7 @@ OPTIONS smime_options[] = {
{"no-CApath", OPT_NOCAPATH, '-',
"Do not load certificates from the default certificates directory"},
{"resign", OPT_RESIGN, '-', "Resign a signed message"},
{"nochain", OPT_NOCHAIN, '-',
{"nochain", OPT_NOCHAIN, '-',
"set PKCS7_NOCHAIN so certificates contained in the message are not used as untrusted CAs" },
{"nosmimecap", OPT_NOSMIMECAP, '-', "Omit the SMIMECapabilities attribute"},
{"stream", OPT_STREAM, '-', "Enable CMS streaming" },


@@ -1187,8 +1187,8 @@ static int run_benchmark(int async_jobs,
continue;
#endif
ret = ASYNC_start_job(&loopargs[i].inprogress_job,
loopargs[i].wait_ctx, &job_op_count, loop_function,
ret = ASYNC_start_job(&loopargs[i].inprogress_job,
loopargs[i].wait_ctx, &job_op_count, loop_function,
(void *)(loopargs + i), sizeof(loopargs_t));
switch (ret) {
case ASYNC_PAUSE:


@@ -123,7 +123,7 @@
# words every cache-line is *guaranteed* to be accessed within ~50
# cycles window. Why just SSE? Because it's needed on hyper-threading
# CPU! Which is also why it's prefetched with 64 byte stride. Best
# part is that it has no negative effect on performance:-)
# part is that it has no negative effect on performance:-)
#
# Version 4.3 implements switch between compact and non-compact block
# functions in AES_cbc_encrypt depending on how much data was asked
@@ -585,7 +585,7 @@ sub enctransform()
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | mm4 | mm0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# | s3 | s2 | s1 | s0 |
# | s3 | s2 | s1 | s0 |
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
@@ -805,7 +805,7 @@ sub encstep()
if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]);
else { &mov ($tmp,$s[3]);
&shr ($tmp,24) }
&xor ($out,&DWP(1,$te,$tmp,8));
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
@@ -1558,7 +1558,7 @@ sub sse_deccompact()
&pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
&pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
&pxor ("mm3","mm3"); &pxor ("mm7","mm7");
&pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
@@ -2028,7 +2028,7 @@ sub declast()
{
# stack frame layout
# -4(%esp) # return address 0(%esp)
# 0(%esp) # s0 backing store 4(%esp)
# 0(%esp) # s0 backing store 4(%esp)
# 4(%esp) # s1 backing store 8(%esp)
# 8(%esp) # s2 backing store 12(%esp)
# 12(%esp) # s3 backing store 16(%esp)
@@ -2738,7 +2738,7 @@ sub enckey()
&mov (&DWP(80,"edi"),10); # setup number of rounds
&xor ("eax","eax");
&jmp (&label("exit"));
&set_label("12rounds");
&mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
&mov ("ebx",&DWP(4,"esi"));


@@ -1433,10 +1433,10 @@ $code.=<<___;
xor $s1,$s1,$acc05
xor $s2,$s2,$acc06
xor $s3,$s3,$acc07
xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
xor $s1,$s1,$acc09
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
xor $s1,$s1,$acc09
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
b Ldec_compact_loop
.align 4


@@ -404,7 +404,7 @@ _s390x_AES_encrypt:
or $s1,$t1
or $t2,$i2
or $t3,$i3
srlg $i1,$s2,`8-3` # i0
srlg $i2,$s2,`16-3` # i1
nr $i1,$mask
@@ -457,7 +457,7 @@ _s390x_AES_encrypt:
x $s2,24($key)
x $s3,28($key)
br $ra
br $ra
.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
___
@@ -779,7 +779,7 @@ _s390x_AES_decrypt:
x $s2,24($key)
x $s3,28($key)
br $ra
br $ra
.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
___
@@ -1297,7 +1297,7 @@ $code.=<<___;
.Lcbc_enc_done:
l${g} $ivp,6*$SIZE_T($sp)
st $s0,0($ivp)
st $s1,4($ivp)
st $s1,4($ivp)
st $s2,8($ivp)
st $s3,12($ivp)
@@ -1635,7 +1635,7 @@ $code.=<<___ if(1);
llgc $len,2*$SIZE_T-1($sp)
nill $len,0x0f # $len%=16
br $ra
.align 16
.Lxts_km_vanilla:
___
@@ -1862,7 +1862,7 @@ $code.=<<___;
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
@@ -1913,7 +1913,7 @@ $code.=<<___;
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32
@@ -2105,7 +2105,7 @@ $code.=<<___;
xgr $s1,%r1
lrvgr $s1,$s1 # flip byte order
lrvgr $s3,$s3
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
stg $s1,$tweak+0($sp) # save the tweak
llgfr $s1,$s1
srlg $s2,$s3,32


@@ -1298,7 +1298,7 @@ $code.=<<___;
AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
push %r12 # redundant, but allows to share
push %r13 # exception handler...
push %r14
push %r15
@@ -1424,7 +1424,7 @@ $code.=<<___;
xor %rax,%rax
jmp .Lexit
.L14rounds:
.L14rounds:
mov 0(%rsi),%rax # copy first 8 dwords
mov 8(%rsi),%rbx
mov 16(%rsi),%rcx


@@ -134,7 +134,7 @@ $code.=<<___ if ($win64);
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
@@ -308,9 +308,9 @@ $code.=<<___;
movups @out[0],-16(@outptr[0],$offset)
pxor @inp[0],@out[0]
movups @out[1],-16(@outptr[1],$offset)
movups @out[1],-16(@outptr[1],$offset)
pxor @inp[1],@out[1]
movups @out[2],-16(@outptr[2],$offset)
movups @out[2],-16(@outptr[2],$offset)
pxor @inp[2],@out[2]
movups @out[3],-16(@outptr[3],$offset)
pxor @inp[3],@out[3]
@@ -393,7 +393,7 @@ $code.=<<___ if ($win64);
movaps %xmm10,0x40(%rsp)
movaps %xmm11,0x50(%rsp)
movaps %xmm12,0x60(%rsp)
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
___
@@ -563,10 +563,10 @@ $code.=<<___;
movups @out[0],-16(@outptr[0],$offset)
movdqu (@inptr[0],$offset),@out[0]
movups @out[1],-16(@outptr[1],$offset)
movups @out[1],-16(@outptr[1],$offset)
movdqu (@inptr[1],$offset),@out[1]
pxor $zero,@out[0]
movups @out[2],-16(@outptr[2],$offset)
movups @out[2],-16(@outptr[2],$offset)
movdqu (@inptr[2],$offset),@out[2]
pxor $zero,@out[1]
movups @out[3],-16(@outptr[3],$offset)
@@ -835,10 +835,10 @@ $code.=<<___;
vmovups @out[0],-16(@ptr[0]) # write output
sub $offset,@ptr[0] # switch to input
vpxor 0x00($offload),@out[0],@out[0]
vmovups @out[1],-16(@ptr[1])
vmovups @out[1],-16(@ptr[1])
sub `64+1*8`(%rsp),@ptr[1]
vpxor 0x10($offload),@out[1],@out[1]
vmovups @out[2],-16(@ptr[2])
vmovups @out[2],-16(@ptr[2])
sub `64+2*8`(%rsp),@ptr[2]
vpxor 0x20($offload),@out[2],@out[2]
vmovups @out[3],-16(@ptr[3])
@@ -847,10 +847,10 @@ $code.=<<___;
vmovups @out[4],-16(@ptr[4])
sub `64+4*8`(%rsp),@ptr[4]
vpxor @inp[0],@out[4],@out[4]
vmovups @out[5],-16(@ptr[5])
vmovups @out[5],-16(@ptr[5])
sub `64+5*8`(%rsp),@ptr[5]
vpxor @inp[1],@out[5],@out[5]
vmovups @out[6],-16(@ptr[6])
vmovups @out[6],-16(@ptr[6])
sub `64+6*8`(%rsp),@ptr[6]
vpxor @inp[2],@out[6],@out[6]
vmovups @out[7],-16(@ptr[7])
@@ -1128,12 +1128,12 @@ $code.=<<___;
sub $offset,@ptr[0] # switch to input
vmovdqu 128+0(%rsp),@out[0]
vpxor 0x70($offload),@out[7],@out[7]
vmovups @out[1],-16(@ptr[1])
vmovups @out[1],-16(@ptr[1])
sub `64+1*8`(%rsp),@ptr[1]
vmovdqu @out[0],0x00($offload)
vpxor $zero,@out[0],@out[0]
vmovdqu 128+16(%rsp),@out[1]
vmovups @out[2],-16(@ptr[2])
vmovups @out[2],-16(@ptr[2])
sub `64+2*8`(%rsp),@ptr[2]
vmovdqu @out[1],0x10($offload)
vpxor $zero,@out[1],@out[1]
@@ -1149,11 +1149,11 @@ $code.=<<___;
vpxor $zero,@out[3],@out[3]
vmovdqu @inp[0],0x40($offload)
vpxor @inp[0],$zero,@out[4]
vmovups @out[5],-16(@ptr[5])
vmovups @out[5],-16(@ptr[5])
sub `64+5*8`(%rsp),@ptr[5]
vmovdqu @inp[1],0x50($offload)
vpxor @inp[1],$zero,@out[5]
vmovups @out[6],-16(@ptr[6])
vmovups @out[6],-16(@ptr[6])
sub `64+6*8`(%rsp),@ptr[6]
vmovdqu @inp[2],0x60($offload)
vpxor @inp[2],$zero,@out[6]


@@ -793,7 +793,7 @@ sub body_00_19_dec () { # ((c^d)&b)^d
sub body_20_39_dec () { # b^d^c
# on entry @T[0]=b^d
return &body_40_59_dec() if ($rx==39);
my @r=@body_20_39;
unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]);


@@ -884,7 +884,7 @@ if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;


@@ -1051,7 +1051,7 @@ if ($PREFIX eq "aesni") {
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }


@@ -34,7 +34,7 @@
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
#
@@ -118,7 +118,7 @@
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 come from. "Optimal
# interleave factor" means that increase of interleave factor does
@@ -312,7 +312,7 @@ ___
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge and "super-optimal" for other Intel CPUs...
# * Bridge and "super-optimal" for other Intel CPUs...
sub aesni_generate2 {
my $dir=shift;
@@ -1271,7 +1271,7 @@ $code.=<<___;
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
mov OPENSSL_ia32cap_P+4(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
@@ -1551,7 +1551,7 @@ $code.=<<___;
.Lctr32_tail:
# note that at this point $inout0..5 are populated with
# counter values xor-ed with 0-round key
# counter values xor-ed with 0-round key
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3


@@ -3773,7 +3773,7 @@ foreach(split("\n",$code)) {
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
/\?rev/ && do { @bytes=reverse(@bytes); last; };
/\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}


@@ -961,21 +961,21 @@ if ($flavour =~ /64/) { ######## 64-bit code
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
"vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
}
sub unvmov32 {
my $arg=shift;
$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {


@@ -91,7 +91,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
veor @b[2], @b[2], @b[1]


@@ -129,7 +129,7 @@ my @s=@_[12..15];
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
@@ -379,7 +379,7 @@ $code.=<<___;
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
pxor @s[3], @t[0]
pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3


@@ -769,7 +769,7 @@ _vpaes_schedule_core:
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
bl _vpaes_schedule_transform // input transform
mov $inp, #7 // mov \$7, %esi
.Loop_schedule_256:
sub $inp, $inp, #1 // dec %esi
bl _vpaes_schedule_mangle // output low result
@@ -778,7 +778,7 @@ _vpaes_schedule_core:
// high round
bl _vpaes_schedule_round
cbz $inp, .Lschedule_mangle_last
bl _vpaes_schedule_mangle
bl _vpaes_schedule_mangle
// low round. swap xmm7 and xmm6
dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
@@ -787,7 +787,7 @@ _vpaes_schedule_core:
mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
b .Loop_schedule_256
##
@@ -814,7 +814,7 @@ _vpaes_schedule_core:
.Lschedule_mangle_last_dec:
ld1 {v20.2d-v21.2d}, [x11] // reload constants
sub $out, $out, #16 // add \$-16, %rdx
sub $out, $out, #16 // add \$-16, %rdx
eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform // output transform
st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key


@@ -1074,7 +1074,7 @@ Loop_schedule_256:
# high round
bl _vpaes_schedule_round
bdz Lschedule_mangle_last # dec %esi
bl _vpaes_schedule_mangle
bl _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
@@ -1082,7 +1082,7 @@ Loop_schedule_256:
vmr v7, v6 # vmovdqa %xmm6, %xmm7
bl _vpaes_schedule_low_round
vmr v7, v5 # vmovdqa %xmm5, %xmm7
b Loop_schedule_256
##
## .aes_schedule_mangle_last
@@ -1130,7 +1130,7 @@ Lschedule_mangle_last:
Lschedule_mangle_last_dec:
lvx $iptlo, r11, r12 # reload $ipt
lvx $ipthi, r9, r12
addi $out, $out, -16 # add \$-16, %rdx
addi $out, $out, -16 # add \$-16, %rdx
vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
bl _vpaes_schedule_transform # output transform
@@ -1565,7 +1565,7 @@ foreach (split("\n",$code)) {
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
/\?rev/ && do { @bytes=reverse(@bytes); last; };
/\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}


@@ -445,7 +445,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
@@ -476,7 +476,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
@@ -487,7 +487,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
@@ -610,7 +610,7 @@ $k_dsbo=0x2c0; # decryption sbox final output
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k


@@ -171,7 +171,7 @@ _vpaes_encrypt_core:
pshufb %xmm1, %xmm0
ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
##
## Decryption core
##
@@ -332,7 +332,7 @@ _vpaes_schedule_core:
##
.Lschedule_128:
mov \$10, %esi
.Loop_schedule_128:
call _vpaes_schedule_round
dec %rsi
@@ -366,7 +366,7 @@ _vpaes_schedule_core:
.Loop_schedule_192:
call _vpaes_schedule_round
palignr \$8,%xmm6,%xmm0
palignr \$8,%xmm6,%xmm0
call _vpaes_schedule_mangle # save key n
call _vpaes_schedule_192_smear
call _vpaes_schedule_mangle # save key n+1
@@ -392,7 +392,7 @@ _vpaes_schedule_core:
movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
call _vpaes_schedule_transform # input transform
mov \$7, %esi
.Loop_schedule_256:
call _vpaes_schedule_mangle # output low result
movdqa %xmm0, %xmm6 # save cur_lo in xmm6
@@ -401,7 +401,7 @@ _vpaes_schedule_core:
call _vpaes_schedule_round
dec %rsi
jz .Lschedule_mangle_last
call _vpaes_schedule_mangle
call _vpaes_schedule_mangle
# low round. swap xmm7 and xmm6
pshufd \$0xFF, %xmm0, %xmm0
@@ -409,10 +409,10 @@ _vpaes_schedule_core:
movdqa %xmm6, %xmm7
call _vpaes_schedule_low_round
movdqa %xmm5, %xmm7
jmp .Loop_schedule_256
##
## .aes_schedule_mangle_last
##
@@ -511,9 +511,9 @@ _vpaes_schedule_round:
# rotate
pshufd \$0xFF, %xmm0, %xmm0
palignr \$1, %xmm0, %xmm0
# fall through...
# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
# smear xmm7
@@ -552,7 +552,7 @@ _vpaes_schedule_low_round:
pxor %xmm4, %xmm0 # 0 = sbox output
# add in smeared stuff
pxor %xmm7, %xmm0
pxor %xmm7, %xmm0
movdqa %xmm0, %xmm7
ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round


@@ -36,7 +36,7 @@
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
$flavour = shift;


@@ -23,7 +23,7 @@
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less


@@ -54,7 +54,7 @@ sub bn_mul_add_words
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
@@ -675,20 +675,20 @@ sub bn_sub_part_words
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
@@ -705,9 +705,9 @@ sub bn_sub_part_words
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
@@ -722,18 +722,18 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
@@ -754,17 +754,17 @@ sub bn_sub_part_words
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a


@@ -47,7 +47,7 @@ sub mul_add_c
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
@@ -76,7 +76,7 @@ sub sqr_add_c
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
@@ -127,7 +127,7 @@ sub bn_mul_comba
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
@@ -142,9 +142,9 @@ sub bn_mul_comba
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
@@ -152,7 +152,7 @@ sub bn_mul_comba
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{


@@ -80,7 +80,7 @@ $code=<<___;
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
@@ -203,7 +203,7 @@ bn_mul_mont_general:
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;

View file

@@ -603,13 +603,13 @@ $code.=<<___;
sltu $v0,$t2,$ta2
$ST $t2,-2*$BNSZ($a0)
$ADDU $v0,$t8
$ADDU $ta3,$t3
sltu $t9,$ta3,$t3
$ADDU $t3,$ta3,$v0
sltu $v0,$t3,$ta3
$ST $t3,-$BNSZ($a0)
.set noreorder
bgtz $at,.L_bn_add_words_loop
$ADDU $v0,$t9
@@ -808,7 +808,7 @@ bn_div_3_words:
# so that we can save two arguments
# and return address in registers
# instead of stack:-)
$LD $a0,($a3)
move $ta2,$a1
bne $a0,$a2,bn_div_3_words_internal


@@ -546,7 +546,7 @@ L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
@@ -868,7 +868,7 @@ L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
stws,ma $hi0,4($rp)
nop ; alignment
L\$done


@@ -26,7 +26,7 @@
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit +65%
# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
@@ -49,7 +49,7 @@ if ($flavour =~ /32/) {
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
@@ -69,7 +69,7 @@ if ($flavour =~ /32/) {
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }


@@ -38,7 +38,7 @@
#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
#
# Same bechmark with this assembler code:
#
@@ -74,7 +74,7 @@
#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
#
#
# Again, performance increases by at about 75%
#
# Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
@@ -125,7 +125,7 @@ if ($flavour =~ /32/) {
$CNTLZ= "cntlzw"; # count leading zeros
$SHL= "slw"; # shift left
$SHR= "srw"; # unsigned shift right
$SHRI= "srwi"; # unsigned shift right by immediate
$SHRI= "srwi"; # unsigned shift right by immediate
$SHLI= "slwi"; # shift left by immediate
$CLRU= "clrlwi"; # clear upper bits
$INSR= "insrwi"; # insert right
@@ -149,10 +149,10 @@ if ($flavour =~ /32/) {
$CNTLZ= "cntlzd"; # count leading zeros
$SHL= "sld"; # shift left
$SHR= "srd"; # unsigned shift right
$SHRI= "srdi"; # unsigned shift right by immediate
$SHRI= "srdi"; # unsigned shift right by immediate
$SHLI= "sldi"; # shift left by immediate
$CLRU= "clrldi"; # clear upper bits
$INSR= "insrdi"; # insert right
$INSR= "insrdi"; # insert right
$ROTL= "rotldi"; # rotate left by immediate
$TR= "td"; # conditional trap
} else { die "nonsense $flavour"; }
@@ -189,7 +189,7 @@ $data=<<EOF;
# below.
# 12/05/03 Suresh Chari
# (with lots of help from) Andy Polyakov
##
##
# 1. Initial version 10/20/02 Suresh Chari
#
#
@@ -202,7 +202,7 @@ $data=<<EOF;
# be done in the build process.
#
# Hand optimized assembly code for the following routines
#
#
# bn_sqr_comba4
# bn_sqr_comba8
# bn_mul_comba4
@@ -225,10 +225,10 @@ $data=<<EOF;
#--------------------------------------------------------------------------
#
# Defines to be used in the assembly code.
#
#
#.set r0,0 # we use it as storage for value of 0
#.set SP,1 # preserved
#.set RTOC,2 # preserved
#.set RTOC,2 # preserved
#.set r3,3 # 1st argument/return value
#.set r4,4 # 2nd argument/volatile register
#.set r5,5 # 3rd argument/volatile register
@@ -246,7 +246,7 @@ $data=<<EOF;
# the first . i.e. for example change ".bn_sqr_comba4"
# to "bn_sqr_comba4". This should be automatically done
# in the build.
.globl .bn_sqr_comba4
.globl .bn_sqr_comba8
.globl .bn_mul_comba4
@@ -257,9 +257,9 @@ $data=<<EOF;
.globl .bn_sqr_words
.globl .bn_mul_words
.globl .bn_mul_add_words
# .text section
.machine "any"
#
@@ -278,8 +278,8 @@ $data=<<EOF;
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
@@ -288,10 +288,10 @@ $data=<<EOF;
#
xor r0,r0,r0 # set r0 = 0. Used in the addze
# instructions below
#sqr_add_c(a,0,c1,c2,c3)
$LD r5,`0*$BNSZ`(r4)
$UMULL r9,r5,r5
$LD r5,`0*$BNSZ`(r4)
$UMULL r9,r5,r5
$UMULH r10,r5,r5 #in first iteration. No need
#to add since c1=c2=c3=0.
# Note c3(r11) is NOT set to 0
@@ -299,20 +299,20 @@ $data=<<EOF;
$ST r9,`0*$BNSZ`(r3) # r[0]=c1;
# sqr_add_c2(a,1,0,c2,c3,c1);
$LD r6,`1*$BNSZ`(r4)
$LD r6,`1*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
adde r8,r8,r8
addze r9,r0 # catch carry if any.
# r9= r0(=0) and carry
# r9= r0(=0) and carry
addc r10,r7,r10 # now add to temp result.
addze r11,r8 # r8 added to r11 which is 0
addze r11,r8 # r8 added to r11 which is 0
addze r9,r9
$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
$ST r10,`1*$BNSZ`(r3) #r[1]=c2;
#sqr_add_c(a,1,c3,c1,c2)
$UMULL r7,r6,r6
$UMULH r8,r6,r6
@@ -323,23 +323,23 @@ $data=<<EOF;
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r10,r10
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
#sqr_add_c2(a,3,0,c1,c2,c3);
$LD r6,`3*$BNSZ`(r4)
$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r11,r0
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
@@ -348,7 +348,7 @@ $data=<<EOF;
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r11,r11
@@ -363,31 +363,31 @@ $data=<<EOF;
adde r11,r8,r11
addze r9,r0
#sqr_add_c2(a,3,1,c2,c3,c1);
$LD r6,`3*$BNSZ`(r4)
$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r9,r9
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
$ST r10,`4*$BNSZ`(r3) #r[4]=c2
#sqr_add_c2(a,3,2,c3,c1,c2);
$LD r5,`2*$BNSZ`(r4)
$LD r5,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r7,r7,r7
adde r8,r8,r8
addze r10,r0
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$ST r11,`5*$BNSZ`(r3) #r[5] = c3
#sqr_add_c(a,3,c1,c2,c3);
$UMULL r7,r6,r6
$UMULL r7,r6,r6
$UMULH r8,r6,r6
addc r9,r7,r9
adde r10,r8,r10
@@ -406,7 +406,7 @@ $data=<<EOF;
# for the gcc compiler. This should be automatically
# done in the build
#
.align 4
.bn_sqr_comba8:
#
@@ -418,15 +418,15 @@ $data=<<EOF;
# r3 contains r
# r4 contains a
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
#
# r5,r6 are the two BN_ULONGs being multiplied.
# r7,r8 are the results of the 32x32 giving 64 bit multiply.
# r9,r10, r11 are the equivalents of c1,c2, c3.
#
# Possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
#
#
xor r0,r0,r0 #set r0 = 0.Used in addze
#instructions below.
@@ -439,18 +439,18 @@ $data=<<EOF;
#sqr_add_c2(a,1,0,c2,c3,c1);
$LD r6,`1*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
$UMULH r8,r5,r6
addc r10,r7,r10 #add the two register number
adde r11,r8,r0 # (r8,r7) to the three register
addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
addc r10,r7,r10 #add the two register number
adde r11,r8,r11 # (r8,r7) to the three register
addze r9,r9 # number (r9,r11,r10).
$ST r10,`1*$BNSZ`(r3) # r[1]=c2
#sqr_add_c(a,1,c3,c1,c2);
$UMULL r7,r6,r6
$UMULH r8,r6,r6
@@ -461,25 +461,25 @@ $data=<<EOF;
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
$ST r11,`2*$BNSZ`(r3) #r[2]=c3
#sqr_add_c2(a,3,0,c1,c2,c3);
$LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r9,r7,r9
adde r10,r8,r10
addze r11,r0
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
@@ -488,20 +488,20 @@ $data=<<EOF;
$LD r6,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
$ST r9,`3*$BNSZ`(r3) #r[3]=c1;
#sqr_add_c(a,2,c2,c3,c1);
$UMULL r7,r6,r6
$UMULH r8,r6,r6
addc r10,r7,r10
adde r11,r8,r11
addze r9,r0
@@ -509,11 +509,11 @@ $data=<<EOF;
$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
@@ -522,11 +522,11 @@ $data=<<EOF;
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
@@ -535,11 +535,11 @@ $data=<<EOF;
$LD r6,`5*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r11,r7,r11
adde r9,r8,r9
addze r10,r0
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
@@ -548,11 +548,11 @@ $data=<<EOF;
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
@@ -561,11 +561,11 @@ $data=<<EOF;
$LD r6,`3*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
@@ -580,11 +580,11 @@ $data=<<EOF;
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
@@ -593,11 +593,11 @@ $data=<<EOF;
$LD r6,`5*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
addc r9,r7,r9
adde r10,r8,r10
addze r11,r11
@@ -617,7 +617,7 @@ $data=<<EOF;
$LD r6,`7*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r10,r7,r10
adde r11,r8,r11
addze r9,r0
@@ -629,7 +629,7 @@ $data=<<EOF;
$LD r6,`6*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
@@ -652,7 +652,7 @@ $data=<<EOF;
$LD r6,`4*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r10,r7,r10
adde r11,r8,r11
addze r9,r9
@@ -684,7 +684,7 @@ $data=<<EOF;
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
addc r11,r7,r11
adde r9,r8,r9
addze r10,r10
@@ -704,7 +704,7 @@ $data=<<EOF;
$LD r5,`2*$BNSZ`(r4)
$UMULL r7,r5,r6
$UMULH r8,r5,r6
addc r9,r7,r9
adde r10,r8,r10
addze r11,r0
@@ -801,7 +801,7 @@ $data=<<EOF;
adde r10,r8,r10
addze r11,r11
$ST r9,`12*$BNSZ`(r3) #r[12]=c1;
#sqr_add_c2(a,7,6,c2,c3,c1)
$LD r5,`6*$BNSZ`(r4)
$UMULL r7,r5,r6
@@ -850,21 +850,21 @@ $data=<<EOF;
#
xor r0,r0,r0 #r0=0. Used in addze below.
#mul_add_c(a[0],b[0],c1,c2,c3);
$LD r6,`0*$BNSZ`(r4)
$LD r7,`0*$BNSZ`(r5)
$UMULL r10,r6,r7
$UMULH r11,r6,r7
$LD r6,`0*$BNSZ`(r4)
$LD r7,`0*$BNSZ`(r5)
$UMULL r10,r6,r7
$UMULH r11,r6,r7
$ST r10,`0*$BNSZ`(r3) #r[0]=c1
#mul_add_c(a[0],b[1],c2,c3,c1);
$LD r7,`1*$BNSZ`(r5)
$LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r11,r8,r11
adde r12,r9,r0
addze r10,r0
#mul_add_c(a[1],b[0],c2,c3,c1);
$LD r6, `1*$BNSZ`(r4)
$LD r7, `0*$BNSZ`(r5)
$LD r6, `1*$BNSZ`(r4)
$LD r7, `0*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r11,r8,r11
@@ -872,23 +872,23 @@ $data=<<EOF;
addze r10,r10
$ST r11,`1*$BNSZ`(r3) #r[1]=c2
#mul_add_c(a[2],b[0],c3,c1,c2);
$LD r6,`2*$BNSZ`(r4)
$LD r6,`2*$BNSZ`(r4)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
adde r10,r9,r10
addze r11,r0
#mul_add_c(a[1],b[1],c3,c1,c2);
$LD r6,`1*$BNSZ`(r4)
$LD r7,`1*$BNSZ`(r5)
$LD r6,`1*$BNSZ`(r4)
$LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
adde r10,r9,r10
addze r11,r11
#mul_add_c(a[0],b[2],c3,c1,c2);
$LD r6,`0*$BNSZ`(r4)
$LD r7,`2*$BNSZ`(r5)
$LD r6,`0*$BNSZ`(r4)
$LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
@@ -896,7 +896,7 @@ $data=<<EOF;
addze r11,r11
$ST r12,`2*$BNSZ`(r3) #r[2]=c3
#mul_add_c(a[0],b[3],c1,c2,c3);
$LD r7,`3*$BNSZ`(r5)
$LD r7,`3*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r10,r8,r10
@@ -928,7 +928,7 @@ $data=<<EOF;
addze r12,r12
$ST r10,`3*$BNSZ`(r3) #r[3]=c1
#mul_add_c(a[3],b[1],c2,c3,c1);
$LD r7,`1*$BNSZ`(r5)
$LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r11,r8,r11
@@ -952,7 +952,7 @@ $data=<<EOF;
addze r10,r10
$ST r11,`4*$BNSZ`(r3) #r[4]=c2
#mul_add_c(a[2],b[3],c3,c1,c2);
$LD r6,`2*$BNSZ`(r4)
$LD r6,`2*$BNSZ`(r4)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r12,r8,r12
@@ -968,7 +968,7 @@ $data=<<EOF;
addze r11,r11
$ST r12,`5*$BNSZ`(r3) #r[5]=c3
#mul_add_c(a[3],b[3],c1,c2,c3);
$LD r7,`3*$BNSZ`(r5)
$LD r7,`3*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
addc r10,r8,r10
@@ -988,7 +988,7 @@ $data=<<EOF;
# for the gcc compiler. This should be automatically
# done in the build
#
.align 4
.bn_mul_comba8:
#
@@ -1003,7 +1003,7 @@ $data=<<EOF;
# r10, r11, r12 are the equivalents of c1, c2, and c3.
#
xor r0,r0,r0 #r0=0. Used in addze below.
#mul_add_c(a[0],b[0],c1,c2,c3);
$LD r6,`0*$BNSZ`(r4) #a[0]
$LD r7,`0*$BNSZ`(r5) #b[0]
@@ -1065,7 +1065,7 @@ $data=<<EOF;
addc r10,r10,r8
adde r11,r11,r9
addze r12,r12
#mul_add_c(a[2],b[1],c1,c2,c3);
$LD r6,`2*$BNSZ`(r4)
$LD r7,`1*$BNSZ`(r5)
@@ -1131,7 +1131,7 @@ $data=<<EOF;
adde r10,r10,r9
addze r11,r0
#mul_add_c(a[1],b[4],c3,c1,c2);
$LD r6,`1*$BNSZ`(r4)
$LD r6,`1*$BNSZ`(r4)
$LD r7,`4*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
@@ -1139,7 +1139,7 @@ $data=<<EOF;
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[2],b[3],c3,c1,c2);
$LD r6,`2*$BNSZ`(r4)
$LD r6,`2*$BNSZ`(r4)
$LD r7,`3*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
@@ -1147,7 +1147,7 @@ $data=<<EOF;
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[3],b[2],c3,c1,c2);
$LD r6,`3*$BNSZ`(r4)
$LD r6,`3*$BNSZ`(r4)
$LD r7,`2*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
@@ -1155,7 +1155,7 @@ $data=<<EOF;
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[4],b[1],c3,c1,c2);
$LD r6,`4*$BNSZ`(r4)
$LD r6,`4*$BNSZ`(r4)
$LD r7,`1*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
@@ -1163,7 +1163,7 @@ $data=<<EOF;
adde r10,r10,r9
addze r11,r11
#mul_add_c(a[5],b[0],c3,c1,c2);
$LD r6,`5*$BNSZ`(r4)
$LD r6,`5*$BNSZ`(r4)
$LD r7,`0*$BNSZ`(r5)
$UMULL r8,r6,r7
$UMULH r9,r6,r7
@@ -1555,7 +1555,7 @@ $data=<<EOF;
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
mtctr r6
Lppcasm_sub_mainloop:
Lppcasm_sub_mainloop:
$LDU r7,$BNSZ(r4)
$LDU r8,$BNSZ(r5)
subfe r6,r8,r7 # r6 = r7+carry bit + onescomplement(r8)
@@ -1563,7 +1563,7 @@ Lppcasm_sub_mainloop:
# is r7-r8 -1 as we need.
$STU r6,$BNSZ(r3)
bdnz Lppcasm_sub_mainloop
Lppcasm_sub_adios:
Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
@@ -1604,13 +1604,13 @@ Lppcasm_sub_adios:
addi r3,r3,-$BNSZ
addi r5,r5,-$BNSZ
mtctr r6
Lppcasm_add_mainloop:
Lppcasm_add_mainloop:
$LDU r7,$BNSZ(r4)
$LDU r8,$BNSZ(r5)
adde r8,r7,r8
$STU r8,$BNSZ(r3)
bdnz Lppcasm_add_mainloop
Lppcasm_add_adios:
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
.long 0
@@ -1633,11 +1633,11 @@ Lppcasm_add_adios:
# the PPC instruction to count leading zeros instead
# of call to num_bits_word. Since this was compiled
# only at level -O2 we can possibly squeeze it more?
#
#
# r3 = h
# r4 = l
# r5 = d
$UCMPI 0,r5,0 # compare r5 and 0
bne Lppcasm_div1 # proceed if d!=0
li r3,-1 # d=0 return -1
@@ -1653,7 +1653,7 @@ Lppcasm_div1:
Lppcasm_div2:
$UCMP 0,r3,r5 #h>=d?
blt Lppcasm_div3 #goto Lppcasm_div3 if not
subf r3,r5,r3 #h-=d ;
subf r3,r5,r3 #h-=d ;
Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
cmpi 0,0,r7,0 # is (i == 0)?
beq Lppcasm_div4
@@ -1668,7 +1668,7 @@ Lppcasm_div4:
# as it saves registers.
li r6,2 #r6=2
mtctr r6 #counter will be in count.
Lppcasm_divouterloop:
Lppcasm_divouterloop:
$SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
$SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
# compute here for innerloop.
@@ -1676,7 +1676,7 @@ Lppcasm_divouterloop:
bne Lppcasm_div5 # goto Lppcasm_div5 if not
li r8,-1
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
$CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
b Lppcasm_div6
Lppcasm_div5:
$UDIV r8,r3,r9 #q = h/dh
@@ -1684,7 +1684,7 @@ Lppcasm_div6:
$UMULL r12,r9,r8 #th = q*dh
$CLRU r10,r5,`$BITS/2` #r10=dl
$UMULL r6,r8,r10 #tl = q*dl
Lppcasm_divinnerloop:
subf r10,r12,r3 #t = h -th
$SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
@@ -1761,7 +1761,7 @@ Lppcasm_div9:
addi r4,r4,-$BNSZ
addi r3,r3,-$BNSZ
mtctr r5
Lppcasm_sqr_mainloop:
Lppcasm_sqr_mainloop:
#sqr(r[0],r[1],a[0]);
$LDU r6,$BNSZ(r4)
$UMULL r7,r6,r6
@@ -1769,7 +1769,7 @@ Lppcasm_sqr_mainloop:
$STU r7,$BNSZ(r3)
$STU r8,$BNSZ(r3)
bdnz Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
Lppcasm_sqr_adios:
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
@@ -1783,7 +1783,7 @@ Lppcasm_sqr_adios:
# done in the build
#
.align 4
.align 4
.bn_mul_words:
#
# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
@@ -1797,7 +1797,7 @@ Lppcasm_sqr_adios:
rlwinm. r7,r5,30,2,31 # num >> 2
beq Lppcasm_mw_REM
mtctr r7
Lppcasm_mw_LOOP:
Lppcasm_mw_LOOP:
#mul(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$UMULL r9,r6,r8
@@ -1809,7 +1809,7 @@ Lppcasm_mw_LOOP:
#using adde.
$ST r9,`0*$BNSZ`(r3)
#mul(rp[1],ap[1],w,c1);
$LD r8,`1*$BNSZ`(r4)
$LD r8,`1*$BNSZ`(r4)
$UMULL r11,r6,r8
$UMULH r12,r6,r8
adde r11,r11,r10
@@ -1830,7 +1830,7 @@ Lppcasm_mw_LOOP:
addze r12,r12 #this spin we collect carry into
#r12
$ST r11,`3*$BNSZ`(r3)
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
bdnz Lppcasm_mw_LOOP
@@ -1846,25 +1846,25 @@ Lppcasm_mw_REM:
addze r10,r10
$ST r9,`0*$BNSZ`(r3)
addi r12,r10,0
addi r5,r5,-1
cmpli 0,0,r5,0
beq Lppcasm_mw_OVER
#mul(rp[1],ap[1],w,c1);
$LD r8,`1*$BNSZ`(r4)
$LD r8,`1*$BNSZ`(r4)
$UMULL r9,r6,r8
$UMULH r10,r6,r8
addc r9,r9,r12
addze r10,r10
$ST r9,`1*$BNSZ`(r3)
addi r12,r10,0
addi r5,r5,-1
cmpli 0,0,r5,0
beq Lppcasm_mw_OVER
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
$UMULL r9,r6,r8
@@ -1873,8 +1873,8 @@ Lppcasm_mw_REM:
addze r10,r10
$ST r9,`2*$BNSZ`(r3)
addi r12,r10,0
Lppcasm_mw_OVER:
Lppcasm_mw_OVER:
addi r3,r12,0
blr
.long 0
@@ -1902,11 +1902,11 @@ Lppcasm_mw_OVER:
# empirical evidence suggests that unrolled version performs best!!
#
xor r0,r0,r0 #r0 = 0
xor r12,r12,r12 #r12 = 0 . used for carry
xor r12,r12,r12 #r12 = 0 . used for carry
rlwinm. r7,r5,30,2,31 # num >> 2
beq Lppcasm_maw_leftover # if (num < 4) go LPPCASM_maw_leftover
mtctr r7
Lppcasm_maw_mainloop:
Lppcasm_maw_mainloop:
#mul_add(rp[0],ap[0],w,c1);
$LD r8,`0*$BNSZ`(r4)
$LD r11,`0*$BNSZ`(r3)
@@ -1922,9 +1922,9 @@ Lppcasm_maw_mainloop:
#by multiply and will be collected
#in the next spin
$ST r9,`0*$BNSZ`(r3)
#mul_add(rp[1],ap[1],w,c1);
$LD r8,`1*$BNSZ`(r4)
$LD r8,`1*$BNSZ`(r4)
$LD r9,`1*$BNSZ`(r3)
$UMULL r11,r6,r8
$UMULH r12,r6,r8
@@ -1933,7 +1933,7 @@ Lppcasm_maw_mainloop:
addc r11,r11,r9
#addze r12,r12
$ST r11,`1*$BNSZ`(r3)
#mul_add(rp[2],ap[2],w,c1);
$LD r8,`2*$BNSZ`(r4)
$UMULL r9,r6,r8
@@ -1944,7 +1944,7 @@ Lppcasm_maw_mainloop:
addc r9,r9,r11
#addze r10,r10
$ST r9,`2*$BNSZ`(r3)
#mul_add(rp[3],ap[3],w,c1);
$LD r8,`3*$BNSZ`(r4)
$UMULL r11,r6,r8
@@ -1958,7 +1958,7 @@ Lppcasm_maw_mainloop:
addi r3,r3,`4*$BNSZ`
addi r4,r4,`4*$BNSZ`
bdnz Lppcasm_maw_mainloop
Lppcasm_maw_leftover:
andi. r5,r5,0x3
beq Lppcasm_maw_adios
@@ -1975,10 +1975,10 @@ Lppcasm_maw_leftover:
addc r9,r9,r12
addze r12,r10
$ST r9,0(r3)
bdz Lppcasm_maw_adios
#mul_add(rp[1],ap[1],w,c1);
$LDU r8,$BNSZ(r4)
$LDU r8,$BNSZ(r4)
$UMULL r9,r6,r8
$UMULH r10,r6,r8
$LDU r11,$BNSZ(r3)
@@ -1987,7 +1987,7 @@ Lppcasm_maw_leftover:
addc r9,r9,r12
addze r12,r10
$ST r9,0(r3)
bdz Lppcasm_maw_adios
#mul_add(rp[2],ap[2],w,c1);
$LDU r8,$BNSZ(r4)
@@ -1999,8 +1999,8 @@ Lppcasm_maw_leftover:
addc r9,r9,r12
addze r12,r10
$ST r9,0(r3)
Lppcasm_maw_adios:
Lppcasm_maw_adios:
addi r3,r12,0
blr
.long 0


@@ -382,7 +382,7 @@ $code.=<<___;
vpaddq $TEMP1, $ACC1, $ACC1
vpmuludq 32*7-128($aap), $B2, $ACC2
vpbroadcastq 32*5-128($tpa), $B2
vpaddq 32*11-448($tp1), $ACC2, $ACC2
vpaddq 32*11-448($tp1), $ACC2, $ACC2
vmovdqu $ACC6, 32*6-192($tp0)
vmovdqu $ACC7, 32*7-192($tp0)
@@ -441,7 +441,7 @@ $code.=<<___;
vmovdqu $ACC7, 32*16-448($tp1)
lea 8($tp1), $tp1
dec $i
dec $i
jnz .LOOP_SQR_1024
___
$ZERO = $ACC9;
@@ -786,7 +786,7 @@ $code.=<<___;
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
@@ -1451,7 +1451,7 @@ $code.=<<___;
vpaddq $TEMP4, $ACC8, $ACC8
vmovdqu $ACC4, 128-128($rp)
vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC6, 192-128($rp)
vmovdqu $ACC7, 224-128($rp)
vmovdqu $ACC8, 256-128($rp)


@@ -282,9 +282,9 @@ $code.=<<___;
movq %r9, 16(%rsp)
movq %r10, 24(%rsp)
shrq \$63, %rbx
#third iteration
movq 16($inp), %r9
movq 16($inp), %r9
movq 24($inp), %rax
mulq %r9
addq %rax, %r12
@@ -532,7 +532,7 @@ $code.=<<___;
movl $times,128+8(%rsp)
movq $out, %xmm0 # off-load
movq %rbp, %xmm1 # off-load
#first iteration
#first iteration
mulx %rax, %r8, %r9
mulx 16($inp), %rcx, %r10
@@ -568,7 +568,7 @@ $code.=<<___;
mov %rax, (%rsp)
mov %r8, 8(%rsp)
#second iteration
#second iteration
mulx 16($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
@@ -607,8 +607,8 @@ $code.=<<___;
mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
#third iteration
#third iteration
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
adox $out, %r12
adcx %r9, %r13
@@ -643,8 +643,8 @@ $code.=<<___;
mov %r11, 32(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
#fourth iteration
#fourth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
@@ -676,8 +676,8 @@ $code.=<<___;
mov %r13, 48(%rsp)
mov %r14, 56(%rsp)
#fifth iteration
#fifth iteration
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
adox $out, %r8
adcx %r11, %r9
@@ -704,8 +704,8 @@ $code.=<<___;
mov %r15, 64(%rsp)
mov %r8, 72(%rsp)
#sixth iteration
#sixth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
@@ -1048,7 +1048,7 @@ $code.=<<___;
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
@@ -1150,7 +1150,7 @@ $code.=<<___;
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
@@ -1212,7 +1212,7 @@ $code.=<<___ if ($addx);
mulx 48($ap), %rbx, %r14
adcx %rax, %r12
mulx 56($ap), %rax, %r15
adcx %rbx, %r13
adcx %rax, %r14
@@ -1411,7 +1411,7 @@ $code.=<<___;
___
$code.=<<___ if ($addx);
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq ($out), %rdx # pass b[0]
@@ -1824,7 +1824,7 @@ __rsaz_512_mul:
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
@@ -1901,7 +1901,7 @@ __rsaz_512_mul:
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi


@@ -198,7 +198,7 @@ $code.=<<___;
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)


@@ -76,7 +76,7 @@
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
# Conclusions:
# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on


@@ -39,7 +39,7 @@ require "x86asm.pl";
$output = pop;
open STDOUT,">$output";
&asm_init($ARGV[0],$0);
$sse2=0;


@@ -1049,7 +1049,7 @@ my $bptr="%rdx"; # const void *table,
my $nptr="%rcx"; # const BN_ULONG *nptr,
my $n0 ="%r8"; # const BN_ULONG *n0);
my $num ="%r9"; # int num, has to be divisible by 8
# int pwr
# int pwr
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
@@ -1126,7 +1126,7 @@ $code.=<<___;
ja .Lpwr_page_walk
.Lpwr_page_walk_done:
mov $num,%r10
mov $num,%r10
neg $num
##############################################################
@@ -2036,7 +2036,7 @@ __bn_post4x_internal:
jnz .Lsqr4x_sub
mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num
neg $num # restore $num
ret
.size __bn_post4x_internal,.-__bn_post4x_internal
___
@@ -2259,7 +2259,7 @@ bn_mulx4x_mont_gather5:
mov \$0,%r10
cmovc %r10,%r11
sub %r11,%rbp
.Lmulx4xsp_done:
.Lmulx4xsp_done:
and \$-64,%rbp # ensure alignment
mov %rsp,%r11
sub %rbp,%r11
@@ -2741,7 +2741,7 @@ bn_powerx5:
ja .Lpwrx_page_walk
.Lpwrx_page_walk_done:
mov $num,%r10
mov $num,%r10
neg $num
##############################################################


@@ -792,9 +792,9 @@ if ($OPENSSL) {
64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
&set_label("Camellia_SIGMA",64);
&data_word(


@@ -7,7 +7,7 @@
# https://www.openssl.org/source/license.html
# This flag makes the inner loop one cycle longer, but generates
# This flag makes the inner loop one cycle longer, but generates
# code that runs %30 faster on the pentium pro/II, 44% faster
# of PIII, while only %7 slower on the pentium.
# By default, this flag is on.
@@ -157,7 +157,7 @@ sub E_CAST {
if ($ppro) {
&xor( $tmp1, $tmp1);
&mov( $tmp2, 0xff);
&movb( &LB($tmp1), &HB($tmp4)); # A
&and( $tmp2, $tmp4);
@@ -166,7 +166,7 @@ sub E_CAST {
} else {
&mov( $tmp2, $tmp4); # B
&movb( &LB($tmp1), &HB($tmp4)); # A # BAD BAD BAD
&shr( $tmp4, 16); #
&and( $tmp2, 0xff);
}


@@ -15,7 +15,7 @@
# ====================================================================
#
# December 2014
#
#
# ChaCha20 for ARMv4.
#
# Performance in cycles per byte out of large buffer.
@@ -720,7 +720,7 @@ ChaCha20_neon:
vadd.i32 $d2,$d1,$t0 @ counter+2
str @t[3], [sp,#4*(16+15)]
mov @t[3],#10
add @x[12],@x[12],#3 @ counter+3
add @x[12],@x[12],#3 @ counter+3
b .Loop_neon
.align 4


@@ -15,7 +15,7 @@
# ====================================================================
#
# June 2015
#
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of large buffer.
@@ -201,7 +201,7 @@ ChaCha20_ctr32:
mov $ctr,#10
subs $len,$len,#64
.Loop:
sub $ctr,$ctr,#1
sub $ctr,$ctr,#1
___
foreach (&ROUND(0, 4, 8,12)) { eval; }
foreach (&ROUND(0, 5,10,15)) { eval; }


@@ -15,7 +15,7 @@
# ====================================================================
#
# October 2015
#
#
# ChaCha20 for PowerPC/AltiVec.
#
# Performance in cycles per byte out of large buffer.
@@ -524,7 +524,7 @@ $code.=<<___;
lwz @d[3],12($ctr)
vadduwm @K[5],@K[4],@K[5]
vspltisw $twenty,-12 # synthesize constants
vspltisw $twenty,-12 # synthesize constants
vspltisw $twelve,12
vspltisw $twenty5,-7
#vspltisw $seven,7 # synthesized in the loop


@@ -111,7 +111,7 @@ sub D_ENCRYPT
&and( $u, "0xfcfcfcfc" ); # 2
&xor( $tmp1, $tmp1); # 1
&and( $t, "0xcfcfcfcf" ); # 2
&xor( $tmp2, $tmp2);
&xor( $tmp2, $tmp2);
&movb( &LB($tmp1), &LB($u) );
&movb( &LB($tmp2), &HB($u) );
&rotr( $t, 4 );
@@ -175,7 +175,7 @@ sub IP_new
&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
if ($lr != 3)
{
if (($lr-3) < 0)


@@ -85,7 +85,7 @@ sub DES_encrypt_internal()
&function_end_B("_x86_DES_encrypt");
}
sub DES_decrypt_internal()
{
&function_begin_B("_x86_DES_decrypt");
@@ -122,7 +122,7 @@ sub DES_decrypt_internal()
&function_end_B("_x86_DES_decrypt");
}
sub DES_encrypt
{
local($name,$do_ip)=@_;
@@ -283,7 +283,7 @@ sub IP_new
&R_PERM_OP($l,$tt,$r,14,"0x33333333",$r);
&R_PERM_OP($tt,$r,$l,22,"0x03fc03fc",$r);
&R_PERM_OP($l,$r,$tt, 9,"0xaaaaaaaa",$r);
if ($lr != 3)
{
if (($lr-3) < 0)


@@ -34,7 +34,7 @@ sub DES_encrypt3
&IP_new($L,$R,"edx",0);
# put them back
if ($enc)
{
&mov(&DWP(4,"ebx","",0),$R);


@@ -660,7 +660,7 @@ __ecp_nistz256_div_by_2:
adc $ap,xzr,xzr // zap $ap
tst $acc0,#1 // is a even?
csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus
csel $acc1,$acc1,$t1,eq
csel $acc2,$acc2,$t2,eq
csel $acc3,$acc3,$t3,eq


@@ -1874,7 +1874,7 @@ $code.=<<___ if ($i<3);
ldx [$bp+8*($i+1)],$bi ! bp[$i+1]
___
$code.=<<___;
addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
addcc $acc1,$t0,$acc1 ! accumulate high parts of multiplication
sllx $acc0,32,$t0
addxccc $acc2,$t1,$acc2
srlx $acc0,32,$t1


@@ -443,7 +443,7 @@ for(1..37) {
&mov (&DWP(20,"esp"),"eax");
&mov (&DWP(24,"esp"),"eax");
&mov (&DWP(28,"esp"),"eax");
&call ("_ecp_nistz256_sub");
&stack_pop(8);


@@ -611,7 +611,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc0
########################################################################
# Second reduction step
# Second reduction step
mov $acc1, $t1
shl \$32, $acc1
mulq $poly3
@@ -658,7 +658,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc1
########################################################################
# Third reduction step
# Third reduction step
mov $acc2, $t1
shl \$32, $acc2
mulq $poly3
@@ -705,7 +705,7 @@ __ecp_nistz256_mul_montq:
adc \$0, $acc2
########################################################################
# Final reduction step
# Final reduction step
mov $acc3, $t1
shl \$32, $acc3
mulq $poly3
@@ -718,7 +718,7 @@ __ecp_nistz256_mul_montq:
mov $acc5, $t1
adc \$0, $acc2
########################################################################
########################################################################
# Branch-less conditional subtraction of P
sub \$-1, $acc4 # .Lpoly[0]
mov $acc0, $t2
@@ -2118,7 +2118,7 @@ $code.=<<___;
movq %xmm1, $r_ptr
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
___
{
{
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
# operate in 4-5-6-7 "name space" that matches squaring output
#
@@ -2207,7 +2207,7 @@ $code.=<<___;
lea $M(%rsp), $b_ptr
mov $acc4, $acc6 # harmonize sub output and mul input
xor %ecx, %ecx
mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc4, $S+8*0(%rsp) # have to save:-(
mov $acc5, $acc2
mov $acc5, $S+8*1(%rsp)
cmovz $acc0, $acc3
@@ -3055,8 +3055,8 @@ ___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
open TABLE,"<ecp_nistz256_table.c" or
open TABLE,"<${dir}../ecp_nistz256_table.c" or
die "failed to open ecp_nistz256_table.c:",$!;
use integer;


@@ -57,7 +57,7 @@ sub R0
local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
&mov($tmp1,$C) if $pos < 0;
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
# body proper


@@ -242,7 +242,7 @@ md5_block_asm_data_order:
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20


@@ -15,7 +15,7 @@
&& !defined(_MIPS_ARCH_MIPS32R2)
# define _MIPS_ARCH_MIPS32R2
# endif
# if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \
defined(_MIPS_ARCH_MIPS64R6)) \
&& !defined(_MIPS_ARCH_MIPS64R2)


@@ -54,7 +54,7 @@
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
# ====================================================================
@@ -528,7 +528,7 @@ $code.=<<___;
#ifdef __ARMEL__
vrev64.8 $Xl,$Xl
#endif
sub $Xi,#16
sub $Xi,#16
vst1.64 $Xl#hi,[$Xi]! @ write out Xi
vst1.64 $Xl#lo,[$Xi]


@@ -158,7 +158,7 @@ $code.=<<___;
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
xg $Zhi,0($inp) # Xi ^= inp
xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)


@@ -811,7 +811,7 @@ sub mmx_loop() {
&bswap ($dat);
&pshufw ($Zhi,$Zhi,0b00011011); # 76543210
&bswap ("ebx");
&cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
&jne (&label("outer"));
}
@@ -915,7 +915,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
@@ -1085,7 +1085,7 @@ my ($Xhi,$Xi) = @_;
&psllq ($Xi,57); #
&movdqa ($T1,$Xi); #
&pslldq ($Xi,8);
&psrldq ($T1,8); #
&psrldq ($T1,8); #
&pxor ($Xi,$T2);
&pxor ($Xhi,$T1); #
&pshufd ($T1,$Xhn,0b01001110);


@@ -468,7 +468,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
psrldq \$8,$T1 #
psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
@@ -582,7 +582,7 @@ ___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
# no dependency between the two multiplications...
# no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
@@ -757,7 +757,7 @@ $code.=<<___;
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
psrldq \$8,$T1 #
psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
@@ -893,7 +893,7 @@ $code.=<<___;
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
psrldq \$8,$T1 #
psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #


@@ -15,7 +15,7 @@
# des_cblock (*ivec);
# int enc;
#
# calls
# calls
# des_encrypt((DES_LONG *)tin,schedule,DES_ENCRYPT);
#
@@ -36,7 +36,7 @@ sub cbc
# name is the function name
# enc_func and dec_func and the functions to call for encrypt/decrypt
# swap is true if byte order needs to be reversed
# iv_off is parameter number for the iv
# iv_off is parameter number for the iv
# enc_off is parameter number for the encrypt/decrypt flag
# p1,p2,p3 are the offsets for parameters to be passed to the
# underlying calls.
@@ -114,7 +114,7 @@ sub cbc
#############################################################
&set_label("encrypt_loop");
# encrypt start
# encrypt start
# "eax" and "ebx" hold iv (or the last cipher text)
&mov("ecx", &DWP(0,$in,"",0)); # load first 4 bytes
@@ -208,7 +208,7 @@ sub cbc
#############################################################
#############################################################
&set_label("decrypt",1);
# decrypt start
# decrypt start
&and($count,0xfffffff8);
# The next 2 instructions are only for if the jz is taken
&mov("eax", &DWP($data_off+8,"esp","",0)); # get iv[0]
@ -350,7 +350,7 @@ sub cbc
&align(64);
&function_end_B($name);
}
1;


@ -36,7 +36,7 @@ my $globl = sub {
my $ret;
$name =~ s|^\.||;
SWITCH: for ($flavour) {
/aix/ && do { if (!$$type) {
$$type = "\@function";


@ -117,7 +117,7 @@ $::code.=<<___;
brnz,pn $ooff, 2f
sub $len, 1, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_cbc_enc_loop
@ -224,7 +224,7 @@ $::code.=<<___;
call _${alg}${bits}_encrypt_1x
add $inp, 16, $inp
sub $len, 1, $len
stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
add $out, 8, $out
stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
@ -339,7 +339,7 @@ $::code.=<<___;
brnz,pn $ooff, 2f
sub $len, 1, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_cbc_dec_loop2x
@ -445,7 +445,7 @@ $::code.=<<___;
brnz,pn $ooff, 2f
sub $len, 2, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]
@ -702,7 +702,7 @@ $::code.=<<___;
brnz,pn $ooff, 2f
sub $len, 1, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_ctr32_loop2x
@ -791,7 +791,7 @@ $::code.=<<___;
brnz,pn $ooff, 2f
sub $len, 2, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]
@ -1024,7 +1024,7 @@ $code.=<<___;
brnz,pn $ooff, 2f
sub $len, 1, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
brnz,pt $len, .L${bits}_xts_${dir}loop2x
@ -1135,7 +1135,7 @@ $code.=<<___;
brnz,pn $ooff, 2f
sub $len, 2, $len
std %f0, [$out + 0]
std %f2, [$out + 8]
std %f4, [$out + 16]


@ -151,7 +151,7 @@ my %globals;
if ($gas) {
if ($self->{op} eq "movz") { # movz is pain...
sprintf "%s%s%s",$self->{op},$self->{sz},shift;
} elsif ($self->{op} =~ /^set/) {
} elsif ($self->{op} =~ /^set/) {
"$self->{op}";
} elsif ($self->{op} eq "ret") {
my $epilogue = "";
@ -178,7 +178,7 @@ my %globals;
$self->{op} .= $self->{sz};
} elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
$self->{op} = "\tDQ";
}
}
$self->{op};
}
}
@ -639,7 +639,7 @@ my %globals;
if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
{ $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
$var;
};
};
$sz =~ tr/bvlrq/BWDDQ/;
$self->{value} = "\tD$sz\t";
@ -649,7 +649,7 @@ my %globals;
};
/\.byte/ && do { my @str=split(/,\s*/,$$line);
map(s/(0b[0-1]+)/oct($1)/eig,@str);
map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
while ($#str>15) {
$self->{value}.="DB\t"
.join(",",@str[0..15])."\n";
@ -896,7 +896,7 @@ while(defined(my $line=<>)) {
printf "%s",$directive->out();
} elsif (my $opcode=opcode->re(\$line)) {
my $asm = eval("\$".$opcode->mnemonic());
if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
next;
@ -974,7 +974,7 @@ close STDOUT;
# %r13 - -
# %r14 - -
# %r15 - -
#
#
# (*) volatile register
# (-) preserved by callee
# (#) Nth argument, volatile


@ -132,7 +132,7 @@ ___
grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
push (@out,$comm)
}
push (@out,$initseg) if ($initseg);
push (@out,$initseg) if ($initseg);
}
sub ::comment { foreach (@_) { push(@out,"\t; $_\n"); } }


@ -89,7 +89,7 @@ _RC4:
|| NOP 5
STB $XX,*${KEYA}[-2] ; key->x
|| SUB4 $YY,$TX,$YY
|| BNOP B3
|| BNOP B3
STB $YY,*${KEYB}[-1] ; key->y
|| NOP 5
.endasmfunc


@ -51,7 +51,7 @@ my ($rc4,$md5)=(1,1); # what to generate?
my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
# but its result is discarded. Idea here is
# to be able to use 'openssl speed rc4' for
# benchmarking the stitched subroutine...
# benchmarking the stitched subroutine...
my $flavour = shift;
my $output = shift;
@ -419,7 +419,7 @@ $code.=<<___ if ($rc4 && (!$md5 || $D));
and \$63,$len # remaining bytes
jnz .Loop1
jmp .Ldone
.align 16
.Loop1:
add $TX[0]#b,$YY#b


@ -98,7 +98,7 @@ sub unrolledloopbody {
for ($i=0;$i<4;$i++) {
$code.=<<___;
ldo 1($XX[0]),$XX[1]
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
and $mask,$XX[1],$XX[1]
$LDX $YY($key),$TY
$MKX $YY,$key,$ix
@ -166,7 +166,7 @@ RC4
ldo `2*$SZ`($key),$key
ldi 0xff,$mask
ldi 3,$dat0
ldi 3,$dat0
ldo 1($XX[0]),$XX[0] ; warm up loop
and $mask,$XX[0],$XX[0]


@ -48,7 +48,7 @@
# April 2005
#
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...


@ -34,7 +34,7 @@ $KL2=0x6ED9EBA1;
$KL3=0x8F1BBCDC;
$KL4=0xA953FD4E;
$KR0=0x50A28BE6;
$KR1=0x5C4DD124;
$KR1=0x5C4DD124;
$KR2=0x6D703EF3;
$KR3=0x7A6D76E9;
@ -543,28 +543,28 @@ sub ripemd160_block
# &mov($tmp2, &wparam(0)); # Moved into last round
&mov($tmp1, &DWP( 4,$tmp2,"",0)); # ctx->B
&add($D, $tmp1);
&add($D, $tmp1);
&mov($tmp1, &swtmp(16+2)); # $c
&add($D, $tmp1);
&mov($tmp1, &DWP( 8,$tmp2,"",0)); # ctx->C
&add($E, $tmp1);
&add($E, $tmp1);
&mov($tmp1, &swtmp(16+3)); # $d
&add($E, $tmp1);
&mov($tmp1, &DWP(12,$tmp2,"",0)); # ctx->D
&add($A, $tmp1);
&add($A, $tmp1);
&mov($tmp1, &swtmp(16+4)); # $e
&add($A, $tmp1);
&mov($tmp1, &DWP(16,$tmp2,"",0)); # ctx->E
&add($B, $tmp1);
&add($B, $tmp1);
&mov($tmp1, &swtmp(16+0)); # $a
&add($B, $tmp1);
&mov($tmp1, &DWP( 0,$tmp2,"",0)); # ctx->A
&add($C, $tmp1);
&add($C, $tmp1);
&mov($tmp1, &swtmp(16+1)); # $b
&add($C, $tmp1);


@ -133,7 +133,7 @@ $ymm=1 if ($xmm &&
=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
$1>=2.19); # first version supporting AVX
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" &&
`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
$1>=2.03); # first version supporting AVX


@ -95,7 +95,7 @@ $K="%xmm15";
if (1) {
# Atom-specific optimization aiming to eliminate pshufb with high
# registers [and thus get rid of 48 cycles accumulated penalty]
# registers [and thus get rid of 48 cycles accumulated penalty]
@Xi=map("%xmm$_",(0..4));
($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
@V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
@ -126,7 +126,7 @@ my $k=$i+2;
# ...
# $i==13: 14,15,15,15,
# $i==14: 15
#
#
# Then at $i==15 Xupdate is applied one iteration in advance...
$code.=<<___ if ($i==0);
movd (@ptr[0]),@Xi[0]


@ -227,7 +227,7 @@ sha1_block_data_order:
ldd [%o1 + 0x20], %f16
ldd [%o1 + 0x28], %f18
ldd [%o1 + 0x30], %f20
subcc %o2, 1, %o2 ! done yet?
subcc %o2, 1, %o2 ! done yet?
ldd [%o1 + 0x38], %f22
add %o1, 0x40, %o1
prefetch [%o1 + 63], 20


@ -519,7 +519,7 @@ $code.=<<___;
mov $Cctx,$C
mov $Dctx,$D
mov $Ectx,$E
alignaddr %g0,$tmp0,%g0
alignaddr %g0,$tmp0,%g0
dec 1,$len
ba .Loop
mov $nXfer,$Xfer


@ -262,7 +262,7 @@ sha1_block_data_order:
jz .Lialu
___
$code.=<<___ if ($shaext);
test \$`1<<29`,%r10d # check SHA bit
test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);


@ -47,7 +47,7 @@
#
# Performance in clock cycles per processed byte (less is better):
#
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# gcc icc x86 asm(*) SIMD x86_64 asm(**)
# Pentium 46 57 40/38 - -
# PIII 36 33 27/24 - -
# P4 41 38 28 - 17.3
@ -276,7 +276,7 @@ my $suffix=shift;
&mov ($Coff,"ecx");
&mov ($Doff,"edi");
&mov (&DWP(0,"esp"),"ebx"); # magic
&mov ($E,&DWP(16,"esi"));
&mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("edi",&DWP(28,"esi"));
@ -385,7 +385,7 @@ my @AH=($A,$K256);
&xor ($AH[1],"ecx"); # magic
&mov (&DWP(8,"esp"),"ecx");
&mov (&DWP(12,"esp"),"ebx");
&mov ($E,&DWP(16,"esi"));
&mov ($E,&DWP(16,"esi"));
&mov ("ebx",&DWP(20,"esi"));
&mov ("ecx",&DWP(24,"esi"));
&mov ("esi",&DWP(28,"esi"));


@ -36,7 +36,7 @@
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
# (iv) presented improvement coefficients are asymptotic limits and
# in real-life application are somewhat lower, e.g. for 2KB
# in real-life application are somewhat lower, e.g. for 2KB
# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;


@ -383,7 +383,7 @@ if ($sse2) {
&set_label("16_79_sse2",16);
for ($j=0;$j<2;$j++) { # 2x unroll
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
#&movq ("mm7",&QWP(8*(9+16-1),"esp")); # prefetched in BODY_00_15
&movq ("mm5",&QWP(8*(9+16-14),"esp"));
&movq ("mm1","mm7");
&psrlq ("mm7",1);


@ -26,7 +26,7 @@
# Denver 2.01 10.5 (+26%) 6.70 (+8%)
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
#
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by


@ -368,7 +368,7 @@ L\$parisc1
___
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
@ -419,7 +419,7 @@ $code.=<<___;
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
shd $alo,$ahi,$Sigma0[0],$t1
shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)


@ -311,7 +311,7 @@ $code.=<<___;
cl${g} $inp,`$frame+4*$SIZE_T`($sp)
jne .Lloop
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
br %r14
.size $Func,.-$Func
.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"


@ -102,7 +102,7 @@ if ($output =~ /512/) {
$locals=0; # X[16] is register resident
@X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
$A="%l0";
$B="%l1";
$C="%l2";
@ -254,7 +254,7 @@ $code.=<<___;
$SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
xor $tmp0,$h,$h
$SRL $a,@Sigma0[2],$tmp0
xor $tmp1,$h,$h
xor $tmp1,$h,$h
$SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
xor $tmp0,$h,$h
xor $tmp1,$h,$h ! Sigma0(a)


@ -1782,7 +1782,7 @@ if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;


@ -480,7 +480,7 @@ static char *ts_get_status_text(STACK_OF(ASN1_UTF8STRING) *text)
return result;
}
static int ts_check_policy(const ASN1_OBJECT *req_oid,
static int ts_check_policy(const ASN1_OBJECT *req_oid,
const TS_TST_INFO *tst_info)
{
const ASN1_OBJECT *resp_oid = tst_info->policy_id;
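
For context: ts_check_policy() compares the policy OID from the request
against the one in the returned token. A minimal sketch of how such a check
can proceed, assuming OBJ_cmp() semantics (returns 0 on equality); this is an
illustration, not necessarily the verbatim OpenSSL body:

    /* Sketch: reject the token if the policy OIDs differ. */
    if (OBJ_cmp(req_oid, resp_oid) != 0)
        return 0;               /* policy mismatch */
    return 1;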


@ -31,7 +31,7 @@
# multiplying 64 by CPU clock frequency and dividing by relevant
# value from the given table:
#
# $SCALE=2/8 icc8 gcc3
# $SCALE=2/8 icc8 gcc3
# Intel P4 3200/4600 4600(*) 6400
# Intel PIII 2900/3000 4900 5400
# AMD K[78] 2500/1800 9900 8200(**)
@ -502,6 +502,6 @@ for($i=0;$i<8;$i++) {
&L(0xca,0x2d,0xbf,0x07,0xad,0x5a,0x83,0x33);
&function_end_B("whirlpool_block_mmx");
&asm_finish();
&asm_finish();
close STDOUT;


@ -38,7 +38,7 @@ const X509V3_EXT_METHOD v3_crl_reason = {
crl_reasons
};
char *i2s_ASN1_ENUMERATED_TABLE(X509V3_EXT_METHOD *method,
char *i2s_ASN1_ENUMERATED_TABLE(X509V3_EXT_METHOD *method,
const ASN1_ENUMERATED *e)
{
ENUMERATED_NAMES *enam;


@ -24,7 +24,7 @@ const X509V3_EXT_METHOD v3_skey_id = {
NULL
};
char *i2s_ASN1_OCTET_STRING(X509V3_EXT_METHOD *method,
char *i2s_ASN1_OCTET_STRING(X509V3_EXT_METHOD *method,
const ASN1_OCTET_STRING *oct)
{
return OPENSSL_buf2hexstr(oct->data, oct->length);
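
OPENSSL_buf2hexstr() returns a freshly allocated, colon-separated hex string
that the caller must release with OPENSSL_free(). A minimal usage sketch with
a hypothetical buffer, error handling elided:

    unsigned char id[] = { 0xde, 0xad, 0xbe, 0xef };
    char *hex = OPENSSL_buf2hexstr(id, sizeof(id));   /* "DE:AD:BE:EF" */

    if (hex != NULL) {
        /* ... use the string ... */
        OPENSSL_free(hex);      /* caller owns the returned string */
    }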


@ -89,7 +89,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&ja (&label("generic"));
&and ("edx",0xefffffff); # clear hyper-threading bit
&jmp (&label("generic"));
&set_label("intel");
&cmp ("edi",7);
&jb (&label("cacheinfo"));


@ -535,7 +535,7 @@ $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
sub $len,%rsp
shr \$3,$len
lea (%rsp),$out
.byte 0xf3,0x48,0xa5 # rep movsq
.byte 0xf3,0x48,0xa5 # rep movsq
lea (%r8),$out
lea (%rsp),$inp
mov $chunk,$len


@ -805,7 +805,7 @@ X509_NAME_ENTRY *X509_NAME_ENTRY_create_by_txt(X509_NAME_ENTRY **ne,
const unsigned char *bytes,
int len);
X509_NAME_ENTRY *X509_NAME_ENTRY_create_by_NID(X509_NAME_ENTRY **ne, int nid,
int type,
int type,
const unsigned char *bytes,
int len);
int X509_NAME_add_entry_by_txt(X509_NAME *name, const char *field, int type,
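
These are the low-level entry constructors; most callers go through the
X509_NAME_add_entry_by_txt() wrapper declared just below. A minimal sketch
(hypothetical subject string, error handling elided):

    X509_NAME *name = X509_NAME_new();

    if (name == NULL
            || !X509_NAME_add_entry_by_txt(name, "CN", MBSTRING_ASC,
                                           (const unsigned char *)"Example CA",
                                           -1, -1, 0))
        /* handle error */;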


@ -178,7 +178,7 @@ static int wpacket_intern_close(WPACKET *pkt)
}
/* Write out the WPACKET length if needed */
if (sub->lenbytes > 0
if (sub->lenbytes > 0
&& !put_value((unsigned char *)&pkt->buf->data[sub->packet_len],
packlen, sub->lenbytes))
return 0;
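
The caller-side pattern this supports: open a length-prefixed sub-packet,
write the body, and let WPACKET_close() backfill the length bytes on the way
out. A minimal sketch against the internal WPACKET API (error handling
collapsed into one chain):

    WPACKET pkt;
    BUF_MEM *buf = BUF_MEM_new();

    if (buf == NULL
            || !WPACKET_init(&pkt, buf)
            || !WPACKET_start_sub_packet_u16(&pkt)  /* length filled in later */
            || !WPACKET_put_bytes_u8(&pkt, 0x2a)    /* body */
            || !WPACKET_close(&pkt)                 /* backfills the length */
            || !WPACKET_finish(&pkt))
        /* handle error */;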


@ -707,7 +707,7 @@ int WPACKET_sub_allocate_bytes__(WPACKET *pkt, size_t len,
* maximum size will be. If this function is used, then it should be immediately
* followed by a WPACKET_allocate_bytes() call before any other WPACKET
* functions are called (unless the write to the allocated bytes is abandoned).
*
*
* For example: If we are generating a signature, then the size of that
* signature may not be known in advance. We can use WPACKET_reserve_bytes() to
* handle this:
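
In the header the comment continues with an example along these lines
(sketch; pkt, md_ctx and pkey are assumed to be set up elsewhere):

    unsigned char *sigbytes1, *sigbytes2;
    unsigned int siglen;

    if (!WPACKET_sub_reserve_bytes_u16(&pkt, EVP_PKEY_size(pkey), &sigbytes1)
            || EVP_SignFinal(md_ctx, sigbytes1, &siglen, pkey) <= 0
            || !WPACKET_sub_allocate_bytes_u16(&pkt, siglen, &sigbytes2)
            || sigbytes1 != sigbytes2)
        goto err;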


@ -6,7 +6,7 @@
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# Perl utility to run PKITS tests for RFC3280 compliance.
# Perl utility to run PKITS tests for RFC3280 compliance.
my $ossl_path;


@ -23,7 +23,7 @@ my %conversionforms = (
sub tconversion {
my $testtype = shift;
my $t = shift;
my @conversionforms =
my @conversionforms =
defined($conversionforms{$testtype}) ?
@{$conversionforms{$testtype}} :
@{$conversionforms{"*"}};


@ -115,7 +115,7 @@ static int test_WPACKET_set_max_size(void)
|| !WPACKET_set_max_size(&pkt, SIZE_MAX)
|| !WPACKET_finish(&pkt)) {
testfail("test_WPACKET_set_max_size():1 failed\n", &pkt);
return 0;
return 0;
}
if (!WPACKET_init_len(&pkt, buf, 1)


@ -8,7 +8,7 @@
# This is just a quick script to scan for cases where the 'error'
# function name in a XXXerr() macro is wrong.
#
#
# Run in the top level by running
# perl util/ck_errf.pl */*.c */*/*.c
#


@ -40,7 +40,7 @@ if ($fnum <= 1)
}
$dest = pop @filelist;
if ($fnum > 2 && ! -d $dest)
{
die "Destination must be a directory";
@ -73,5 +73,5 @@ foreach (@filelist)
close(OUT);
print "Copying: $_ to $dfile\n";
}


@ -109,7 +109,7 @@ sub check_hash
$hashval =~ s/^.*=\s+//;
die "Invalid hash syntax in file" if (length($hashfile) != 40);
die "Invalid hash received for file" if (length($hashval) != 40);
die "***HASH VALUE MISMATCH FOR FILE $filename ***" if ($hashval ne $hashfile);
die "***HASH VALUE MISMATCH FOR FILE $filename ***" if ($hashval ne $hashfile);
}


@ -24,7 +24,7 @@
# existence:platform:kind:algorithms
#
# - "existence" can be "EXIST" or "NOEXIST" depending on if the symbol is
# found somewhere in the source,
# found somewhere in the source,
# - "platforms" is empty if it exists on all platforms, otherwise it contains
# comma-separated list of the platform, just as they are if the symbol exists
# for those platforms, or prepended with a "!" if not. This helps resolve
@ -172,7 +172,7 @@ foreach (@ARGV, split(/ /, $config{options}))
$do_ssl=1 if $_ eq "libssl";
if ($_ eq "ssl") {
$do_ssl=1;
$do_ssl=1;
$libname=$_
}
$do_crypto=1 if $_ eq "libcrypto";
@ -211,7 +211,7 @@ foreach (@ARGV, split(/ /, $config{options}))
}
if (!$libname) {
if (!$libname) {
if ($do_ssl) {
$libname="LIBSSL";
}
@ -339,7 +339,7 @@ if($do_crypto == 1) {
}
&update_numbers(*OUT,"LIBCRYPTO",*crypto_list,$max_crypto,@crypto_symbols);
close OUT;
}
}
} elsif ($do_checkexist) {
&check_existing(*ssl_list, @ssl_symbols)


@ -97,7 +97,7 @@ Options:
Default: keep previously assigned numbers. (You are warned
when collisions are detected.)
-nostatic Generates a different source code, where these additional
-nostatic Generates a different source code, where these additional
functions are generated for each library specified in the
config file:
void ERR_load_<LIB>_strings(void);
@ -105,7 +105,7 @@ Options:
void ERR_<LIB>_error(int f, int r, char *fn, int ln);
#define <LIB>err(f,r) ERR_<LIB>_error(f,r,OPENSSL_FILE,OPENSSL_LINE)
while the code facilitates the use of these in an environment
where the error support routines are dynamically loaded at
where the error support routines are dynamically loaded at
runtime.
Default: 'static' code generation.
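
Concretely, for a hypothetical library "FOO" the -nostatic output would
declare something like the following (illustration only, substituting FOO
for <LIB> in the patterns listed above):

    void ERR_load_FOO_strings(void);
    void ERR_FOO_error(int f, int r, char *fn, int ln);
    #define FOOerr(f,r) ERR_FOO_error(f,r,OPENSSL_FILE,OPENSSL_LINE)
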
@ -114,8 +114,8 @@ Options:
-unref Print out unreferenced function and reason codes.
-write Actually (over)write the generated code to the header and C
source files as assigned to each library through the config
-write Actually (over)write the generated code to the header and C
source files as assigned to each library through the config
file.
Default: don't write.
@ -196,7 +196,7 @@ while (($hdr, $lib) = each %libinc)
if(/\/\*/) {
if (not /\*\//) { # multiline comment...
$line = $_; # ... just accumulate
next;
next;
} else {
s/\/\*.*?\*\///gs; # wipe it
}
@ -370,7 +370,7 @@ foreach $file (@source) {
print STDERR "ERROR: mismatch $file:$linenr $func:$3\n";
$errcount++;
}
print STDERR "Function: $1\t= $fcodes{$1} (lib: $2, name: $3)\n" if $debug;
print STDERR "Function: $1\t= $fcodes{$1} (lib: $2, name: $3)\n" if $debug;
}
if(/(([A-Z0-9]+)_R_[A-Z0-9_]+)/) {
next unless exists $csrc{$2};
@ -379,8 +379,8 @@ foreach $file (@source) {
$rcodes{$1} = "X";
$rnew{$2}++;
}
print STDERR "Reason: $1\t= $rcodes{$1} (lib: $2)\n" if $debug;
}
print STDERR "Reason: $1\t= $rcodes{$1} (lib: $2)\n" if $debug;
}
}
close IN;
}


@ -108,7 +108,7 @@ sub structureData {
if($inbrace) {
if($item eq "}") {
$inbrace --;
if(!$inbrace) {
$substruc = structureData($dataitem);
$dataitem = $substruc;