Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse to "cpuid" assembler module and gain 2x.
This commit is contained in:
parent
932cc129ee
commit
b2dba9bf1f
7 changed files with 158 additions and 4 deletions
|
@ -1209,6 +1209,7 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/);
|
|||
|
||||
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
|
||||
|
||||
$cpuid_obj="mem_clr.o" unless ($cpuid_obj =~ /\.o$/);
|
||||
$des_obj=$des_enc unless ($des_obj =~ /\.o$/);
|
||||
$bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/);
|
||||
$cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/);
|
||||
|
@ -1481,7 +1482,7 @@ print OUT "#ifdef OPENSSL_ALGORITHM_DEFINES\n";
|
|||
print OUT $openssl_algorithm_defines_trans;
|
||||
print OUT "#endif\n\n";
|
||||
|
||||
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
|
||||
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");
|
||||
|
||||
while (<IN>)
|
||||
{
|
||||
|
|
|
@ -34,7 +34,7 @@ GENERAL=Makefile README crypto-lib.com install.com
|
|||
LIB= $(TOP)/libcrypto.a
|
||||
SHARED_LIB= libcrypto$(SHLIB_EXT)
|
||||
LIBSRC= cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
|
||||
LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
|
||||
LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
|
||||
|
||||
SRC= $(LIBSRC)
|
||||
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
// Works on all IA-64 platforms: Linux, HP-UX, Win64i...
|
||||
// On Win64i compile with ias.exe.
|
||||
.text
|
||||
|
||||
// OPENSSL_cpuid_setup: exported entry point whose whole body is a single
// return through b0, so calling it has no effect in this module.
.global OPENSSL_cpuid_setup#
|
||||
.proc OPENSSL_cpuid_setup#
|
||||
OPENSSL_cpuid_setup:
|
||||
// Return to the caller immediately; nothing else is executed.
{ .mib; br.ret.sptk.many b0 };;
|
||||
.endp OPENSSL_cpuid_setup#
|
||||
|
||||
.global OPENSSL_rdtsc#
|
||||
.proc OPENSSL_rdtsc#
|
||||
OPENSSL_rdtsc:
|
||||
|
@ -124,3 +126,37 @@ OPENSSL_wipe_cpu:
|
|||
mov ar.lc=r3
|
||||
br.ret.sptk b0 };;
|
||||
.endp OPENSSL_wipe_cpu#
|
||||
|
||||
// OPENSSL_cleanse: overwrite a memory region with zeros.
//   r32 = start address, r33 = length in bytes.
// Regions of 15 bytes or more are first byte-filled up to an 8-byte
// boundary (.Lot), then cleared 8 bytes at a time (.Laligned); whatever is
// left, and every short region, is cleared byte by byte (.Little).
.global	OPENSSL_cleanse#
.proc	OPENSSL_cleanse#
OPENSSL_cleanse:
// .Little stores before it tests the count, so a zero length has to be
// filtered out here or one byte past the buffer would be written.
{ .mib;	cmp.eq		p6,p0=0,r33	// len==0
(p6)	br.ret.spnt	b0		};;
{ .mib;	and		r2=7,r32
	cmp.leu		p6,p0=15,r33	// len>=15
(p6)	br.cond.dptk	.Lot		};;

// Byte loop: requires len>=1 on entry.
.Little:
{ .mib;	st1		[r32]=r0,1
	cmp.ltu		p6,p7=1,r33	}	// len>1
{ .mbb;	add		r33=-1,r33		// len--
(p6)	br.cond.dptk	.Little
(p7)	br.ret.sptk.many	b0	};;

// Alignment loop: r2 holds the low three address bits; store single
// bytes until the address is a multiple of 8.
.Lot:
{ .mib;	cmp.eq		p6,p0=0,r2
(p6)	br.cond.dptk	.Laligned	};;
{ .mmi;	st1		[r32]=r0,1;;
	and		r2=7,r32	}
{ .mib;	add		r33=-1,r33
	br		.Lot		};;

// Word loop: r2 is computed from the length before the decrement (see the
// original comment on the compare), so the loop repeats while at least
// 8 bytes remain after this store.
.Laligned:
{ .mmi;	st8		[r32]=r0,8
	and		r2=-8,r33		// len&~7
	add		r33=-8,r33	};;	// len-=8
{ .mib;	cmp.ltu		p6,p0=8,r2		// ((len+8)&~7)>8
(p6)	br.cond.dptk	.Laligned	};;

// 0..7 bytes left: finish in the byte loop, or return when none remain.
{ .mbb;	cmp.eq		p6,p7=r0,r33
(p7)	br.cond.dpnt	.Little
(p6)	br.ret.sptk.many	b0	};;
.endp	OPENSSL_cleanse#
|
||||
|
|
10
crypto/mem.c
10
crypto/mem.c
|
@ -250,7 +250,6 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
|
|||
void *CRYPTO_malloc_locked(int num, const char *file, int line)
|
||||
{
|
||||
void *ret = NULL;
|
||||
extern unsigned char cleanse_ctr;
|
||||
|
||||
if (num <= 0) return NULL;
|
||||
|
||||
|
@ -267,11 +266,15 @@ void *CRYPTO_malloc_locked(int num, const char *file, int line)
|
|||
if (malloc_debug_func != NULL)
|
||||
malloc_debug_func(ret, num, file, line, 1);
|
||||
|
||||
#ifndef OPENSSL_CPUID_OBJ
|
||||
/* Create a dependency on the value of 'cleanse_ctr' so our memory
|
||||
* sanitisation function can't be optimised out. NB: We only do
|
||||
* this for >2Kb so the overhead doesn't bother us. */
|
||||
if(ret && (num > 2048))
|
||||
{ extern unsigned char cleanse_ctr;
|
||||
((unsigned char *)ret)[0] = cleanse_ctr;
|
||||
}
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -291,7 +294,6 @@ void CRYPTO_free_locked(void *str)
|
|||
void *CRYPTO_malloc(int num, const char *file, int line)
|
||||
{
|
||||
void *ret = NULL;
|
||||
extern unsigned char cleanse_ctr;
|
||||
|
||||
if (num <= 0) return NULL;
|
||||
|
||||
|
@ -308,11 +310,15 @@ void *CRYPTO_malloc(int num, const char *file, int line)
|
|||
if (malloc_debug_func != NULL)
|
||||
malloc_debug_func(ret, num, file, line, 1);
|
||||
|
||||
#ifndef OPENSSL_CPUID_OBJ
|
||||
/* Create a dependency on the value of 'cleanse_ctr' so our memory
|
||||
* sanitisation function can't be optimised out. NB: We only do
|
||||
* this for >2Kb so the overhead doesn't bother us. */
|
||||
if(ret && (num > 2048))
|
||||
{ extern unsigned char cleanse_ctr;
|
||||
((unsigned char *)ret)[0] = cleanse_ctr;
|
||||
}
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -232,6 +232,54 @@ _sparcv9_rdtick:
|
|||
.type _sparcv9_rdtick,#function
|
||||
.size _sparcv9_rdtick,.-_sparcv9_rdtick
|
||||
|
||||
! OPENSSL_cleanse: overwrite a memory region with zeros.
!   %o0 = start address, %o1 = length in bytes.
! Regions longer than 6 bytes are byte-filled up to a 4-byte boundary
! (.Lot), then cleared one word at a time (.Laligned); the remainder and
! all short regions are cleared byte by byte (.Little).
.global	OPENSSL_cleanse
.align	32
OPENSSL_cleanse:
	cmp	%o1,6
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	nop
	! Short path.  .Little stores before it tests the count, so a zero
	! length must return here; otherwise the count wraps and the loop
	! keeps storing far beyond the buffer.
	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop

! Byte loop: requires a non-zero length on entry.
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0		! delay slot: advance the pointer
	retl
	nop
.align	32
! Alignment loop: store single bytes until the address is a multiple of 4.
.Lot:
	andcc	%o0,3,%g0
	bz	.Laligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.Lot
	add	%o0,1,%o0		! delay slot: advance the pointer
	nop
! Word loop: repeat while at least 4 bytes remain after the store.
.Laligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0
#ifdef ABI64
	bnz	%xcc,.Laligned
#else
	bnz	.Laligned
#endif
	add	%o0,4,%o0		! delay slot: advance the pointer

	! 0..3 bytes left: finish in the byte loop, or return when none remain.
	cmp	%o1,0
	bne	.Little
	nop
	retl
	nop
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
|
||||
|
||||
.section ".init",#alloc,#execinstr
|
||||
call OPENSSL_cpuid_setup
|
||||
nop
|
||||
|
|
|
@ -155,4 +155,36 @@ OPENSSL_ia32_cpuid:
|
|||
or %rcx,%rax
|
||||
ret
|
||||
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
||||
|
||||
# OPENSSL_cleanse: overwrite a memory region with zeros.
#   rdi = start address, rsi = length in bytes.
# Regions of 15 bytes or more are byte-filled up to an 8-byte boundary
# (.Lot), then cleared 8 bytes at a time (.Laligned); the remainder and
# all short regions are cleared byte by byte (.Little).
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,\@function,2
.align	16
OPENSSL_cleanse:
	xor	%rax,%rax
	cmp	\$15,%rsi
	jae	.Lot
	# .Little stores before it tests the count, so a zero length must
	# return here; otherwise the count wraps and the loop runs away.
	cmp	\$0,%rsi
	je	.Lret
.Little:
	mov	%al,(%rdi)
	sub	\$1,%rsi
	lea	1(%rdi),%rdi		# lea leaves the flags from sub intact
	jnz	.Little
.Lret:
	ret
.align	16
# Alignment loop: store single bytes until the address is a multiple of 8.
.Lot:
	test	\$7,%rdi
	jz	.Laligned
	mov	%al,(%rdi)
	lea	-1(%rsi),%rsi
	lea	1(%rdi),%rdi
	jmp	.Lot
# Word loop: repeat while at least 8 bytes remain after the store.
.Laligned:
	mov	%rax,(%rdi)
	lea	-8(%rsi),%rsi
	test	\$-8,%rsi
	lea	8(%rdi),%rdi		# lea leaves the flags from test intact
	jnz	.Laligned
	# 0..7 bytes left: finish in the byte loop, or return when none remain.
	cmp	\$0,%rsi
	jne	.Little
	ret
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
|
||||
___
|
||||
|
|
|
@ -216,6 +216,37 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||
}
|
||||
&function_end_B("OPENSSL_indirect_call");
|
||||
|
||||
# OPENSSL_cleanse: overwrite a memory region with zeros.
#   first stack argument  = start address (kept in edx)
#   second stack argument = length in bytes (kept in ecx)
# Regions of 7 bytes or more are byte-filled up to a 4-byte boundary
# ("lot"), then cleared one dword at a time ("aligned"); the remainder
# and all short regions are cleared byte by byte ("little").
&function_begin_B("OPENSSL_cleanse");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&wparam(1));
	&xor	("eax","eax");
	&cmp	("ecx",7);
	&jae	(&label("lot"));
	# "little" stores before it tests the count, so a zero length must
	# return here; otherwise the count wraps and the loop runs away.
	&cmp	("ecx",0);
	&je	(&label("ret"));
&set_label("little");
	&mov	(&BP(0,"edx"),"al");
	&sub	("ecx",1);
	&lea	("edx",&DWP(1,"edx"));	# lea leaves the flags from sub intact
	&jnz	(&label("little"));
&set_label("ret");
	&ret	();

# Alignment loop: store single bytes until the address is a multiple of 4.
&set_label("lot",16);
	&test	("edx",3);
	&jz	(&label("aligned"));
	&mov	(&BP(0,"edx"),"al");
	&lea	("ecx",&DWP(-1,"ecx"));
	&lea	("edx",&DWP(1,"edx"));
	&jmp	(&label("lot"));
# Dword loop: repeat while at least 4 bytes remain after the store.
&set_label("aligned");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("ecx",&DWP(-4,"ecx"));
	&test	("ecx",-4);
	&lea	("edx",&DWP(4,"edx"));	# lea leaves the flags from test intact
	&jnz	(&label("aligned"));
	# 0..3 bytes left: finish in the byte loop, or return when none remain.
	&cmp	("ecx",0);
	&jne	(&label("little"));
	&ret	();
&function_end_B("OPENSSL_cleanse");
|
||||
|
||||
&initseg("OPENSSL_cpuid_setup");
|
||||
|
||||
&asm_finish();
|
||||
|
|
Loading…
Reference in a new issue