Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse
to the "cpuid" assembler module and gain a 2x speed-up.
This commit is contained in:
Andy Polyakov 2007-05-14 21:35:25 +00:00
parent 932cc129ee
commit b2dba9bf1f
7 changed files with 158 additions and 4 deletions

View file

@ -1209,6 +1209,7 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/);
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
$cpuid_obj="mem_clr.o" unless ($cpuid_obj =~ /\.o$/);
$des_obj=$des_enc unless ($des_obj =~ /\.o$/);
$bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/);
$cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/);
@ -1481,7 +1482,7 @@ print OUT "#ifdef OPENSSL_ALGORITHM_DEFINES\n";
print OUT $openssl_algorithm_defines_trans;
print OUT "#endif\n\n";
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");
while (<IN>)
{

View file

@ -34,7 +34,7 @@ GENERAL=Makefile README crypto-lib.com install.com
LIB= $(TOP)/libcrypto.a
SHARED_LIB= libcrypto$(SHLIB_EXT)
LIBSRC= cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
SRC= $(LIBSRC)

View file

@ -1,11 +1,13 @@
// Works on all IA-64 platforms: Linux, HP-UX, Win64i...
// On Win64i compile with ias.exe.
.text
// OPENSSL_cpuid_setup: empty stub. Its only bundle branches straight
// back to the caller through b0; no registers or memory are touched.
.global OPENSSL_cpuid_setup#
.proc OPENSSL_cpuid_setup#
OPENSSL_cpuid_setup:
{ .mib; br.ret.sptk.many b0 };;
.endp OPENSSL_cpuid_setup#
.global OPENSSL_rdtsc#
.proc OPENSSL_rdtsc#
OPENSSL_rdtsc:
@ -124,3 +126,37 @@ OPENSSL_wipe_cpu:
mov ar.lc=r3
br.ret.sptk b0 };;
.endp OPENSSL_wipe_cpu#
// void OPENSSL_cleanse(void *ptr, size_t len);
// Stores zero over len bytes starting at ptr.
//   r32 = ptr (advanced by the post-increment stores)
//   r33 = len (bytes remaining)
//   r2  = scratch, p6/p7 = branch predicates
// Short buffers (len<15) are wiped bytewise; longer ones are first
// advanced to an 8-byte boundary, wiped 8 bytes at a time, and any
// 1..7 byte tail is finished in the byte loop.
.global	OPENSSL_cleanse#
.proc	OPENSSL_cleanse#
OPENSSL_cleanse:
{ .mib;	cmp.eq		p6,p0=0,r33		// len==0: return at once,
(p6)	br.ret.spnt	b0		};;	// .Little always stores >=1 byte
{ .mib;	and		r2=7,r32		// ptr&7, examined at .Lot
	cmp.leu		p6,p0=15,r33		// len>=15
(p6)	br.cond.dptk	.Lot		};;
.Little:					// byte loop, len>=1 on entry
{ .mib;	st1		[r32]=r0,1
	cmp.ltu		p6,p7=1,r33	}	// len>1
{ .mbb;	add		r33=-1,r33		// len--
(p6)	br.cond.dptk	.Little
(p7)	br.ret.sptk.many	b0	};;
.Lot:						// bytewise until ptr&7==0
{ .mib;	cmp.eq		p6,p0=0,r2
(p6)	br.cond.dptk	.Laligned	};;
{ .mmi;	st1		[r32]=r0,1;;
	and		r2=7,r32	}	// recompute after increment
{ .mib;	add		r33=-1,r33
	br		.Lot		};;
.Laligned:					// 8 bytes per iteration
{ .mmi;	st8		[r32]=r0,8
	and		r2=-8,r33		// len&~7
	add		r33=-8,r33	};;	// len-=8
{ .mib;	cmp.ltu		p6,p0=8,r2		// ((len+8)&~7)>8
(p6)	br.cond.dptk	.Laligned	};;
{ .mbb;	cmp.eq		p6,p7=r0,r33
(p7)	br.cond.dpnt	.Little			// 1..7 tail bytes remain
(p6)	br.ret.sptk.many	b0	};;
.endp	OPENSSL_cleanse#

View file

@ -250,7 +250,6 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
void *CRYPTO_malloc_locked(int num, const char *file, int line)
{
void *ret = NULL;
extern unsigned char cleanse_ctr;
if (num <= 0) return NULL;
@ -267,11 +266,15 @@ void *CRYPTO_malloc_locked(int num, const char *file, int line)
if (malloc_debug_func != NULL)
malloc_debug_func(ret, num, file, line, 1);
#ifndef OPENSSL_CPUID_OBJ
/* Create a dependency on the value of 'cleanse_ctr' so our memory
* sanitisation function can't be optimised out. NB: We only do
* this for >2Kb so the overhead doesn't bother us. */
if(ret && (num > 2048))
{ extern unsigned char cleanse_ctr;
((unsigned char *)ret)[0] = cleanse_ctr;
}
#endif
return ret;
}
@ -291,7 +294,6 @@ void CRYPTO_free_locked(void *str)
void *CRYPTO_malloc(int num, const char *file, int line)
{
void *ret = NULL;
extern unsigned char cleanse_ctr;
if (num <= 0) return NULL;
@ -308,11 +310,15 @@ void *CRYPTO_malloc(int num, const char *file, int line)
if (malloc_debug_func != NULL)
malloc_debug_func(ret, num, file, line, 1);
#ifndef OPENSSL_CPUID_OBJ
/* Create a dependency on the value of 'cleanse_ctr' so our memory
* sanitisation function can't be optimised out. NB: We only do
* this for >2Kb so the overhead doesn't bother us. */
if(ret && (num > 2048))
{ extern unsigned char cleanse_ctr;
((unsigned char *)ret)[0] = cleanse_ctr;
}
#endif
return ret;
}

View file

@ -232,6 +232,54 @@ _sparcv9_rdtick:
.type _sparcv9_rdtick,#function
.size _sparcv9_rdtick,.-_sparcv9_rdtick
! void OPENSSL_cleanse(void *ptr, size_t len)
! Stores zero over len bytes starting at ptr.
!   %o0 = ptr (advanced as bytes are wiped)
!   %o1 = len (bytes remaining)
! Buffers of up to 6 bytes are wiped bytewise; longer ones are advanced
! to a 4-byte boundary, wiped a word at a time, and any 1..3 byte tail
! is finished in the byte loop.
.global	OPENSSL_cleanse
.align	32
OPENSSL_cleanse:
	cmp	%o1,6
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	cmp	%o1,0		! delay slot; result only used on fall-through
	bne	.Little		! 1..6 bytes: byte loop
	nop
	retl			! len==0: nothing to wipe (the byte loop
	nop			! would store and then wrap the counter)
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0	! delay slot: ptr++
	retl
	nop
.align	32
.Lot:
	andcc	%o0,3,%g0	! bytewise until ptr is 4-byte aligned
	bz	.Laligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.Lot
	add	%o0,1,%o0	! delay slot: ptr++
	nop
.Laligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0	! at least 4 bytes left?
#ifdef ABI64
	bnz	%xcc,.Laligned
#else
	bnz	.Laligned
#endif
	add	%o0,4,%o0	! delay slot: ptr+=4
	cmp	%o1,0
	bne	.Little		! 1..3 tail bytes remain
	nop
	retl
	nop
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
.section ".init",#alloc,#execinstr
call OPENSSL_cpuid_setup
nop

View file

@ -155,4 +155,36 @@ OPENSSL_ia32_cpuid:
or %rcx,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
# void OPENSSL_cleanse(void *ptr, size_t len)
# Stores zero over len bytes starting at ptr.
#   rdi = ptr (advanced as bytes are wiped)
#   rsi = len (bytes remaining)
#   rax = 0, the value stored
# Buffers shorter than 15 bytes are wiped bytewise; longer ones are
# advanced to an 8-byte boundary, wiped a quadword at a time, and any
# 1..7 byte tail is finished in the byte loop.
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,\@function,2
.align	16
OPENSSL_cleanse:
	xor	%rax,%rax
	cmp	\$15,%rsi
	jae	.Lot
	test	%rsi,%rsi		# len==0: nothing to wipe; the byte
	jz	.Lcleanse_ret		# loop would wrap the counter
.Little:
	mov	%al,(%rdi)
	sub	\$1,%rsi
	lea	1(%rdi),%rdi		# lea keeps ZF from the sub
	jnz	.Little
.Lcleanse_ret:
	ret
.align	16
.Lot:
	test	\$7,%rdi		# bytewise until ptr is 8-byte aligned
	jz	.Laligned
	mov	%al,(%rdi)
	lea	-1(%rsi),%rsi
	lea	1(%rdi),%rdi
	jmp	.Lot
.Laligned:
	mov	%rax,(%rdi)
	lea	-8(%rsi),%rsi
	test	\$-8,%rsi		# at least 8 bytes left?
	lea	8(%rdi),%rdi		# lea keeps ZF from the test
	jnz	.Laligned
	test	%rsi,%rsi
	jnz	.Little			# 1..7 tail bytes remain
	ret
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
___

View file

@ -216,6 +216,37 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
}
&function_end_B("OPENSSL_indirect_call");
# void OPENSSL_cleanse(void *ptr, size_t len)
# Stores zero over len bytes starting at ptr.
#   edx = ptr (advanced as bytes are wiped)
#   ecx = len (bytes remaining)
#   eax = 0, the value stored
# Buffers shorter than 7 bytes are wiped bytewise; longer ones are
# advanced to a 4-byte boundary, wiped a dword at a time, and any
# 1..3 byte tail is finished in the byte loop.
&function_begin_B("OPENSSL_cleanse");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&wparam(1));
	&xor	("eax","eax");
	&cmp	("ecx",7);
	&jae	(&label("lot"));
	&cmp	("ecx",0);		# len==0: nothing to wipe; the byte
	&je	(&label("done"));	# loop would wrap the counter
&set_label("little");
	&mov	(&BP(0,"edx"),"al");
	&sub	("ecx",1);
	&lea	("edx",&DWP(1,"edx"));	# lea keeps ZF from the sub
	&jnz	(&label("little"));
&set_label("done");
	&ret	();
&set_label("lot",16);
	&test	("edx",3);		# bytewise until ptr is 4-byte aligned
	&jz	(&label("aligned"));
	&mov	(&BP(0,"edx"),"al");
	&lea	("ecx",&DWP(-1,"ecx"));
	&lea	("edx",&DWP(1,"edx"));
	&jmp	(&label("lot"));
&set_label("aligned");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("ecx",&DWP(-4,"ecx"));
	&test	("ecx",-4);		# at least 4 bytes left?
	&lea	("edx",&DWP(4,"edx"));	# lea keeps ZF from the test
	&jnz	(&label("aligned"));
	&cmp	("ecx",0);
	&jne	(&label("little"));	# 1..3 tail bytes remain
	&ret	();
&function_end_B("OPENSSL_cleanse");
&initseg("OPENSSL_cpuid_setup");
&asm_finish();