ARM assembly pack: get ARMv7 instruction endianness right.
Pointed out and suggested by: Ard Biesheuvel.
This commit is contained in:
parent
cd91fd7c32
commit
5dcf70a1c5
8 changed files with 89 additions and 47 deletions
|
@ -715,8 +715,8 @@ _armv4_AES_set_encrypt_key:
|
|||
.Ldone: mov r0,#0
|
||||
ldmia sp!,{r4-r12,lr}
|
||||
.Labrt:
|
||||
#if defined(__thumb2__) && __ARM_ARCH__>=7
|
||||
.short 0x4770 @ bx lr in Thumb2 encoding
|
||||
#if __ARM_ARCH__>=5
|
||||
ret @ bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
|
@ -1203,6 +1203,7 @@ _armv4_AES_decrypt:
|
|||
___
|
||||
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
$code =~ s/\bret\b/bx\tlr/gm;
|
||||
|
||||
open SELF,$0;
|
||||
while(<SELF>) {
|
||||
|
|
|
@ -7,42 +7,46 @@
|
|||
.global _armv7_neon_probe
|
||||
.type _armv7_neon_probe,%function
|
||||
_armv7_neon_probe:
|
||||
.word 0xf26ee1fe @ vorr q15,q15,q15
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.byte 0xf0,0x01,0x60,0xf2 @ vorr q8,q8,q8
|
||||
.byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||||
.size _armv7_neon_probe,.-_armv7_neon_probe
|
||||
|
||||
.global _armv7_tick
|
||||
.type _armv7_tick,%function
|
||||
_armv7_tick:
|
||||
mrrc p15,1,r0,r1,c14 @ CNTVCT
|
||||
.word 0xe12fff1e @ bx lr
|
||||
mrrc p15,1,r0,r1,c14 @ CNTVCT
|
||||
#if __ARM_ARCH__>=5
|
||||
bx lr
|
||||
#else
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#endif
|
||||
.size _armv7_tick,.-_armv7_tick
|
||||
|
||||
.global _armv8_aes_probe
|
||||
.type _armv8_aes_probe,%function
|
||||
_armv8_aes_probe:
|
||||
.word 0xf3b00300 @ aese.8 q0,q0
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.byte 0x00,0x03,0xb0,0xf3 @ aese.8 q0,q0
|
||||
.byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||||
.size _armv8_aes_probe,.-_armv8_aes_probe
|
||||
|
||||
.global _armv8_sha1_probe
|
||||
.type _armv8_sha1_probe,%function
|
||||
_armv8_sha1_probe:
|
||||
.word 0xf2000c40 @ sha1c.32 q0,q0,q0
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.byte 0x40,0x0c,0x00,0xf2 @ sha1c.32 q0,q0,q0
|
||||
.byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||||
.size _armv8_sha1_probe,.-_armv8_sha1_probe
|
||||
|
||||
.global _armv8_sha256_probe
|
||||
.type _armv8_sha256_probe,%function
|
||||
_armv8_sha256_probe:
|
||||
.word 0xf3000c40 @ sha256h.32 q0,q0,q0
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.byte 0x40,0x0c,0x00,0xf3 @ sha256h.32 q0,q0,q0
|
||||
.byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||||
.size _armv8_sha256_probe,.-_armv8_sha256_probe
|
||||
.global _armv8_pmull_probe
|
||||
.type _armv8_pmull_probe,%function
|
||||
_armv8_pmull_probe:
|
||||
.word 0xf2a00e00 @ vmull.p64 q0,d0,d0
|
||||
.word 0xe12fff1e @ bx lr
|
||||
.byte 0x00,0x0e,0xa0,0xf2 @ vmull.p64 q0,d0,d0
|
||||
.byte 0x1e,0xff,0x2f,0xe1 @ bx lr
|
||||
.size _armv8_pmull_probe,.-_armv8_pmull_probe
|
||||
|
||||
.align 5
|
||||
|
@ -56,7 +60,7 @@ OPENSSL_atomic_add:
|
|||
cmp r2,#0
|
||||
bne .Ladd
|
||||
mov r0,r3
|
||||
.word 0xe12fff1e @ bx lr
|
||||
bx lr
|
||||
#else
|
||||
stmdb sp!,{r4-r6,lr}
|
||||
ldr r2,.Lspinlock
|
||||
|
@ -109,9 +113,13 @@ OPENSSL_cleanse:
|
|||
adds r1,r1,#4
|
||||
bne .Little
|
||||
.Lcleanse_done:
|
||||
#if __ARM_ARCH__>=5
|
||||
bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#endif
|
||||
.size OPENSSL_cleanse,.-OPENSSL_cleanse
|
||||
|
||||
.global OPENSSL_wipe_cpu
|
||||
|
@ -125,41 +133,53 @@ OPENSSL_wipe_cpu:
|
|||
eor ip,ip,ip
|
||||
tst r0,#1
|
||||
beq .Lwipe_done
|
||||
.word 0xf3000150 @ veor q0, q0, q0
|
||||
.word 0xf3022152 @ veor q1, q1, q1
|
||||
.word 0xf3044154 @ veor q2, q2, q2
|
||||
.word 0xf3066156 @ veor q3, q3, q3
|
||||
.word 0xf34001f0 @ veor q8, q8, q8
|
||||
.word 0xf34221f2 @ veor q9, q9, q9
|
||||
.word 0xf34441f4 @ veor q10, q10, q10
|
||||
.word 0xf34661f6 @ veor q11, q11, q11
|
||||
.word 0xf34881f8 @ veor q12, q12, q12
|
||||
.word 0xf34aa1fa @ veor q13, q13, q13
|
||||
.word 0xf34cc1fc @ veor q14, q14, q14
|
||||
.word 0xf34ee1fe @ veor q15, q15, q15
|
||||
.byte 0x50,0x01,0x00,0xf3 @ veor q0, q0, q0
|
||||
.byte 0x52,0x21,0x02,0xf3 @ veor q1, q1, q1
|
||||
.byte 0x54,0x41,0x04,0xf3 @ veor q2, q2, q2
|
||||
.byte 0x56,0x61,0x06,0xf3 @ veor q3, q3, q3
|
||||
.byte 0xf0,0x01,0x40,0xf3 @ veor q8, q8, q8
|
||||
.byte 0xf2,0x21,0x42,0xf3 @ veor q9, q9, q9
|
||||
.byte 0xf4,0x41,0x44,0xf3 @ veor q10, q10, q10
|
||||
.byte 0xf6,0x61,0x46,0xf3 @ veor q11, q11, q11
|
||||
.byte 0xf8,0x81,0x48,0xf3 @ veor q12, q12, q12
|
||||
.byte 0xfa,0xa1,0x4a,0xf3 @ veor q13, q13, q13
|
||||
.byte 0xfc,0xc1,0x4c,0xf3 @ veor q14, q14, q14
|
||||
.byte 0xfe,0xe1,0x4e,0xf3 @ veor q15, q15, q15
|
||||
.Lwipe_done:
|
||||
mov r0,sp
|
||||
#if __ARM_ARCH__>=5
|
||||
bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#endif
|
||||
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||||
|
||||
.global OPENSSL_instrument_bus
|
||||
.type OPENSSL_instrument_bus,%function
|
||||
OPENSSL_instrument_bus:
|
||||
eor r0,r0,r0
|
||||
#if __ARM_ARCH__>=5
|
||||
bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#endif
|
||||
.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
|
||||
|
||||
.global OPENSSL_instrument_bus2
|
||||
.type OPENSSL_instrument_bus2,%function
|
||||
OPENSSL_instrument_bus2:
|
||||
eor r0,r0,r0
|
||||
#if __ARM_ARCH__>=5
|
||||
bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr
|
||||
.word 0xe12fff1e @ bx lr
|
||||
#endif
|
||||
.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
|
||||
|
||||
.align 5
|
||||
|
|
|
@ -202,7 +202,7 @@ bn_GF2m_mul_2x2:
|
|||
veor $r, $r, $t2
|
||||
|
||||
vst1.32 {$r}, [r0]
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
.align 4
|
||||
.Lialu:
|
||||
#endif
|
||||
|
@ -273,6 +273,7 @@ foreach (split("\n",$code)) {
|
|||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
|
|
|
@ -230,9 +230,14 @@ bn_mul_mont:
|
|||
ldmia sp!,{r4-r12,lr} @ restore registers
|
||||
add sp,sp,#2*4 @ skip over {r0,r2}
|
||||
mov r0,#1
|
||||
.Labrt: tst lr,#1
|
||||
.Labrt:
|
||||
#if __ARM_ARCH__>=5
|
||||
ret @ bx lr
|
||||
#else
|
||||
tst lr,#1
|
||||
moveq pc,lr @ be binary compatible with V4, yet
|
||||
bx lr @ interoperable with Thumb ISA:-)
|
||||
#endif
|
||||
.size bn_mul_mont,.-bn_mul_mont
|
||||
___
|
||||
{
|
||||
|
@ -650,7 +655,7 @@ bn_mul8x_mont_neon:
|
|||
sub sp,ip,#96
|
||||
vldmia sp!,{d8-d15}
|
||||
ldmia sp!,{r4-r11}
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
|
||||
#endif
|
||||
___
|
||||
|
@ -665,5 +670,6 @@ ___
|
|||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
$code =~ s/\bret\b/bx lr/gm;
|
||||
print $code;
|
||||
close STDOUT;
|
||||
|
|
|
@ -386,7 +386,7 @@ gcm_init_neon:
|
|||
veor $IN,$IN,$t0 @ twisted H
|
||||
vstmia r0,{$IN}
|
||||
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
.size gcm_init_neon,.-gcm_init_neon
|
||||
|
||||
.global gcm_gmult_neon
|
||||
|
@ -470,7 +470,7 @@ $code.=<<___;
|
|||
vst1.64 $Xl#hi,[$Xi,:64]! @ write out Xi
|
||||
vst1.64 $Xl#lo,[$Xi,:64]
|
||||
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
.size gcm_ghash_neon,.-gcm_ghash_neon
|
||||
#endif
|
||||
___
|
||||
|
@ -484,6 +484,7 @@ foreach (split("\n",$code)) {
|
|||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
|
|
|
@ -631,7 +631,7 @@ $code.=<<___;
|
|||
vst1.32 {$E\[0]},[$ctx]
|
||||
|
||||
vldmia sp!,{d8-d15}
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
|
||||
#endif
|
||||
___
|
||||
|
@ -648,13 +648,18 @@ ___
|
|||
sub unsha1 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
$arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
|
||||
&&
|
||||
sprintf ".long\t0x%08x\t@ %s %s",
|
||||
$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2),
|
||||
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
|
||||
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2);
|
||||
# ARMv7 instructions are always encoded little-endian, hence .byte;
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -664,6 +669,7 @@ foreach (split($/,$code)) {
|
|||
|
||||
s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
|
||||
|
||||
s/\bret\b/bx lr/o or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/o; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,$/;
|
||||
|
|
|
@ -608,7 +608,7 @@ $code.=<<___;
|
|||
|
||||
vst1.32 {$ABCD,$EFGH},[$ctx]
|
||||
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
|
||||
#endif
|
||||
___
|
||||
|
@ -626,13 +626,18 @@ ___
|
|||
sub unsha256 {
|
||||
my ($mnemonic,$arg)=@_;
|
||||
|
||||
$arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
|
||||
&&
|
||||
sprintf ".long\t0x%08x\t@ %s %s",
|
||||
$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2),
|
||||
if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
|
||||
my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
|
||||
|(($2&7)<<17)|(($2&8)<<4)
|
||||
|(($3&7)<<1) |(($3&8)<<2);
|
||||
# ARMv7 instructions are always encoded little-endian, hence .byte;
|
||||
# correct solution is to use .inst directive, but older
|
||||
# assemblers don't implement it:-(
|
||||
sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
|
||||
$word&0xff,($word>>8)&0xff,
|
||||
($word>>16)&0xff,($word>>24)&0xff,
|
||||
$mnemonic,$arg;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -642,6 +647,7 @@ foreach (split($/,$code)) {
|
|||
|
||||
s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
|
||||
|
||||
s/\bret\b/bx lr/go or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
|
|
|
@ -584,7 +584,7 @@ $code.=<<___;
|
|||
bne .Loop_neon
|
||||
|
||||
vldmia sp!,{d8-d15} @ epilogue
|
||||
bx lr
|
||||
ret @ bx lr
|
||||
#endif
|
||||
___
|
||||
}
|
||||
|
@ -597,5 +597,6 @@ ___
|
|||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
$code =~ s/\bret\b/bx lr/gm;
|
||||
print $code;
|
||||
close STDOUT; # enforce flush
|
||||
|
|
Loading…
Reference in a new issue