Sync ASM/modes to add CCM and XTS modes and assembly language optimisation
(from HEAD, original by Andy).
commit dc01af7723
parent 5435d0412f
17 changed files with 5774 additions and 58 deletions

crypto/modes/Makefile

@@ -10,15 +10,21 @@ CFLAG=-g
 MAKEFILE=	Makefile
 AR=		ar r
 
+MODES_ASM_OBJ=
+
 CFLAGS= $(INCLUDES) $(CFLAG)
+ASFLAGS= $(INCLUDES) $(ASFLAG)
+AFLAGS= $(ASFLAGS)
 
 GENERAL=Makefile
 TEST=
 APPS=
 
 LIB=$(TOP)/libcrypto.a
-LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c
-LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o
+LIBSRC= cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \
+	ccm128.c xts128.c
+LIBOBJ= cbc128.o ctr128.o cts128.o cfb128.o ofb128.o gcm128.o \
+	ccm128.o xts128.o $(MODES_ASM_OBJ)
 
 SRC= $(LIBSRC)
 
@@ -38,6 +44,24 @@ lib:	$(LIBOBJ)
 	$(RANLIB) $(LIB) || echo Never mind.
 	@touch lib
 
+ghash-ia64.s:	asm/ghash-ia64.pl
+	$(PERL) asm/ghash-ia64.pl $@ $(CFLAGS)
+ghash-x86.s:	asm/ghash-x86.pl
+	$(PERL) asm/ghash-x86.pl $(PERLASM_SCHEME) $(CFLAGS) $(PROCESSOR) > $@
+ghash-x86_64.s:	asm/ghash-x86_64.pl
+	$(PERL) asm/ghash-x86_64.pl $(PERLASM_SCHEME) > $@
+ghash-sparcv9.s:	asm/ghash-sparcv9.pl
+	$(PERL) asm/ghash-sparcv9.pl $@ $(CFLAGS)
+ghash-alpha.s:	asm/ghash-alpha.pl
+	$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
+ghash-parisc.s:	asm/ghash-parisc.pl
+	$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
+
+# GNU make "catch all"
+ghash-%.S:	asm/ghash-%.pl;	$(PERL) $< $(PERLASM_SCHEME) $@
+
+ghash-armv4.o:	ghash-armv4.S
+
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
 
@@ -71,12 +95,47 @@ dclean:
 	mv -f Makefile.new $(MAKEFILE)
 
 clean:
-	rm -f *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
+	rm -f *.s *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
-cbc128.o: cbc128.c modes.h
-cfb128.o: cfb128.c modes.h
-ctr128.o: ctr128.c modes.h
-cts128.o: cts128.c modes.h
-ofb128.o: modes.h ofb128.c
+cbc128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cbc128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cbc128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cbc128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cbc128.o: ../../include/openssl/symhacks.h cbc128.c modes_lcl.h
+ccm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ccm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ccm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ccm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ccm128.o: ../../include/openssl/symhacks.h ccm128.c modes_lcl.h
+cfb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cfb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cfb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cfb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cfb128.o: ../../include/openssl/symhacks.h cfb128.c modes_lcl.h
+ctr128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ctr128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ctr128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ctr128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ctr128.o: ../../include/openssl/symhacks.h ctr128.c modes_lcl.h
+cts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+cts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+cts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+cts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+cts128.o: ../../include/openssl/symhacks.h cts128.c modes_lcl.h
+gcm128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+gcm128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+gcm128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+gcm128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+gcm128.o: ../../include/openssl/symhacks.h gcm128.c modes_lcl.h
+ofb128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+ofb128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+ofb128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+ofb128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+ofb128.o: ../../include/openssl/symhacks.h modes_lcl.h ofb128.c
+xts128.o: ../../include/openssl/crypto.h ../../include/openssl/e_os2.h
+xts128.o: ../../include/openssl/modes.h ../../include/openssl/opensslconf.h
+xts128.o: ../../include/openssl/opensslv.h ../../include/openssl/ossl_typ.h
+xts128.o: ../../include/openssl/safestack.h ../../include/openssl/stack.h
+xts128.o: ../../include/openssl/symhacks.h modes_lcl.h xts128.c

crypto/modes/asm/ghash-alpha.pl (new file, 451 lines)

@@ -0,0 +1,451 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Even though
# loops are aggressively modulo-scheduled with respect to references to
# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
# scheduling "glitch," because uprofile(1) indicates uniform sample
# distribution, as if all instruction bundles execute in 1.5 cycles.
# Meaning that it could have been even faster, yet 12 cycles is ~60%
# better than gcc-generated code and ~80% better than code generated
# by vendor compiler.

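The ghash-*.pl modules added by this commit all implement the same "4-bit" table-driven multiplication, so a plain-C model may help when reading the heavily scheduled assembly below. This is only a sketch in the spirit of the generic gcm128.c code already in this directory; the function name, the u128 type and the unshifted rem_4bit storage are illustrative choices, not part of the commit:

    #include <stdint.h>

    typedef struct { uint64_t hi, lo; } u128;

    /* Shared reduction table: the same sixteen constants as the
     * rem_4bit tables emitted by these scripts (stored unshifted). */
    static const uint16_t rem_4bit[16] = {
        0x0000, 0x1C20, 0x3840, 0x2460, 0x7080, 0x6CA0, 0x48C0, 0x54E0,
        0xE100, 0xFD20, 0xD940, 0xC560, 0x9180, 0x8DA0, 0xA9C0, 0xB5E0
    };

    /* Xi <- Xi * H in GF(2^128).  Xi is 16 bytes, big-endian.
     * Htable[n] is assumed to hold n*H, precomputed once per key
     * (the 256-byte per-key table the comments refer to). */
    static void gmult_4bit(uint8_t Xi[16], const u128 Htable[16])
    {
        u128 Z = { 0, 0 };
        int i, j, first = 1;

        for (i = 15; i >= 0; i--) {            /* lowest byte first */
            int nib[2] = { Xi[i] & 0xf, Xi[i] >> 4 };

            for (j = 0; j < 2; j++) {          /* low nibble, then high */
                if (!first) {
                    /* multiply Z by x^4: shift right and fold the four
                     * bits that fall off back in via rem_4bit */
                    uint64_t rem = Z.lo & 0xf;
                    Z.lo = (Z.hi << 60) | (Z.lo >> 4);
                    Z.hi = (Z.hi >> 4) ^ ((uint64_t)rem_4bit[rem] << 48);
                }
                first = 0;
                Z.hi ^= Htable[nib[j]].hi;
                Z.lo ^= Htable[nib[j]].lo;
            }
        }
        for (i = 0; i < 8; i++) {              /* store back big-endian */
            Xi[i]     = (uint8_t)(Z.hi >> (56 - 8 * i));
            Xi[8 + i] = (uint8_t)(Z.lo >> (56 - 8 * i));
        }
    }

Each module below is essentially this loop, unrolled and software-pipelined for one particular CPU; the byte-swap epilogues play the role of the big-endian store at the end of the sketch.
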
$cnt="v0";	# $0
$t0="t0";
$t1="t1";
$t2="t2";
$Thi0="t3";	# $4
$Tlo0="t4";
$Thi1="t5";
$Tlo1="t6";
$rem="t7";	# $8
#################
$Xi="a0";	# $16, input argument block
$Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4";	# $20
$nhi="a5";
$Zhi="t8";
$Zlo="t9";
$Xhi="t10";	# $24
$Xlo="t11";
$remp="t12";
$rem_4bit="AT";	# $28

{ my $N;
sub loop() {

$N++;
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}

$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	br	$rem_4bit,.Lpic1
.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
___

&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___

$inhi="s0";
$inlo="s1";

$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	br	$rem_4bit,.Lpic2
.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

&loop();

$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
rem_4bit:
	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

crypto/modes/asm/ghash-armv4.pl (new file, 429 lines)

@@ -0,0 +1,429 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+32 bytes shared table]. There is no
# experimental performance data available yet. The only approximation
# that can be made at this point is based on code size. Inner loop is
# 32 instructions long and on single-issue core should execute in <40
# cycles. Having verified that gcc 3.4 didn't unroll corresponding
# loop, this assembler loop body was found to be ~3x smaller than
# compiler-generated one...
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.

# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in Htable,
# namely with *least* significant dword of 128-bit value at *lower*
# address. This differs completely from C code and has everything to
# do with ldm instruction and order in which dwords are "consumed" by
# algorithm. *Byte* order within these dwords in turn is whatever
# *native* byte order on current platform. See gcm128.c for working
# example...

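To make the dword-order contract above concrete, here is a hypothetical C helper (not part of the commit) showing how a caller would lay out one 128-bit Htable entry for this module:

    #include <stdint.h>

    /* One Htable entry as this module expects it in memory: the *least*
     * significant dword at the lower address, each dword in whatever
     * byte order is native on the platform. */
    typedef struct { uint64_t lo, hi; } htable_entry;   /* lo comes first */

    static void put_entry(htable_entry *dst, uint64_t hi, uint64_t lo)
    {
        dst->lo = lo;   /* lower address:  least-significant dword */
        dst->hi = hi;   /* higher address: most-significant dword  */
    }
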
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$Xi="r0";	# argument block
$Htbl="r1";
$inp="r2";
$len="r3";

$Zll="r4";	# variables
$Zlh="r5";
$Zhl="r6";
$Zhh="r7";
$Tll="r8";
$Tlh="r9";
$Thl="r10";
$Thh="r11";
$nlo="r12";
################# r13 is stack pointer
$nhi="r14";
################# r15 is program counter

$rem_4bit=$inp;	# used in gcm_gmult_4bit
$cnt=$len;

sub Zsmash() {
  my $i=12;
  my @args=@_;
  for ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$_,$_
	str	$_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$_,[$Xi,#$i]
#else
	mov	$Tlh,$_,lsr#8
	strb	$_,[$Xi,#$i+3]
	mov	$Thl,$_,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$_,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}

$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
	sub	$rem_4bit,pc,#8
	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get

.global	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
gcm_ghash_4bit:
	sub	r12,pc,#8
	add	$len,$inp,$len		@ $len to point at the end
	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
	sub	r12,r12,#48		@ &rem_4bit

	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4-r11}		@ ... to stack

	ldrb	$nlo,[$inp,#15]
	ldrb	$nhi,[$Xi,#15]
.Louter:
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	add	$Thh,$Htbl,$nhi
	ldrb	$nlo,[$inp,#14]

	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	ldrb	$nhi,[$Xi,#14]
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	eor	$nlo,$nlo,$nhi
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16

.Linner:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$inp,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrplb	$Tll,[$Xi,$cnt]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tlh,[sp,$nhi]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eorpl	$nlo,$nlo,$Tll
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	$len,[sp,#32]		@ re-load $len/end
	add	$inp,$inp,#16
	mov	$nhi,$Zll
___
	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.global	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4-r11,lr}
	ldrb	$nlo,[$Xi,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	$nhi,$nlo,#0xf0
	and	$nlo,$nlo,#0x0f
	mov	$cnt,#14

	add	$Zhh,$Htbl,$nlo,lsl#4
	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
	ldrb	$nlo,[$Xi,#14]

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	add	$nhi,$nhi,$nhi
	eor	$Zll,$Tll,$Zll,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	and	$nhi,$nlo,#0xf0
	eor	$Zhh,$Zhh,$Tll,lsl#16
	and	$nlo,$nlo,#0x0f

.Loop:
	add	$Thh,$Htbl,$nlo,lsl#4
	and	$nlo,$Zll,#0xf		@ rem
	subs	$cnt,$cnt,#1
	add	$nlo,$nlo,$nlo
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
	eor	$Zhl,$Thl,$Zhl,lsr#4
	ldrplb	$nlo,[$Xi,$cnt]
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4

	add	$Thh,$Htbl,$nhi
	and	$nhi,$Zll,#0xf		@ rem
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	add	$nhi,$nhi,$nhi
	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
	eor	$Zll,$Tll,$Zll,lsr#4
	eor	$Zll,$Zll,$Zlh,lsl#28
	eor	$Zlh,$Tlh,$Zlh,lsr#4
	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
	eor	$Zlh,$Zlh,$Zhl,lsl#28
	eor	$Zhl,$Thl,$Zhl,lsr#4
	eor	$Zhl,$Zhl,$Zhh,lsl#28
	eor	$Zhh,$Thh,$Zhh,lsr#4
	andpl	$nhi,$nlo,#0xf0
	andpl	$nlo,$nlo,#0x0f
	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
___
	&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my $cnt=$Htbl;	# $Htbl is used once in the very beginning

my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));

# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
# in Zo. Or should I say "top bit", because GHASH is specified in
# reverse bit order? Otherwise straightforward 128-bit H by one input
# byte multiplication and modulo-reduction, times 16.

sub Dlo()	{ shift=~m|q([1]?[0-9])|?"d".($1*2):"";		}
sub Dhi()	{ shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";	}
sub Q()		{ shift=~m|d([1-3]?[02468])|?"q".($1/2):"";	}

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.global	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	sub		$Htbl,#16		@ point at H in GCM128_CTX
	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!	@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
	vshr.u64	$mod,#32
	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
	veor		$zero,$zero
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$Qpost,$Qpost
	veor		$R,$R
	mov		$cnt,#16
	veor		$Z,$Z
	mov		$len,#16
	veor		$Zo,$Zo
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
	b		.Linner_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.global	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!		@ load Xi
	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
	vshr.u64	$mod,#32
	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
	veor		$zero,$zero
	nop
#ifdef __ARMEL__
	vrev64.8	$Z,$Z
#endif
.Louter_neon:
	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
	veor		$Qpost,$Qpost
	vld1.64		`&Dlo($IN)`,[$inp]!
	veor		$R,$R
	mov		$cnt,#16
#ifdef __ARMEL__
	vrev64.8	$IN,$IN
#endif
	veor		$Zo,$Zo
	veor		$IN,$Z			@ inp^=Xi
	veor		$Z,$Z
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
.Linner_neon:
	subs		$cnt,$cnt,#1
	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i]
	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i]
	vext.8		$IN,$zero,#1		@ IN>>=8

	veor		$Z,$Qpost		@ modulo-scheduled part
	vshl.i64	`&Dlo("$R")`,#48
	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`

	veor		`&Dhi("$Z")`,`&Dlo("$R")`
	vuzp.8		$Qlo,$Qhi
	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
	vext.8		$Z,$zero,#1		@ Z>>=8

	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1
	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
	veor		$Z,$Qhi
	bne		.Linner_neon

	veor		$Z,$Qpost		@ modulo-scheduled artefact
	vshl.i64	`&Dlo("$R")`,#48
	veor		`&Dhi("$Z")`,`&Dlo("$R")`

	@ finalization, normalize Z:Zo
	vand		$Zo,$mod		@ suffices to mask the bit
	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
	vshl.i64	$Z,#1
	subs		$len,#16
	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
	bne		.Louter_neon

#ifdef __ARMEL__
	vrev64.8	$Z,$Z
#endif
	sub		$Xi,#16
	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
	vst1.64		`&Dlo("$Z")`,[$Xi,:64]

	bx	lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz	"GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush
crypto/modes/asm/ghash-ia64.pl (new executable file, 463 lines)

@@ -0,0 +1,463 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
# GHASH performance was measured to be 6.67 cycles per processed byte
# on Itanium 2, which is >90% better than Microsoft compiler generated
# code. To anchor it to something else: the sha1-ia64.pl module
# processes one byte in 5.7 cycles. On Itanium GHASH should run at
# ~8.5 cycles per byte.

# September 2010
#
# It was originally thought that it makes lesser sense to implement
# "528B" variant on Itanium 2 for following reason. Because number of
# functional units is naturally limited, it appeared impossible to
# implement "528B" loop in 4 cycles, only in 5. This would mean that
# theoretically performance improvement couldn't be more than 20%.
# But occasionally you prove yourself wrong:-) I figured out a way to
# fold a couple of instructions and, having freed yet another
# instruction slot by unrolling the loop... Resulting performance is
# 4.45 cycles per processed byte, 50% better than the "256B" version.
# On original Itanium performance should remain the same as the "256B"
# version, i.e. ~8.5 cycles.

$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");

if ($^O eq "hpux") {
    $ADDP="addp4";
    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
		$big_endian=0 if (/\-DL_ENDIAN/);  }
if (!defined($big_endian))
  { $big_endian=(unpack('L',pack('N',1))==1); }

sub loop() {
my $label=shift;
my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17");	# mask references to inp

# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
# in scalable manner;-) Naturally assuming data in L1 cache...
# Special note about 'dep' instruction, which is used to construct
# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
# bytes boundary and lower 7 bits of its address are guaranteed to
# be zero.
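In C terms, the address that 'dep' computes reads roughly as follows (an illustrative sketch; the helper name is invented):

    #include <stdint.h>

    /* rem_4bit is 128-byte aligned, so bits [6:0] of its address are
     * known to be zero and the 4-bit index can be deposited straight
     * into bits [6:3] of the pointer (entries are 8 bytes wide). */
    static inline const uint64_t *rem_4bit_entry(const uint64_t rem_4bit[16],
                                                 uint64_t zlo)
    {
        return (const uint64_t *)((uintptr_t)rem_4bit |
                                  (uintptr_t)((zlo & 0xf) << 3));
    }
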
$code.=<<___;
$label:
{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
	($p17)	xor	xi[1]=xi[1],in[1]	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p19)	shrp	Zlo=Zhi,Zlo,4	}
{ .mfi;	(p19)	ld8	rem=[rem]
	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
{ .mmi;	($p16)	ld1	in[0]=[inp],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p19)	shr.u	Zhi=Zhi,4	}
{ .mib;	(p19)	xor	Hhi=Hhi,rem
	(p18)	add	Hi[1]=Htbl,Hi[1]	};;

{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
	(p18)	xor	Zhi=Zhi,Hhi	};;
{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
	(p18)	shrp	Zlo=Zhi,Zlo,4	}
{ .mfi;	(p18)	ld8	rem=[rem]
	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
	(p18)	xor	Zlo=Zlo,Hlo
	(p18)	shr.u	Zhi=Zhi,4	}
{ .mib;	(p18)	xor	Hhi=Hhi,rem
	(p17)	add	Hi[0]=Htbl,Hi[0]
	br.ctop.sptk	$label	};;
___
}

$code=<<___;
.explicit
.text

prevfs=r2; prevlc=r3; prevpr=r8;
mask0xf0=r21;
rem=r22; rem_4bitp=r23;
Xi=r24; Htbl=r25;
inp=r26; end=r27;
Hhi=r28; Hlo=r29;
Zhi=r30; Zlo=r31;

.align	128
.skip	16			// aligns loop body
.global	gcm_gmult_4bit#
.proc	gcm_gmult_4bit#
gcm_gmult_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,2,6,0,8
	$ADDP	Xi=15,in0	// &Xi[15]
	mov	rem_4bitp=ip	}
{ .mii;	$ADDP	Htbl=8,in1	// &Htbl[0].lo
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc
	.save	pr,prevpr
	mov	prevpr=pr	};;

	.body
	.rotr	in[3],xi[3],Hi[2]

{ .mib;	ld1	xi[2]=[Xi],-1		// Xi[15]
	mov	mask0xf0=0xf0
	brp.loop.imp	.Loop1,.Lend1-16};;
{ .mmi;	ld1	xi[1]=[Xi],-1		// Xi[14]
	};;
{ .mii;	shladd	Hi[1]=xi[2],4,r0
	mov	pr.rot=0x7<<16
	mov	ar.lc=13	};;
{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
	mov	ar.ec=3
	xor	Zlo=Zlo,Zlo	};;
{ .mii;	add	Hi[1]=Htbl,Hi[1]	// &Htbl[nlo].lo
	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
	xor	Zhi=Zhi,Zhi	};;
___
&loop	(".Loop1",1);
$code.=<<___;
.Lend1:
{ .mib;	xor	Zhi=Zhi,Hhi	};;	// modulo-scheduling artefact
{ .mib;	mux1	Zlo=Zlo,\@rev	};;
{ .mib;	mux1	Zhi=Zhi,\@rev	};;
{ .mmi;	add	Hlo=9,Xi;;	// ;; is here to prevent
	add	Hhi=1,Xi	};;	// pipeline flush on Itanium
{ .mib;	st8	[Hlo]=Zlo
	mov	pr=prevpr,0x1ffff	};;
{ .mib;	st8	[Hhi]=Zhi
	mov	ar.lc=prevlc
	br.ret.sptk.many	b0	};;
.endp	gcm_gmult_4bit#
___

######################################################################
# "528B" (well, "512B" actually) streamed GHASH
#
$Xip="in0";
$Htbl="in1";
$inp="in2";
$len="in3";
$rem_8bit="loc0";
$mask0xff="loc1";
($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");

sub load_htable() {
    for (my $i=0;$i<8;$i++) {
	$code.=<<___;
{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
___
	$code.=shift	if (($i+$#_)==7);
	$code.="\t};;\n"
    }
}

$code.=<<___;
prevsp=r3;

.align	32
.skip	16			// aligns loop body
.global	gcm_ghash_4bit#
.proc	gcm_ghash_4bit#
gcm_ghash_4bit:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,4,2,0,0
	.vframe	prevsp
	mov	prevsp=sp
	mov	$rem_8bit=ip	};;
	.body
{ .mfi;	$ADDP	r8=0+0,$Htbl
	$ADDP	r9=0+8,$Htbl	}
{ .mfi;	$ADDP	r10=128+0,$Htbl
	$ADDP	r11=128+8,$Htbl	};;
___
&load_htable(
	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
	"	$ADDP	$len=$len,$inp",	# &inp[len]
	"	$ADDP	$inp=15,$inp",		# &inp[15]
	"	mov	$mask0xff=0xff",
	"	add	sp=-512,sp",
	"	andcm	sp=sp,$mask0xff",	# align stack frame
	"	add	r14=0,sp",
	"	add	r15=8,sp");
$code.=<<___;
{ .mmi;	$sum	1<<1				// go big-endian
	add	r8=256+0,sp
	add	r9=256+8,sp	}
{ .mmi;	add	r10=256+128+0,sp
	add	r11=256+128+8,sp
	add	$len=-17,$len	};;
___
for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
$code.=<<___;
{ .mmi;	st8	[r8]=$rlo,16		// Htable[$i].lo
	st8	[r9]=$rhi,16		// Htable[$i].hi
	shrp	$rlo=$rhi,$rlo,4	}//;;
{ .mmi;	stf8	[r10]=f`32+2*$i`,16	// Htable[`8+$i`].lo
	stf8	[r11]=f`32+2*$i+1`,16	// Htable[`8+$i`].hi
	shr.u	$rhi=$rhi,4	};;
{ .mmi;	st8	[r14]=$rlo,16		// Htable[$i].lo>>4
	st8	[r15]=$rhi,16	}//;;	// Htable[$i].hi>>4
___
}
$code.=<<___;
{ .mmi;	ld8	r16=[r8],16		// Htable[8].lo
	ld8	r17=[r9],16	};;	// Htable[8].hi
{ .mmi;	ld8	r18=[r8],16		// Htable[9].lo
	ld8	r19=[r9],16	}	// Htable[9].hi
{ .mmi;	rum	1<<5			// clear um.mfh
	shrp	r16=r17,r16,4	};;
___
for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
$code.=<<___;
{ .mmi;	ld8	r`20+2*$i`=[r8],16	// Htable[`10+$i`].lo
	ld8	r`20+2*$i+1`=[r9],16	// Htable[`10+$i`].hi
	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
___
}
$code.=<<___;
{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`16+2*$i`,16	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`16+2*$i+1`,16	// Htable[`8+$i`].hi>>4
	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
{ .mmi;	add	$Htbl=256,sp		// &Htable[0]
	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
{ .mmi;	st8	[r14]=r`18+2*$i`	// Htable[`8+$i`].lo>>4
	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
___

$in="r15";
@xi=("r16","r17");
@rem=("r18","r19");
($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
($Atbl,$Btbl)=("r26","r27");

$code.=<<___;	# (p16)
{ .mmi;	ld1	$in=[$inp],-1		//(p16) *inp--
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	cmp.eq	p0,p6=r0,r0	};;	// clear p6
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17)
{ .mmi;	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mii;	ld1	$in=[$inp],-1		//(p16) *inp--
	dep	$Atbl=$xi[1],$Htbl,4,4	//(p17) &Htable[nlo].lo
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
.align	32
.LOOP:
{ .mmi;
(p6)	st8	[$Xip]=$Zhi,13
	xor	$Zlo=$Zlo,$Zlo
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p16),(p17),(p18)
{ .mmi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mfi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mfi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo	};;	//(p18) Z.lo^=Htable[nlo].lo
{ .mmi;	ld8	$Blo=[$Btbl],8		//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1	}	//(p16) *inp--
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	mov	$Zhi=$Ahi		//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]		//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

for ($i=1;$i<14;$i++) {
# Above and below fragments are derived from this one by removing
# unsuitable (p??) instructions.
$code.=<<___;	# (p16),(p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8		//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	ld1	$in=[$inp],-1		//(p16) *inp--
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi		//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]		//(p18) Hshr4[nhi].hi
	ld1	$xi[0]=[$Xip],-1	//(p16) *Xi--
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
}

$code.=<<___;	# (p17),(p18),(p19)
{ .mmi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	ld8	$rem[0]=[$Btbl],-256	//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		//(p19) Z.lo^=Hshr4[nhi].lo
	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
{ .mmi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
{ .mmi;	shladd	$rem[0]=$rem[0],4,r0	//(p18) Htable[nhi].lo<<4
	xor	$Zlo=$Zlo,$Alo		//(p18) Z.lo^=Htable[nlo].lo
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mmi;	ld8	$Blo=[$Btbl],8		//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mmi;	xor	$rem[0]=$rem[0],$Zlo	//(p18) Z.lo^(Htable[nhi].lo<<4)
	xor	$Zhi=$Zhi,$Ahi		//(p18) Z.hi^=Htable[nlo].hi
	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
{ .mmi;	ld8	$Bhi=[$Btbl]		//(p18) Hshr4[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p18),(p19)
{ .mfi;	ld8	$Alo=[$Atbl],8		//(p18) Htable[nlo].lo,&Htable[nlo].hi
	shr.u	$Zhi=$Zhi,8	}	//(p19) Z.hi>>=8
{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo	};;	//(p19) Z.lo^=Hshr4[nhi].lo
{ .mfi;	ld8	$Ahi=[$Atbl]		//(p18) Htable[nlo].hi
	xor	$Zlo=$Zlo,$Alo	}	//(p18) Z.lo^=Htable[nlo].lo
{ .mfi;	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
	xor	$Zhi=$Zhi,$Bhi	};;	//(p19) Z.hi^=Hshr4[nhi].hi
{ .mfi;	ld8	$Blo=[$Btbl],8		//(p18) Htable[nhi].lo,&Htable[nhi].hi
	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
{ .mfi;	shladd	$rem[0]=$Zlo,4,r0	//(p18) Z.lo<<4
	xor	$Zhi=$Zhi,$Ahi	};;	//(p18) Z.hi^=Htable[nlo].hi
{ .mfi;	ld8	$Bhi=[$Btbl]		//(p18) Htable[nhi].hi
	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
___
push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers

$code.=<<___;	# (p19)
{ .mmi;	cmp.ltu	p6,p0=$inp,$len
	add	$inp=32,$inp
	shr.u	$Zhi=$Zhi,4	}	//(p19) Z.hi>>=4
{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
	xor	$Zlo=$Zlo,$Blo		//(p19) Z.lo^=Hshr4[nhi].lo
	add	$Xip=9,$Xip	};;	// &Xi.lo
{ .mmi;	ld2	$rem[1]=[$rem[1]]	//(p19) rem_8bit[rem]
(p6)	ld1	$in=[$inp],-1		//[p16] *inp--
(p6)	extr.u	$xi[1]=$Zlo,8,8	}	//[p17] Xi[14]
{ .mmi;	xor	$Zhi=$Zhi,$Bhi		//(p19) Z.hi^=Hshr4[nhi].hi
(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
{ .mmi;	st8	[$Xip]=$Zlo,-8
(p6)	xor	$xi[0]=$xi[0],$in	//[p17] xi=$xi[i]^inp[i]
	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
{ .mmi;
(p6)	ld1	$in=[$inp],-1		//[p16] *inp--
	xor	$Zhi=$Zhi,$rem[1]	//(p19) Z.hi^=rem_8bit[rem]<<48
(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
{ .mib;
(p6)	and	$xi[0]=-16,$xi[0]	//[p17] nhi=xi&0xf0
(p6)	br.cond.dptk.many	.LOOP	};;

{ .mib;	st8	[$Xip]=$Zhi	};;
{ .mib;	$rum	1<<1			// return to little-endian
	.restore	sp
	mov	sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	gcm_ghash_4bit#
___
$code.=<<___;
.align	128
.type	rem_4bit#,\@object
rem_4bit:
	data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
	data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
	data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
	data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
.size	rem_4bit#,128
.type	rem_8bit#,\@object
rem_8bit:
	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
.size	rem_8bit#,512
stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm	if ($big_endian);
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
close STDOUT;
crypto/modes/asm/ghash-parisc.pl (new file, 730 lines)

@@ -0,0 +1,730 @@
#!/usr/bin/env perl
|
||||
#
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
#
|
||||
# April 2010
|
||||
#
|
||||
# The module implements "4-bit" GCM GHASH function and underlying
|
||||
# single multiplication operation in GF(2^128). "4-bit" means that it
|
||||
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
|
||||
# it processes one byte in 19.6 cycles, which is more than twice as
|
||||
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
|
||||
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
|
||||
# processed byte. This is ~2.2x faster than 64-bit code generated by
|
||||
# vendor compiler (which used to be very hard to beat:-).
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account.
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
open STDOUT,">$output";
|
||||
|
||||
if ($flavour =~ /64/) {
|
||||
$LEVEL ="2.0W";
|
||||
$SIZE_T =8;
|
||||
$FRAME_MARKER =80;
|
||||
$SAVED_RP =16;
|
||||
$PUSH ="std";
|
||||
$PUSHMA ="std,ma";
|
||||
$POP ="ldd";
|
||||
$POPMB ="ldd,mb";
|
||||
$NREGS =6;
|
||||
} else {
|
||||
$LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
|
||||
$SIZE_T =4;
|
||||
$FRAME_MARKER =48;
|
||||
$SAVED_RP =20;
|
||||
$PUSH ="stw";
|
||||
$PUSHMA ="stwm";
|
||||
$POP ="ldw";
|
||||
$POPMB ="ldwm";
|
||||
$NREGS =11;
|
||||
}
|
||||
|
||||
$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
|
||||
################# volatile registers
|
||||
$Xi="%r26"; # argument block
|
||||
$Htbl="%r25";
|
||||
$inp="%r24";
|
||||
$len="%r23";
|
||||
$Hhh=$Htbl; # variables
|
||||
$Hll="%r22";
|
||||
$Zhh="%r21";
|
||||
$Zll="%r20";
|
||||
$cnt="%r19";
|
||||
$rem_4bit="%r28";
|
||||
$rem="%r29";
|
||||
$mask0xf0="%r31";
|
||||
|
||||
################# preserved registers
|
||||
$Thh="%r1";
|
||||
$Tll="%r2";
|
||||
$nlo="%r3";
|
||||
$nhi="%r4";
|
||||
$byte="%r5";
|
||||
if ($SIZE_T==4) {
|
||||
$Zhl="%r6";
|
||||
$Zlh="%r7";
|
||||
$Hhl="%r8";
|
||||
$Hlh="%r9";
|
||||
$Thl="%r10";
|
||||
$Tlh="%r11";
|
||||
}
|
||||
$rem2="%r6"; # used in PA-RISC 2.0 code
|
||||
|
||||
$code.=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
|
||||
.ALIGN 64
|
||||
gcm_gmult_4bit
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
|
||||
___
|
||||
$code.=<<___;
|
||||
blr %r0,$rem_4bit
|
||||
ldi 3,$rem
|
||||
L\$pic_gmult
|
||||
andcm $rem_4bit,$rem,$rem_4bit
|
||||
addl $inp,$len,$len
|
||||
ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
|
||||
ldi 0xf0,$mask0xf0
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
ldi 31,$rem
|
||||
mtctl $rem,%cr11
|
||||
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
|
||||
b L\$parisc1_gmult
|
||||
nop
|
||||
___
|
||||
|
||||
$code.=<<___;
|
||||
ldb 15($Xi),$nlo
|
||||
ldo 8($Htbl),$Hll
|
||||
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
ldd $nlo($Hll),$Zll
|
||||
ldd $nlo($Hhh),$Zhh
|
||||
|
||||
depd,z $Zll,60,4,$rem
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldb 14($Xi),$nlo
|
||||
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
b L\$oop_gmult_pa2
|
||||
ldi 13,$cnt
|
||||
|
||||
.ALIGN 8
|
||||
L\$oop_gmult_pa2
|
||||
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
|
||||
depd,z $Zll,60,4,$rem
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nlo($Hll),$Tll
|
||||
ldd $nlo($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem
|
||||
ldbx $cnt($Xi),$nlo
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
|
||||
and $mask0xf0,$nlo,$nhi
|
||||
depd,z $nlo,59,4,$nlo
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
addib,uv -1,$cnt,L\$oop_gmult_pa2
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nlo($Hll),$Tll
|
||||
ldd $nlo($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
depd,z $Zll,60,4,$rem
|
||||
|
||||
shrpd $Zhh,$Zll,4,$Zll
|
||||
extrd,u $Zhh,59,60,$Zhh
|
||||
ldd $nhi($Hll),$Tll
|
||||
ldd $nhi($Hhh),$Thh
|
||||
|
||||
xor $Tll,$Zll,$Zll
|
||||
xor $Thh,$Zhh,$Zhh
|
||||
ldd $rem($rem_4bit),$rem
|
||||
|
||||
xor $rem,$Zhh,$Zhh
|
||||
std $Zll,8($Xi)
|
||||
std $Zhh,0($Xi)
|
||||
___

$code.=<<___ if ($SIZE_T==4);
b L\$done_gmult
nop

L\$parisc1_gmult
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl

and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo

ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo

xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_gmult_pa1
ldi 13,$cnt

.ALIGN 8
L\$oop_gmult_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_gmult_pa1
xor $Thl,$Zhl,$Zhl

zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
stw $Zhh,0($Xi)
___
$code.=<<___;
L\$done_gmult
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND

.EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
gcm_ghash_4bit
.PROC
.CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
___
$code.=<<___;
blr %r0,$rem_4bit
ldi 3,$rem
L\$pic_ghash
andcm $rem_4bit,$rem,$rem_4bit
addl $inp,$len,$len
ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
ldi 0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
ldi 31,$rem
mtctl $rem,%cr11
extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
b L\$parisc1_ghash
nop
___

$code.=<<___;
ldb 15($Xi),$nlo
ldo 8($Htbl),$Hll

L\$outer_ghash_pa2
ldb 15($inp),$nhi
xor $nhi,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo

ldd $nlo($Hll),$Zll
ldd $nlo($Hhh),$Zhh

depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldb 14($Xi),$nlo
ldb 14($inp),$byte

ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo

xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem
b L\$oop_ghash_pa2
ldi 13,$cnt

.ALIGN 8
L\$oop_ghash_pa2
xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
depd,z $Zll,60,4,$rem2

shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh

xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldbx $cnt($Xi),$nlo
ldbx $cnt($inp),$byte

depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2

xor $rem2,$Zhh,$Zhh
xor $byte,$nlo,$nlo
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh

and $mask0xf0,$nlo,$nhi
depd,z $nlo,59,4,$nlo

extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll

ldd $rem($rem_4bit),$rem
addib,uv -1,$cnt,L\$oop_ghash_pa2
xor $Thh,$Zhh,$Zhh

xor $rem,$Zhh,$Zhh
depd,z $Zll,60,4,$rem2

shrpd $Zhh,$Zll,4,$Zll
extrd,u $Zhh,59,60,$Zhh
ldd $nlo($Hll),$Tll
ldd $nlo($Hhh),$Thh

xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh

depd,z $Zll,60,4,$rem
shrpd $Zhh,$Zll,4,$Zll
ldd $rem2($rem_4bit),$rem2

xor $rem2,$Zhh,$Zhh
ldd $nhi($Hll),$Tll
ldd $nhi($Hhh),$Thh

extrd,u $Zhh,59,60,$Zhh
xor $Tll,$Zll,$Zll
xor $Thh,$Zhh,$Zhh
ldd $rem($rem_4bit),$rem

xor $rem,$Zhh,$Zhh
std $Zll,8($Xi)
ldo 16($inp),$inp
std $Zhh,0($Xi)
cmpb,*<> $inp,$len,L\$outer_ghash_pa2
copy $Zll,$nlo
___

$code.=<<___ if ($SIZE_T==4);
b L\$done_ghash
nop

L\$parisc1_ghash
ldb 15($Xi),$nlo
ldo 12($Htbl),$Hll
ldo 8($Htbl),$Hlh
ldo 4($Htbl),$Hhl

L\$outer_ghash_pa1
ldb 15($inp),$byte
xor $byte,$nlo,$nlo
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo

ldwx $nlo($Hll),$Zll
ldwx $nlo($Hlh),$Zlh
ldwx $nlo($Hhl),$Zhl
ldwx $nlo($Hhh),$Zhh
zdep $Zll,28,4,$rem
ldb 14($Xi),$nlo
ldb 14($inp),$byte
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhl,$Zlh,4,$Zlh
ldwx $nhi($Hlh),$Tlh
shrpw $Zhh,$Zhl,4,$Zhl
ldwx $nhi($Hhl),$Thl
extru $Zhh,27,28,$Zhh
ldwx $nhi($Hhh),$Thh
xor $byte,$nlo,$nlo
xor $rem,$Zhh,$Zhh
and $mask0xf0,$nlo,$nhi
zdep $nlo,27,4,$nlo

xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $Thl,$Zhl,$Zhl
b L\$oop_ghash_pa1
ldi 13,$cnt

.ALIGN 8
L\$oop_ghash_pa1
zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
ldbx $cnt($Xi),$nlo
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
ldbx $cnt($inp),$byte
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $rem,$Zhh,$Zhh
zdep $Zll,28,4,$rem
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
shrpw $Zlh,$Zll,4,$Zll
ldwx $rem($rem_4bit),$rem
shrpw $Zhl,$Zlh,4,$Zlh
xor $byte,$nlo,$nlo
shrpw $Zhh,$Zhl,4,$Zhl
and $mask0xf0,$nlo,$nhi
extru $Zhh,27,28,$Zhh
zdep $nlo,27,4,$nlo
xor $Tll,$Zll,$Zll
ldwx $nlo($Hll),$Tll
xor $Tlh,$Zlh,$Zlh
ldwx $nlo($Hlh),$Tlh
xor $rem,$Zhh,$Zhh
addib,uv -1,$cnt,L\$oop_ghash_pa1
xor $Thl,$Zhl,$Zhl

zdep $Zll,28,4,$rem
ldwx $nlo($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
ldwx $nlo($Hhh),$Thh
shrpw $Zhl,$Zlh,4,$Zlh
xor $Tll,$Zll,$Zll
ldwx $nhi($Hll),$Tll
shrpw $Zhh,$Zhl,4,$Zhl
xor $Tlh,$Zlh,$Zlh
ldwx $nhi($Hlh),$Tlh
extru $Zhh,27,28,$Zhh
xor $rem,$Zhh,$Zhh
xor $Thl,$Zhl,$Zhl
ldwx $nhi($Hhl),$Thl
xor $Thh,$Zhh,$Zhh
ldwx $nhi($Hhh),$Thh
zdep $Zll,28,4,$rem
ldwx $rem($rem_4bit),$rem
shrpw $Zlh,$Zll,4,$Zll
shrpw $Zhl,$Zlh,4,$Zlh
shrpw $Zhh,$Zhl,4,$Zhl
extru $Zhh,27,28,$Zhh
xor $Tll,$Zll,$Zll
xor $Tlh,$Zlh,$Zlh
xor $rem,$Zhh,$Zhh
stw $Zll,12($Xi)
xor $Thl,$Zhl,$Zhl
stw $Zlh,8($Xi)
xor $Thh,$Zhh,$Zhh
stw $Zhl,4($Xi)
ldo 16($inp),$inp
stw $Zhh,0($Xi)
comb,<> $inp,$len,L\$outer_ghash_pa1
copy $Zll,$nlo
___
$code.=<<___;
L\$done_ghash
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
___
$code.=<<___;
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND

.ALIGN 64
L\$rem_4bit
.WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
.ALIGN 64
___
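Editor's note: every rem_4bit table in these modules serves the same shift-and-reduce step. Roughly, in portable C the step looks like the sketch below; it mirrors the generic gcm128.c scheme rather than any code in this diff, and the function name is illustrative.

```c
#include <stdint.h>

/* rem_4bit[n] is the nibble n multiplied by x^128 modulo the GHASH
 * polynomial x^128 + x^7 + x^2 + x + 1, pre-shifted into the top 16
 * bits so a single XOR folds it back into the accumulator. */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* One "shift Z right by 4 bits in GF(2^128)" step, as the loops above
 * perform with depd,z/shrpd/extrd,u (or zdep/shrpw/extru). */
static void shift_reduce_4bit(uint64_t *Zhi, uint64_t *Zlo)
{
    unsigned int rem = (unsigned int)(*Zlo & 0xf); /* nibble shifted out */

    *Zlo = (*Zhi << 60) | (*Zlo >> 4);
    *Zhi >>= 4;
    *Zhi ^= rem_4bit[rem];                         /* fold it back in */
}
```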

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this if the GNU assembler understood the
# .ALLOW 2.0 directive...

my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";

if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};

my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";

if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
{ my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};

my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";

# I only have the ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};

my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";

if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
{ sprintf "\t.WORD\t0x%08x\t; %s",
(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
}
else { "\t".$orig; }
};

my $depd = sub {
my ($mod,$args) = @_;
my $orig = "depd$mod\t$args";

# I only have the ",z" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
{ my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
my $cpos=63-$2;
my $len=32-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};

sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");

ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}

foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
if ($SIZE_T==4) {
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
s/cmpb,\*/comb,/;
s/,\*/,/;
}
print $_,"\n";
}

close STDOUT;
262
crypto/modes/asm/ghash-s390x.pl
Normal file
@ -0,0 +1,262 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2010.
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes shared table]. Performance
# was measured to be ~18 cycles per processed byte on z10, which is
# almost 40% better than gcc-generated code. It should be noted that
# 18 cycles is a worse result than expected: the loop is scheduled for 12
# cycles and the result should be close to that. In the absence of
# instruction-level profiling data it's impossible to tell why...

# November 2010.
#
# Adapt for -m31 build. If the kernel supports what's called the
# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
# legacy application context. The feature is not specific to any
# particular processor, as long as it's a "z-CPU". The latter implies
# that the code remains z/Architecture specific. On z990 it was measured
# to perform 2.8x better than 32-bit code generated by gcc 4.3.

# March 2011.
#
# Support for hardware KIMD-GHASH is verified to produce the correct
# result and therefore is engaged. On z196 it was measured to process
# an 8KB buffer ~7x faster than the software implementation. It's not as
# impressive for smaller buffer sizes, and for the smallest 16-byte
# buffer it's actually almost 2 times slower. Which is the reason why
# KIMD-GHASH is not used in gcm_gmult_4bit.
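Editor's note: the "256-byte per-key table" referred to above is the same 4-bit layout across all of these modules — sixteen 128-bit entries holding H multiplied by every 4-bit value. A portable C sketch of how such a table is built, following the generic gcm128.c approach (the u128 type and names here are illustrative):

```c
#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

static void reduce1bit(u128 *V)          /* V = V * x mod P(x), reflected */
{
    uint64_t T = 0xe100000000000000ULL & (0 - (V->lo & 1));
    V->lo = (V->hi << 63) | (V->lo >> 1);
    V->hi = (V->hi >> 1) ^ T;
}

/* Htable[i] = H * i in GF(2^128) for i = 0..15 (256 bytes total). */
static void gcm_init_4bit(u128 Htable[16], const uint64_t H[2])
{
    u128 V = { H[0], H[1] };
    int  i, j;

    Htable[0].hi = Htable[0].lo = 0;
    Htable[8] = V;                       /* bit-reflected: 8 is H itself */
    reduce1bit(&V); Htable[4] = V;
    reduce1bit(&V); Htable[2] = V;
    reduce1bit(&V); Htable[1] = V;
    for (i = 2; i < 16; i <<= 1)         /* the rest follow by linearity */
        for (j = 1; j < i; ++j) {
            Htable[i + j].hi = Htable[i].hi ^ Htable[j].hi;
            Htable[i + j].lo = Htable[i].lo ^ Htable[j].lo;
        }
}
```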

$flavour = shift;

if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$softonly=0;

$Zhi="%r0";
$Zlo="%r1";

$Xi="%r2"; # argument block
$Htbl="%r3";
$inp="%r4";
$len="%r5";

$rem0="%r6"; # variables
$rem1="%r7";
$nlo="%r8";
$nhi="%r9";
$xi="%r10";
$cnt="%r11";
$tmp="%r12";
$x78="%r13";
$rem_4bit="%r14";

$sp="%r15";

$code.=<<___;
.text

.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
___
$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security-assist
jz .Lsoft_gmult
lghi %r0,0
la %r1,16($sp)
.long 0xb93e0004 # kimd %r0,%r4
lg %r1,24($sp)
tmhh %r1,0x4000 # check for function 65
jz .Lsoft_gmult
stg %r0,16($sp) # arrange 16 bytes of zero input
stg %r0,24($sp)
lghi %r0,65 # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
la $inp,16($sp)
lghi $len,16
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 32
.Lsoft_gmult:
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)

aghi $Xi,-1
lghi $len,1
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit

lg $Zlo,8+1($Xi) # Xi
j .Lgmult_shortcut
.type gcm_gmult_4bit,\@function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)

.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
___
$code.=<<___ if(!$softonly);
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security-assist
jz .Lsoft_ghash
lghi %r0,0
la %r1,16($sp)
.long 0xb93e0004 # kimd %r0,%r4
lg %r1,24($sp)
tmhh %r1,0x4000 # check for function 65
jz .Lsoft_ghash
lghi %r0,65 # function 65
la %r1,0($Xi) # H lies right after Xi in gcm128_context
.long 0xb93e0004 # kimd %r0,$inp
brc 1,.-4 # pay attention to "partial completion"
br %r14
.align 32
.Lsoft_ghash:
___
$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
stm${g} %r6,%r14,6*$SIZE_T($sp)

aghi $Xi,-1
srlg $len,$len,4
lghi $x78,`0xf<<3`
larl $rem_4bit,rem_4bit

lg $Zlo,8+1($Xi) # Xi
lg $Zhi,0+1($Xi)
lghi $tmp,0
.Louter:
xg $Zhi,0($inp) # Xi ^= inp
xg $Zlo,8($inp)
xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)

.Lgmult_shortcut:
lghi $tmp,0xf0
sllg $nlo,$Zlo,4
srlg $xi,$Zlo,8 # extract second byte
ngr $nlo,$tmp
lgr $nhi,$Zlo
lghi $cnt,14
ngr $nhi,$tmp

lg $Zlo,8($nlo,$Htbl)
lg $Zhi,0($nlo,$Htbl)

sllg $nlo,$xi,4
sllg $rem0,$Zlo,3
ngr $nlo,$tmp
ngr $rem0,$x78
ngr $xi,$tmp

sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
sllg $tmp,$Zhi,60
xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
xg $Zhi,0($rem0,$rem_4bit)
nill $nlo,0xf0
sllg $rem0,$Zlo,3
xgr $Zlo,$tmp
ngr $rem0,$x78
nill $xi,0xf0

sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
lgr $nhi,$xi
xg $Zhi,0($rem1,$rem_4bit)
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
brct $cnt,.Lghash_inner

sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)
xg $Zhi,0($nlo,$Htbl)
sllg $xi,$Zlo,3
xg $Zhi,0($rem0,$rem_4bit)
xgr $Zlo,$tmp
ngr $xi,$x78

sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nhi,$Htbl)
xg $Zhi,0($nhi,$Htbl)
xgr $Zlo,$tmp
xg $Zhi,0($rem1,$rem_4bit)

lg $tmp,0($xi,$rem_4bit)
la $inp,16($inp)
sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
brctg $len,.Louter

xgr $Zhi,$tmp
stg $Zlo,8+1($Xi)
stg $Zhi,0+1($Xi)
lm${g} %r6,%r14,6*$SIZE_T($sp)
br %r14
.type gcm_ghash_4bit,\@function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)

.align 64
rem_4bit:
.long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
.long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
.long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
.long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
.type rem_4bit,\@object
.size rem_4bit,(.-rem_4bit)
.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
330
crypto/modes/asm/ghash-sparcv9.pl
Normal file
@ -0,0 +1,330 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes shared table]. Performance
# results are for the streamed GHASH subroutine on UltraSPARC pre-Tx CPU
# and are expressed in cycles per processed byte, less is better:
#
#		gcc 3.3.x	cc 5.2		this assembler
#
# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
#
# Here is data collected on an UltraSPARC T1 system running Linux:
#
#		gcc 4.4.1	this assembler
#
# 32-bit build	566		50	(+1000%)
# 64-bit build	56		50	(+12%)
#
# I don't quite understand why the difference between 32-bit and 64-bit
# compiler-generated code is so big. Compilers *were* instructed to
# generate code for UltraSPARC and should have used 64-bit registers
# for the Z vector (see C code) even in the 32-bit build... Oh well, it
# only means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled with respect to
# references to input data and Z.hi updates to achieve the 12-cycle
# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.

$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }

$output=shift;
open STDOUT,">$output";

$Zhi="%o0"; # 64-bit values
$Zlo="%o1";
$Thi="%o2";
$Tlo="%o3";
$rem="%o4";
$tmp="%o5";

$nhi="%l0"; # small values and pointers
$nlo="%l1";
$xi0="%l2";
$xi1="%l3";
$rem_4bit="%l4";
$remi="%l5";
$Htblo="%l6";
$cnt="%l7";

$Xi="%i0"; # input argument block
$Htbl="%i1";
$inp="%i2";
$len="%i3";

$code.=<<___;
.section ".text",#alloc,#execinstr

.align 64
rem_4bit:
.long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
.long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
.long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
.long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
.type rem_4bit,#object
.size rem_4bit,(.-rem_4bit)

.globl gcm_ghash_4bit
.align 32
gcm_ghash_4bit:
save %sp,-$frame,%sp
ldub [$inp+15],$nlo
ldub [$Xi+15],$xi0
ldub [$Xi+14],$xi1
add $len,$inp,$len
add $Htbl,8,$Htblo

1: call .+8
add %o7,rem_4bit-1b,$rem_4bit

.Louter:
xor $xi0,$nlo,$nlo
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi

ldub [$inp+14],$nlo

ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo

xor $xi1,$nlo,$nlo
and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lghash_inner
sll $nlo,4,$nlo
.align 32
.Lghash_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
ldub [$Xi+$cnt],$xi1
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi

ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $xi1,$nlo,$nlo
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lghash_inner
and $Zlo,0xf,$remi

ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi

add $inp,16,$inp
cmp $inp,$len
be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
and $Zlo,0xf,$remi

ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$inp+15],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]
srl $Zlo,8,$xi1
and $Zlo,0xff,$xi0
ba .Louter
and $xi1,0xff,$xi1
.align 32
.Ldone:
ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]

ret
restore
.type gcm_ghash_4bit,#function
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___

undef $inp;
undef $len;

$code.=<<___;
.globl gcm_gmult_4bit
.align 32
gcm_gmult_4bit:
save %sp,-$frame,%sp
ldub [$Xi+15],$nlo
add $Htbl,8,$Htblo

1: call .+8
add %o7,rem_4bit-1b,$rem_4bit

and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
sll $nlo,4,$nlo
ldx [$Htblo+$nlo],$Zlo
ldx [$Htbl+$nlo],$Zhi

ldub [$Xi+14],$nlo

ldx [$Htblo+$nhi],$Tlo
and $Zlo,0xf,$remi
ldx [$Htbl+$nhi],$Thi
sll $remi,3,$remi
ldx [$rem_4bit+$remi],$rem
srlx $Zlo,4,$Zlo
mov 13,$cnt
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo

and $Zlo,0xf,$remi
and $nlo,0xf0,$nhi
and $nlo,0x0f,$nlo
ba .Lgmult_inner
sll $nlo,4,$nlo
.align 32
.Lgmult_inner:
ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
ldub [$Xi+$cnt],$nlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi

ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
srlx $Zhi,4,$Zhi
and $nlo,0xf0,$nhi
addcc $cnt,-1,$cnt
xor $Zlo,$tmp,$Zlo
and $nlo,0x0f,$nlo
xor $Tlo,$Zlo,$Zlo
sll $nlo,4,$nlo
blu .Lgmult_inner
and $Zlo,0xf,$remi

ldx [$Htblo+$nlo],$Tlo
sll $remi,3,$remi
xor $Thi,$Zhi,$Zhi
ldx [$Htbl+$nlo],$Thi
srlx $Zlo,4,$Zlo
xor $rem,$Zhi,$Zhi
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
and $Zlo,0xf,$remi

ldx [$Htblo+$nhi],$Tlo
sll $remi,3,$remi
xor $rem,$Zhi,$Zhi
ldx [$Htbl+$nhi],$Thi
srlx $Zlo,4,$Zlo
ldx [$rem_4bit+$remi],$rem
sllx $Zhi,60,$tmp
xor $Tlo,$Zlo,$Zlo
srlx $Zhi,4,$Zhi
xor $Zlo,$tmp,$Zlo
xor $Thi,$Zhi,$Zhi
stx $Zlo,[$Xi+8]
xor $rem,$Zhi,$Zhi
stx $Zhi,[$Xi]

ret
restore
.type gcm_gmult_4bit,#function
.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;
1342
crypto/modes/asm/ghash-x86.pl
Normal file
File diff suppressed because it is too large
805
crypto/modes/asm/ghash-x86_64.pl
Normal file
@ -0,0 +1,805 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
#
# (*)	comparison is not completely fair, because C results are
#	for the vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
# providing access to a Westmere-based system on behalf of Intel
# Open Source Technology Centre.
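Editor's note: for readers without the assembler in front of them, the PCLMULQDQ core mentioned above has the following shape in C intrinsics. This is a hedged sketch of the Karatsuba carry-less multiply only (the function name is invented here; the modular reduction that follows it in the real module is a separate step):

```c
#include <emmintrin.h>   /* SSE2 */
#include <wmmintrin.h>   /* PCLMULQDQ; compile with -mpclmul */

/* 128x128 -> 256-bit carry-less multiply with three PCLMULQDQs,
 * the same pattern clmul64x64_T2 below emits. */
static void clmul128(__m128i H, __m128i X, __m128i *lo, __m128i *hi)
{
    __m128i t0  = _mm_clmulepi64_si128(X, H, 0x00);      /* Xlo * Hlo */
    __m128i t1  = _mm_clmulepi64_si128(X, H, 0x11);      /* Xhi * Hhi */
    /* pshufd 0x4e swaps the 64-bit halves; XOR gives Xhi^Xlo (Hhi^Hlo) */
    __m128i xs  = _mm_xor_si128(X, _mm_shuffle_epi32(X, 0x4e));
    __m128i hs  = _mm_xor_si128(H, _mm_shuffle_epi32(H, 0x4e));
    __m128i mid = _mm_clmulepi64_si128(xs, hs, 0x00);

    mid = _mm_xor_si128(mid, _mm_xor_si128(t0, t1));     /* middle 128 bits */
    *lo = _mm_xor_si128(t0, _mm_slli_si128(mid, 8));
    *hi = _mm_xor_si128(t1, _mm_srli_si128(mid, 8));
}
```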

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
$r =~ s/%[er]([sd]i)/%\1l/ or
$r =~ s/%[er](bp)/%\1l/ or
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }

sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
my $arg = pop;
$arg = "\$$arg" if ($arg*1 eq $arg);
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

{ my $N;
sub loop() {
my $inp = shift;

$N++;
$code.=<<___;
xor $nlo,$nlo
xor $nhi,$nhi
mov `&LB("$Zlo")`,`&LB("$nlo")`
mov `&LB("$Zlo")`,`&LB("$nhi")`
shl \$4,`&LB("$nlo")`
mov \$14,$cnt
mov 8($Htbl,$nlo),$Zlo
mov ($Htbl,$nlo),$Zhi
and \$0xf0,`&LB("$nhi")`
mov $Zlo,$rem
jmp .Loop$N

.align 16
.Loop$N:
shr \$4,$Zlo
and \$0xf,$rem
mov $Zhi,$tmp
mov ($inp,$cnt),`&LB("$nlo")`
shr \$4,$Zhi
xor 8($Htbl,$nhi),$Zlo
shl \$60,$tmp
xor ($Htbl,$nhi),$Zhi
mov `&LB("$nlo")`,`&LB("$nhi")`
xor ($rem_4bit,$rem,8),$Zhi
mov $Zlo,$rem
shl \$4,`&LB("$nlo")`
xor $tmp,$Zlo
dec $cnt
js .Lbreak$N

shr \$4,$Zlo
and \$0xf,$rem
mov $Zhi,$tmp
shr \$4,$Zhi
xor 8($Htbl,$nlo),$Zlo
shl \$60,$tmp
xor ($Htbl,$nlo),$Zhi
and \$0xf0,`&LB("$nhi")`
xor ($rem_4bit,$rem,8),$Zhi
mov $Zlo,$rem
xor $tmp,$Zlo
jmp .Loop$N

.align 16
.Lbreak$N:
shr \$4,$Zlo
and \$0xf,$rem
mov $Zhi,$tmp
shr \$4,$Zhi
xor 8($Htbl,$nlo),$Zlo
shl \$60,$tmp
xor ($Htbl,$nlo),$Zhi
and \$0xf0,`&LB("$nhi")`
xor ($rem_4bit,$rem,8),$Zhi
mov $Zlo,$rem
xor $tmp,$Zlo

shr \$4,$Zlo
and \$0xf,$rem
mov $Zhi,$tmp
shr \$4,$Zhi
xor 8($Htbl,$nhi),$Zlo
shl \$60,$tmp
xor ($Htbl,$nhi),$Zhi
xor $tmp,$Zlo
xor ($rem_4bit,$rem,8),$Zhi

bswap $Zlo
bswap $Zhi
___
}}

$code=<<___;
.text

.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
.align 16
gcm_gmult_4bit:
push %rbx
push %rbp # %rbp and %r12 are pushed exclusively in
push %r12 # order to reuse Win64 exception handler...
.Lgmult_prologue:

movzb 15($Xi),$Zlo
lea .Lrem_4bit(%rip),$rem_4bit
___
&loop ($Xi);
$code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)

mov 16(%rsp),%rbx
lea 24(%rsp),%rsp
.Lgmult_epilogue:
ret
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___

# per-function register layout
$inp="%rdx";
$len="%rcx";
$rem_8bit=$rem_4bit;

$code.=<<___;
.globl gcm_ghash_4bit
.type gcm_ghash_4bit,\@function,4
.align 16
gcm_ghash_4bit:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
sub \$280,%rsp
.Lghash_prologue:
mov $inp,%r14 # reassign couple of args
mov $len,%r15
___
{ my $inp="%r14";
my $dat="%edx";
my $len="%r15";
my @nhi=("%ebx","%ecx");
my @rem=("%r12","%r13");
my $Hshr4="%rbp";

&sub ($Htbl,-128); # size optimization
&lea ($Hshr4,"16+128(%rsp)");
{ my @lo =($nlo,$nhi);
my @hi =($Zlo,$Zhi);

&xor ($dat,$dat);
for ($i=0,$j=-2;$i<18;$i++,$j++) {
&mov ("$j(%rsp)",&LB($dat)) if ($i>1);
&or ($lo[0],$tmp) if ($i>1);
&mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
&shr ($lo[1],4) if ($i>0 && $i<17);
&mov ($tmp,$hi[1]) if ($i>0 && $i<17);
&shr ($hi[1],4) if ($i>0 && $i<17);
&mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
&mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
&shl (&LB($dat),4) if ($i>0 && $i<17);
&mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
&mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
&shl ($tmp,60) if ($i>0 && $i<17);

push (@lo,shift(@lo));
push (@hi,shift(@hi));
}
}
&add ($Htbl,-128);
&mov ($Zlo,"8($Xi)");
&mov ($Zhi,"0($Xi)");
&add ($len,$inp); # pointer to the end of data
&lea ($rem_8bit,".Lrem_8bit(%rip)");
&jmp (".Louter_loop");

$code.=".align 16\n.Louter_loop:\n";
&xor ($Zhi,"($inp)");
&mov ("%rdx","8($inp)");
&lea ($inp,"16($inp)");
&xor ("%rdx",$Zlo);
&mov ("($Xi)",$Zhi);
&mov ("8($Xi)","%rdx");
&shr ("%rdx",32);

&xor ($nlo,$nlo);
&rol ($dat,8);
&mov (&LB($nlo),&LB($dat));
&movz ($nhi[0],&LB($dat));
&shl (&LB($nlo),4);
&shr ($nhi[0],4);

for ($j=11,$i=0;$i<15;$i++) {
&rol ($dat,8);
&xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
&xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
&mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
&mov ($Zhi,"($Htbl,$nlo)") if ($i==0);

&mov (&LB($nlo),&LB($dat));
&xor ($Zlo,$tmp) if ($i>0);
&movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);

&movz ($nhi[1],&LB($dat));
&shl (&LB($nlo),4);
&movzb ($rem[0],"(%rsp,$nhi[0])");

&shr ($nhi[1],4) if ($i<14);
&and ($nhi[1],0xf0) if ($i==14);
&shl ($rem[1],48) if ($i>0);
&xor ($rem[0],$Zlo);

&mov ($tmp,$Zhi);
&xor ($Zhi,$rem[1]) if ($i>0);
&shr ($Zlo,8);

&movz ($rem[0],&LB($rem[0]));
&mov ($dat,"$j($Xi)") if (--$j%4==0);
&shr ($Zhi,8);

&xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
&shl ($tmp,56);
&xor ($Zhi,"($Hshr4,$nhi[0],8)");

unshift (@nhi,pop(@nhi)); # "rotate" registers
unshift (@rem,pop(@rem));
}
&movzw ($rem[1],"($rem_8bit,$rem[1],2)");
&xor ($Zlo,"8($Htbl,$nlo)");
&xor ($Zhi,"($Htbl,$nlo)");

&shl ($rem[1],48);
&xor ($Zlo,$tmp);

&xor ($Zhi,$rem[1]);
&movz ($rem[0],&LB($Zlo));
&shr ($Zlo,4);

&mov ($tmp,$Zhi);
&shl (&LB($rem[0]),4);
&shr ($Zhi,4);

&xor ($Zlo,"8($Htbl,$nhi[0])");
&movzw ($rem[0],"($rem_8bit,$rem[0],2)");
&shl ($tmp,60);

&xor ($Zhi,"($Htbl,$nhi[0])");
&xor ($Zlo,$tmp);
&shl ($rem[0],48);

&bswap ($Zlo);
&xor ($Zhi,$rem[0]);

&bswap ($Zhi);
&cmp ($inp,$len);
&jb (".Louter_loop");
}
$code.=<<___;
mov $Zlo,8($Xi)
mov $Zhi,($Xi)

lea 280(%rsp),%rsi
mov 0(%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lghash_epilogue:
ret
.size gcm_ghash_4bit,.-gcm_ghash_4bit
___

######################################################################
# PCLMULQDQ version.

@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order

($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 { # minimal register pressure
my ($Xhi,$Xi,$Hkey,$modulo)=@_;

$code.=<<___ if (!defined($modulo));
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey,$T2
pxor $Xi,$T1 #
pxor $Hkey,$T2
___
$code.=<<___;
pclmulqdq \$0x00,$Hkey,$Xi #######
pclmulqdq \$0x11,$Hkey,$Xhi #######
pclmulqdq \$0x00,$T2,$T1 #######
pxor $Xi,$T1 #
pxor $Xhi,$T1 #

movdqa $T1,$T2 #
psrldq \$8,$T1
pslldq \$8,$T2 #
pxor $T1,$Xhi
pxor $T2,$Xi #
___
}
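Editor's note: the three-multiplication structure of clmul64x64_T2 is plain Karatsuba over GF(2)[x], where addition is XOR. With H = H_h·x^64 + H_l and X = X_h·x^64 + X_l:

```latex
(H_h x^{64} + H_l)(X_h x^{64} + X_l)
  = H_h X_h\,x^{128} + H_l X_l
  + \bigl[(H_h + H_l)(X_h + X_l) + H_h X_h + H_l X_l\bigr] x^{64}
```

The two pxor/psrldq/pslldq steps at the end split that middle term across the high and low 128-bit halves.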

sub reduction_alg9 { # 17/13 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
# 1st phase
movdqa $Xi,$T1 #
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$5,$Xi #
pxor $T1,$Xi #
psllq \$57,$Xi #
movdqa $Xi,$T2 #
pslldq \$8,$Xi
psrldq \$8,$T2 #
pxor $T1,$Xi
pxor $T2,$Xhi #

# 2nd phase
movdqa $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
pxor $T2,$Xi #
pxor $Xhi,$T2
psrlq \$1,$Xi #
pxor $T2,$Xi #
___
}
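Editor's note: reduction_alg9 folds the 256-bit product back to 128 bits modulo the GHASH polynomial. Only shifts and XORs are needed because the modulus is sparse; stated in its conventional form:

```latex
p(x) = x^{128} + x^7 + x^2 + x + 1
```

The code operates on the bit-reflected representation, which is why the constant surfaces as the 0xc2...01 byte pattern at .L0x1c2_polynomial used by gcm_init_clmul below.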

{ my ($Htbl,$Xip)=@_4args;

$code.=<<___;
.globl gcm_init_clmul
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
movdqu ($Xip),$Hkey
pshufd \$0b01001110,$Hkey,$Hkey # dword swap

# <<1 twist
pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
movdqa $Hkey,$T1
psllq \$1,$Hkey
pxor $T3,$T3 #
psrlq \$63,$T1
pcmpgtd $T2,$T3 # broadcast carry bit
pslldq \$8,$T1
por $T1,$Hkey # H<<=1

# magic reduction
pand .L0x1c2_polynomial(%rip),$T3
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial

# calculate H^2
movdqa $Hkey,$Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
movdqu $Hkey,($Htbl) # save H
movdqu $Xi,16($Htbl) # save H^2
ret
.size gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl gcm_gmult_clmul
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
movdqu ($Htbl),$Hkey
pshufb $T3,$Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
pshufb $T3,$Xi
movdqu $Xi,($Xip)
ret
.size gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
my $Xn="%xmm6";
my $Xhn="%xmm7";
my $Hkey2="%xmm8";
my $T1n="%xmm9";
my $T2n="%xmm10";

$code.=<<___;
.globl gcm_ghash_clmul
.type gcm_ghash_clmul,\@abi-omnipotent
.align 16
gcm_ghash_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_ghash_clmul:
# I can't trust assembler to use specific encoding:-(
.byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
.byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
.byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
.byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3

movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
pshufb $T3,$Xi

sub \$0x10,$len
jz .Lodd_tail

movdqu 16($Htbl),$Hkey2
#######
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
#	[(H*Ii+1) + (H*Xi+1)] mod P =
#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
#
movdqu ($inp),$T1 # Ii
movdqu 16($inp),$Xn # Ii+1
pshufb $T3,$T1
pshufb $T3,$Xn
pxor $T1,$Xi # Ii+Xi
___
&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
$code.=<<___;
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey2,$T2
pxor $Xi,$T1 #
pxor $Hkey2,$T2

lea 32($inp),$inp # i+=2
sub \$0x20,$len
jbe .Leven_tail

.Lmod_loop:
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
$code.=<<___;
movdqu ($inp),$T1 # Ii
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi

movdqu 16($inp),$Xn # Ii+1
pshufb $T3,$T1
pshufb $T3,$Xn

movdqa $Xn,$Xhn #
pshufd \$0b01001110,$Xn,$T1n
pshufd \$0b01001110,$Hkey,$T2n
pxor $Xn,$T1n #
pxor $Hkey,$T2n
pxor $T1,$Xhi # "Ii+Xi", consume early

movdqa $Xi,$T1 # 1st phase
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$5,$Xi #
pxor $T1,$Xi #
pclmulqdq \$0x00,$Hkey,$Xn #######
psllq \$57,$Xi #
movdqa $Xi,$T2 #
pslldq \$8,$Xi
psrldq \$8,$T2 #
pxor $T1,$Xi
pxor $T2,$Xhi #

pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
psrlq \$5,$Xi
pxor $T2,$Xi #
psrlq \$1,$Xi #
pxor $T2,$Xi #
pxor $Xhi,$T2
psrlq \$1,$Xi #
pxor $T2,$Xi #

pclmulqdq \$0x00,$T2n,$T1n #######
movdqa $Xi,$Xhi #
pshufd \$0b01001110,$Xi,$T1
pshufd \$0b01001110,$Hkey2,$T2
pxor $Xi,$T1 #
pxor $Hkey2,$T2

pxor $Xn,$T1n #
pxor $Xhn,$T1n #
movdqa $T1n,$T2n #
psrldq \$8,$T1n
pslldq \$8,$T2n #
pxor $T1n,$Xhn
pxor $T2n,$Xn #

lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop

.Leven_tail:
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
$code.=<<___;
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
test $len,$len
jnz .Ldone

.Lodd_tail:
movdqu ($inp),$T1 # Ii
pshufb $T3,$T1
pxor $T1,$Xi # Ii+Xi
___
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
.Ldone:
pshufb $T3,$Xi
movdqu $Xi,($Xip)
___
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
add \$0x58,%rsp
___
$code.=<<___;
ret
.LSEH_end_gcm_ghash_clmul:
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.align 64
.type .Lrem_4bit,\@object
.Lrem_4bit:
.long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
.long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
.long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
.long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
.type .Lrem_8bit,\@object
.Lrem_8bit:
.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE

.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp

mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip

mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData

mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue

mov 152($context),%rax # pull context->Rsp

mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue

lea 24(%rax),%rax # adjust "rsp"

mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12

.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi

mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$`1232/8`,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq

mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)

mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler

.section .pdata
.align 4
.rva .LSEH_begin_gcm_gmult_4bit
.rva .LSEH_end_gcm_gmult_4bit
.rva .LSEH_info_gcm_gmult_4bit

.rva .LSEH_begin_gcm_ghash_4bit
.rva .LSEH_end_gcm_ghash_4bit
.rva .LSEH_info_gcm_ghash_4bit

.rva .LSEH_begin_gcm_ghash_clmul
.rva .LSEH_end_gcm_ghash_clmul
.rva .LSEH_info_gcm_ghash_clmul

.section .xdata
.align 8
.LSEH_info_gcm_gmult_4bit:
.byte 9,0,0,0
.rva se_handler
.rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
.LSEH_info_gcm_ghash_4bit:
.byte 9,0,0,0
.rva se_handler
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
.LSEH_info_gcm_ghash_clmul:
.byte 0x01,0x1f,0x0b,0x00
.byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
.byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
.byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
.byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;
@ -48,7 +48,8 @@
 *
 */

#include "modes.h"
#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
@ -58,12 +59,7 @@
#endif
#include <assert.h>

#define STRICT_ALIGNMENT 1
#if defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
    defined(__s390__) || defined(__s390x__)
# undef STRICT_ALIGNMENT
#ifndef STRICT_ALIGNMENT
# define STRICT_ALIGNMENT 0
#endif
441
crypto/modes/ccm128.c
Normal file
@ -0,0 +1,441 @@
/* ====================================================================
 * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

/* First you setup M and L parameters and pass the key schedule.
 * This is called once per session setup... */
void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
	unsigned int M,unsigned int L,void *key,block128_f block)
{
	memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
	ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
	ctx->blocks = 0;
	ctx->block = block;
	ctx->key = key;
}
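
/* Annotation (not part of this commit), a worked example of the flag
 * byte packed above: M=16 (tag bytes) and L=8 (length-field bytes)
 * give ((8-1)&7) | (((16-2)/2)&7)<<3 = 0x07|0x38 = 0x3f. */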

/* !!! Following interfaces are to be called *once* per packet !!! */

/* Then you setup per-message nonce and pass the length of the message */
int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
	const unsigned char *nonce,size_t nlen,size_t mlen)
{
	unsigned int L = ctx->nonce.c[0]&7;	/* the L parameter */

	if (nlen<(14-L)) return -1;		/* nonce is too short */

	if (sizeof(mlen)==8 && L>=3) {
		ctx->nonce.c[8]  = (u8)(mlen>>(56%(sizeof(mlen)*8)));
		ctx->nonce.c[9]  = (u8)(mlen>>(48%(sizeof(mlen)*8)));
		ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
		ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
	}
	else
		*(u32*)(&ctx->nonce.c[8]) = 0;

	ctx->nonce.c[12] = (u8)(mlen>>24);
	ctx->nonce.c[13] = (u8)(mlen>>16);
	ctx->nonce.c[14] = (u8)(mlen>>8);
	ctx->nonce.c[15] = (u8)mlen;

	ctx->nonce.c[0] &= ~0x40;	/* clear Adata flag */
	memcpy(&ctx->nonce.c[1],nonce,14-L);

	return 0;
}

/* Then you pass additional authentication data, this is optional */
void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
	const unsigned char *aad,size_t alen)
{	unsigned int i;
	block128_f block = ctx->block;

	if (alen==0) return;

	ctx->nonce.c[0] |= 0x40;	/* set Adata flag */
	(*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
	ctx->blocks++;

	if (alen<(0x10000-0x100)) {
		ctx->cmac.c[0] ^= (u8)(alen>>8);
		ctx->cmac.c[1] ^= (u8)alen;
		i=2;
	}
	else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
		ctx->cmac.c[0] ^= 0xFF;
		ctx->cmac.c[1] ^= 0xFF;
		ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
		ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
		ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
		ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
		ctx->cmac.c[6] ^= (u8)(alen>>24);
		ctx->cmac.c[7] ^= (u8)(alen>>16);
		ctx->cmac.c[8] ^= (u8)(alen>>8);
		ctx->cmac.c[9] ^= (u8)alen;
		i=10;
	}
	else {
		ctx->cmac.c[0] ^= 0xFF;
		ctx->cmac.c[1] ^= 0xFE;
		ctx->cmac.c[2] ^= (u8)(alen>>24);
		ctx->cmac.c[3] ^= (u8)(alen>>16);
		ctx->cmac.c[4] ^= (u8)(alen>>8);
		ctx->cmac.c[5] ^= (u8)alen;
		i=6;
	}

	do {
		for(;i<16 && alen;++i,++aad,--alen)
			ctx->cmac.c[i] ^= *aad;
		(*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
		ctx->blocks++;
		i=0;
	} while (alen);
}
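
/* Illustrative sketch (not part of this commit) of the three AAD
 * length encodings applied above, per NIST SP 800-38C: a 2-byte
 * length below 0xff00, 0xff 0xfe plus 4 bytes up to 2^32-1, and
 * 0xff 0xff plus 8 bytes beyond that. The helper name is hypothetical. */
static unsigned int ccm_aad_header_len(size_t alen)
{
	if (alen<(0x10000-0x100))
		return 2;	/* plain 2-byte length */
	else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8)))
		return 10;	/* 0xff 0xff + 8-byte length */
	else
		return 6;	/* 0xff 0xfe + 4-byte length */
}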

/* Finally you encrypt or decrypt the message */

/* counter part of nonce may not be larger than L*8 bits,
 * L is not larger than 8, therefore 64-bit counter... */
static void ctr64_inc(unsigned char *counter) {
	unsigned int n=8;
	u8 c;

	counter += 8;
	do {
		--n;
		c = counter[n];
		++c;
		counter[n] = c;
		if (c) return;
	} while (n);
}

int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out,
	size_t len)
{
	size_t n;
	unsigned int i,L;
	unsigned char flags0 = ctx->nonce.c[0];
	block128_f block = ctx->block;
	void * key = ctx->key;
	union { u64 u[2]; u8 c[16]; } scratch;

	if (!(flags0&0x40))
		(*block)(ctx->nonce.c,ctx->cmac.c,key),
		ctx->blocks++;

	ctx->nonce.c[0] = L = flags0&7;
	for (n=0,i=15-L;i<15;++i) {
		n |= ctx->nonce.c[i];
		ctx->nonce.c[i]=0;
		n <<= 8;
	}
	n |= ctx->nonce.c[15];	/* reconstructed length */
	ctx->nonce.c[15]=1;

	if (n!=len) return -1;	/* length mismatch */

	ctx->blocks += ((len+15)>>3)|1;
	if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */

	while (len>=16) {
#if defined(STRICT_ALIGNMENT)
		union { u64 u[2]; u8 c[16]; } temp;

		memcpy (temp.c,inp,16);
		ctx->cmac.u[0] ^= temp.u[0];
		ctx->cmac.u[1] ^= temp.u[1];
#else
		ctx->cmac.u[0] ^= ((u64*)inp)[0];
		ctx->cmac.u[1] ^= ((u64*)inp)[1];
#endif
		(*block)(ctx->cmac.c,ctx->cmac.c,key);
		(*block)(ctx->nonce.c,scratch.c,key);
		ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
		temp.u[0] ^= scratch.u[0];
		temp.u[1] ^= scratch.u[1];
		memcpy(out,temp.c,16);
#else
		((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
		((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
#endif
		inp += 16;
		out += 16;
		len -= 16;
	}

	if (len) {
		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
		(*block)(ctx->cmac.c,ctx->cmac.c,key);
		(*block)(ctx->nonce.c,scratch.c,key);
		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
	}

	for (i=15-L;i<16;++i)
		ctx->nonce.c[i]=0;

	(*block)(ctx->nonce.c,scratch.c,key);
	ctx->cmac.u[0] ^= scratch.u[0];
	ctx->cmac.u[1] ^= scratch.u[1];

	ctx->nonce.c[0] = flags0;

	return 0;
}

int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out,
	size_t len)
{
	size_t n;
	unsigned int i,L;
	unsigned char flags0 = ctx->nonce.c[0];
	block128_f block = ctx->block;
	void * key = ctx->key;
	union { u64 u[2]; u8 c[16]; } scratch;

	if (!(flags0&0x40))
		(*block)(ctx->nonce.c,ctx->cmac.c,key);

	ctx->nonce.c[0] = L = flags0&7;
	for (n=0,i=15-L;i<15;++i) {
		n |= ctx->nonce.c[i];
		ctx->nonce.c[i]=0;
		n <<= 8;
	}
	n |= ctx->nonce.c[15];	/* reconstructed length */
	ctx->nonce.c[15]=1;

	if (n!=len) return -1;

	while (len>=16) {
#if defined(STRICT_ALIGNMENT)
		union { u64 u[2]; u8 c[16]; } temp;
#endif
		(*block)(ctx->nonce.c,scratch.c,key);
		ctr64_inc(ctx->nonce.c);
#if defined(STRICT_ALIGNMENT)
		memcpy (temp.c,inp,16);
		ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
		ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
		memcpy (out,scratch.c,16);
#else
		ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
		ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
#endif
		(*block)(ctx->cmac.c,ctx->cmac.c,key);

		inp += 16;
		out += 16;
		len -= 16;
	}

	if (len) {
		(*block)(ctx->nonce.c,scratch.c,key);
		for (i=0; i<len; ++i)
			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
		(*block)(ctx->cmac.c,ctx->cmac.c,key);
	}

	for (i=15-L;i<16;++i)
		ctx->nonce.c[i]=0;

	(*block)(ctx->nonce.c,scratch.c,key);
	ctx->cmac.u[0] ^= scratch.u[0];
	ctx->cmac.u[1] ^= scratch.u[1];

	ctx->nonce.c[0] = flags0;

	return 0;
}

static void ctr64_add (unsigned char *counter,size_t inc)
{	size_t n=8, val=0;

	counter += 8;
	do {
		--n;
		val += counter[n] + (inc&0xff);
		counter[n] = (unsigned char)val;
		val >>= 8;	/* carry bit */
		inc >>= 8;
	} while(n && (inc || val));
}

int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out,
	size_t len,ccm128_f stream)
{
	size_t n;
	unsigned int i,L;
	unsigned char flags0 = ctx->nonce.c[0];
	block128_f block = ctx->block;
	void * key = ctx->key;
	union { u64 u[2]; u8 c[16]; } scratch;

	if (!(flags0&0x40))
		(*block)(ctx->nonce.c,ctx->cmac.c,key),
		ctx->blocks++;

	ctx->nonce.c[0] = L = flags0&7;
	for (n=0,i=15-L;i<15;++i) {
		n |= ctx->nonce.c[i];
		ctx->nonce.c[i]=0;
		n <<= 8;
	}
	n |= ctx->nonce.c[15];	/* reconstructed length */
	ctx->nonce.c[15]=1;

	if (n!=len) return -1;	/* length mismatch */

	ctx->blocks += ((len+15)>>3)|1;
	if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */

	if ((n=len/16)) {
		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
		n *= 16;
		inp += n;
		out += n;
		len -= n;
		if (len) ctr64_add(ctx->nonce.c,n/16);
	}

	if (len) {
		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
		(*block)(ctx->cmac.c,ctx->cmac.c,key);
		(*block)(ctx->nonce.c,scratch.c,key);
		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
	}

	for (i=15-L;i<16;++i)
		ctx->nonce.c[i]=0;

	(*block)(ctx->nonce.c,scratch.c,key);
	ctx->cmac.u[0] ^= scratch.u[0];
	ctx->cmac.u[1] ^= scratch.u[1];

	ctx->nonce.c[0] = flags0;

	return 0;
}

int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out,
	size_t len,ccm128_f stream)
{
	size_t n;
	unsigned int i,L;
	unsigned char flags0 = ctx->nonce.c[0];
	block128_f block = ctx->block;
	void * key = ctx->key;
	union { u64 u[2]; u8 c[16]; } scratch;

	if (!(flags0&0x40))
		(*block)(ctx->nonce.c,ctx->cmac.c,key);

	ctx->nonce.c[0] = L = flags0&7;
	for (n=0,i=15-L;i<15;++i) {
		n |= ctx->nonce.c[i];
		ctx->nonce.c[i]=0;
		n <<= 8;
	}
	n |= ctx->nonce.c[15];	/* reconstructed length */
	ctx->nonce.c[15]=1;

	if (n!=len) return -1;

	if ((n=len/16)) {
		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
		n *= 16;
		inp += n;
		out += n;
		len -= n;
		if (len) ctr64_add(ctx->nonce.c,n/16);
	}

	if (len) {
		(*block)(ctx->nonce.c,scratch.c,key);
		for (i=0; i<len; ++i)
			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
		(*block)(ctx->cmac.c,ctx->cmac.c,key);
	}

	for (i=15-L;i<16;++i)
		ctx->nonce.c[i]=0;

	(*block)(ctx->nonce.c,scratch.c,key);
	ctx->cmac.u[0] ^= scratch.u[0];
	ctx->cmac.u[1] ^= scratch.u[1];

	ctx->nonce.c[0] = flags0;

	return 0;
}

size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
{	unsigned int M = (ctx->nonce.c[0]>>3)&7;	/* the M parameter */

	M *= 2; M += 2;
	if (len<M) return 0;
	memcpy(tag,ctx->cmac.c,M);
	return M;
}
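
For reference, a minimal usage sketch of the CCM interface this file adds (not part of this commit): it assumes AES as the block cipher, takes the context struct from modes_lcl.h as in-tree callers do, and the function name, key, nonce and AAD values are all placeholders.

#include <string.h>
#include <openssl/aes.h>
#include "modes_lcl.h"		/* struct ccm128_context */

int ccm_seal_example(unsigned char ct[24], unsigned char tag[16])
{
	AES_KEY ks;
	CCM128_CONTEXT ctx;
	static const unsigned char key[16] = {0}, nonce[12] = {0};
	static const unsigned char pt[24] = "attack at dawn";

	AES_set_encrypt_key(key,128,&ks);

	/* M=16-byte tag, L=3 length bytes -> 12-byte nonce */
	CRYPTO_ccm128_init(&ctx,16,3,&ks,(block128_f)AES_encrypt);
	if (CRYPTO_ccm128_setiv(&ctx,nonce,sizeof(nonce),sizeof(pt)))
		return -1;
	CRYPTO_ccm128_aad(&ctx,(const unsigned char *)"hdr",3);
	if (CRYPTO_ccm128_encrypt(&ctx,pt,ct,sizeof(pt)))
		return -1;
	return CRYPTO_ccm128_tag(&ctx,tag,16) ? 0 : -1;
}

Decryption follows the same sequence with CRYPTO_ccm128_decrypt, after which the recomputed tag is compared against the received one.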

crypto/modes/cfb128.c

@@ -48,7 +48,8 @@
 *
 */

#include "modes.h"
#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG

@@ -58,14 +59,6 @@
#endif
#include <assert.h>

#define STRICT_ALIGNMENT
#if defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
    defined(__s390__) || defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif

/* The input and output encrypted as though 128bit cfb mode is being
 * used.  The extra state information to record how much of the
 * 128bit block we have used is contained in *num;

crypto/modes/ctr128.c

@@ -48,7 +48,8 @@
 *
 */

#include "modes.h"
#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG

@@ -58,17 +59,6 @@
#endif
#include <assert.h>

typedef unsigned int u32;
typedef unsigned char u8;

#define STRICT_ALIGNMENT
#if defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
    defined(__s390__) || defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif

/* NOTE: the IV/counter CTR mode is big-endian.  The code itself
 * is endian-neutral. */

@@ -183,9 +173,6 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
	*num=n;
}

#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))

/* increment upper 96 bits of 128-bit counter by 1 */
static void ctr96_inc(unsigned char *counter) {
	u32 n=12;

crypto/modes/cts128.c

@@ -5,7 +5,8 @@
 * forms are granted according to the OpenSSL license.
 */

#include "modes.h"
#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG

@@ -23,8 +24,9 @@
 * deviates from mentioned RFCs. Most notably it allows input to be
 * of block length and it doesn't flip the order of the last two
 * blocks. CTS is being discussed even in ECB context, but it's not
 * adopted for any known application. This implementation complies
 * with mentioned RFCs and [as such] extends CBC mode.
 * adopted for any known application. This implementation provides
 * two interfaces: one compliant with above mentioned RFCs and one
 * compliant with the NIST proposal, both extending CBC mode.
 */

size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,

@@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
	return len+residue;
}

size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], block128_f block)
{	size_t residue, n;

	assert (in && out && key && ivec);

	if (len < 16) return 0;

	residue=len%16;

	len -= residue;

	CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);

	if (residue==0) return len;

	in  += len;
	out += len;

	for (n=0; n<residue; ++n)
		ivec[n] ^= in[n];
	(*block)(ivec,ivec,key);
	memcpy(out-16+residue,ivec,16);

	return len+residue;
}

size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], cbc128_f cbc)

@@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
	return len+residue;
}

size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], cbc128_f cbc)
{	size_t residue;
	union { size_t align; unsigned char c[16]; } tmp;

	assert (in && out && key && ivec);

	if (len < 16) return 0;

	residue=len%16;

	len -= residue;

	(*cbc)(in,out,len,key,ivec,1);

	if (residue==0) return len;

	in  += len;
	out += len;

#if defined(CBC_HANDLES_TRUNCATED_IO)
	(*cbc)(in,out-16+residue,residue,key,ivec,1);
#else
	{
	size_t n;
	for (n=0; n<16; n+=sizeof(size_t))
		*(size_t *)(tmp.c+n) = 0;
	memcpy(tmp.c,in,residue);
	}
	(*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
#endif
	return len+residue;
}

size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], block128_f block)

@@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
	for(residue+=16; n<residue; ++n)
		out[n] = tmp.c[n] ^ in[n];

	return len+residue-16;
	return 16+len+residue;
}

size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], block128_f block)
{	size_t residue, n;
	union { size_t align; unsigned char c[32]; } tmp;

	assert (in && out && key && ivec);

	if (len<16) return 0;

	residue=len%16;

	if (residue==0) {
		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
		return len;
	}

	len -= 16+residue;

	if (len) {
		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
		in  += len;
		out += len;
	}

	(*block)(in+residue,tmp.c+16,key);

	for (n=0; n<16; n+=sizeof(size_t))
		*(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n);
	memcpy(tmp.c,in,residue);
	(*block)(tmp.c,tmp.c,key);

	for(n=0; n<16; ++n) {
		unsigned char c = in[n];
		out[n] = tmp.c[n] ^ ivec[n];
		ivec[n] = in[n+residue];
		tmp.c[n] = c;
	}
	for(residue+=16; n<residue; ++n)
		out[n] = tmp.c[n] ^ tmp.c[n-16];

	return 16+len+residue;
}

size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,

@@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
	(*cbc)(tmp.c,tmp.c,32,key,ivec,0);
	memcpy(out,tmp.c,16+residue);
#endif
	return len+residue;
	return 16+len+residue;
}

size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], cbc128_f cbc)
{	size_t residue, n;
	union { size_t align; unsigned char c[32]; } tmp;

	assert (in && out && key && ivec);

	if (len<16) return 0;

	residue=len%16;

	if (residue==0) {
		(*cbc)(in,out,len,key,ivec,0);
		return len;
	}

	len -= 16+residue;

	if (len) {
		(*cbc)(in,out,len,key,ivec,0);
		in  += len;
		out += len;
	}

	for (n=16; n<32; n+=sizeof(size_t))
		*(size_t *)(tmp.c+n) = 0;
	/* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
	(*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);

	memcpy(tmp.c,in,residue);
#if defined(CBC_HANDLES_TRUNCATED_IO)
	(*cbc)(tmp.c,out,16+residue,key,ivec,0);
#else
	(*cbc)(tmp.c,tmp.c,32,key,ivec,0);
	memcpy(out,tmp.c,16+residue);
#endif
	return 16+len+residue;
}

#if defined(SELFTEST)

@@ -200,9 +349,8 @@ static const unsigned char vector_64[64] =
static AES_KEY encks, decks;

void test_vector(const unsigned char *vector,size_t len)
{	unsigned char cleartext[64];
	unsigned char iv[sizeof(test_iv)];
	unsigned char ciphertext[64];
{	unsigned char iv[sizeof(test_iv)];
	unsigned char cleartext[64],ciphertext[64];
	size_t tail;

	printf("vector_%d\n",len); fflush(stdout);

@@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len)
		fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
}

main()
void test_nistvector(const unsigned char *vector,size_t len)
{	unsigned char iv[sizeof(test_iv)];
	unsigned char cleartext[64],ciphertext[64],nistvector[64];
	size_t tail;

	printf("nistvector_%d\n",len); fflush(stdout);

	if ((tail=len%16) == 0) tail = 16;

	len -= 16 + tail;
	memcpy(nistvector,vector,len);
	/* flip two last blocks */
	memcpy(nistvector+len,vector+len+16,tail);
	memcpy(nistvector+len+tail,vector+len,16);
	len += 16 + tail;
	tail = 16;

	/* test block-based encryption */
	memcpy(iv,test_iv,sizeof(test_iv));
	CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
	if (memcmp(ciphertext,nistvector,len))
		fprintf(stderr,"output_%d mismatch\n",len), exit(1);
	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
		fprintf(stderr,"iv_%d mismatch\n",len), exit(1);

	/* test block-based decryption */
	memcpy(iv,test_iv,sizeof(test_iv));
	CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
	if (memcmp(cleartext,test_input,len))
		fprintf(stderr,"input_%d mismatch\n",len), exit(2);
	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
		fprintf(stderr,"iv_%d mismatch\n",len), exit(2);

	/* test streamed encryption */
	memcpy(iv,test_iv,sizeof(test_iv));
	CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
	if (memcmp(ciphertext,nistvector,len))
		fprintf(stderr,"output_%d mismatch\n",len), exit(3);
	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
		fprintf(stderr,"iv_%d mismatch\n",len), exit(3);

	/* test streamed decryption */
	memcpy(iv,test_iv,sizeof(test_iv));
	CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
	if (memcmp(cleartext,test_input,len))
		fprintf(stderr,"input_%d mismatch\n",len), exit(4);
	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
		fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
}
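
/* Illustrative only (not part of this commit): the NIST-order vectors
 * above are derived from the RFC-order ones by swapping the last two
 * blocks, which is exactly the difference between the two interfaces.
 * The helper name is hypothetical. */
static void rfc_to_nist_order(const unsigned char *rfc,unsigned char *nist,
			size_t len)
{	size_t tail, head;

	if ((tail=len%16) == 0) tail = 16;
	head = len-16-tail;

	memcpy(nist,rfc,head);			/* leading blocks unchanged */
	memcpy(nist+head,rfc+head+16,tail);	/* partial block first...   */
	memcpy(nist+head+tail,rfc+head,16);	/* ...then last full block  */
}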

int main()
{
	AES_set_encrypt_key(test_key,128,&encks);
	AES_set_decrypt_key(test_key,128,&decks);

@@ -254,6 +452,14 @@ main()
	test_vector(vector_47,sizeof(vector_47));
	test_vector(vector_48,sizeof(vector_48));
	test_vector(vector_64,sizeof(vector_64));
	exit(0);

	test_nistvector(vector_17,sizeof(vector_17));
	test_nistvector(vector_31,sizeof(vector_31));
	test_nistvector(vector_32,sizeof(vector_32));
	test_nistvector(vector_47,sizeof(vector_47));
	test_nistvector(vector_48,sizeof(vector_48));
	test_nistvector(vector_64,sizeof(vector_64));

	return 0;
}
#endif

crypto/modes/modes.h

@@ -19,6 +19,10 @@ typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
			size_t blocks, const void *key,
			const unsigned char ivec[16]);

typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
			size_t blocks, const void *key,
			const unsigned char ivec[16],unsigned char cmac[16]);

void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], block128_f block);

@@ -67,6 +71,19 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], cbc128_f cbc);

size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], block128_f block);
size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], cbc128_f cbc);
size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], block128_f block);
size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
			size_t len, const void *key,
			unsigned char ivec[16], cbc128_f cbc);

typedef struct gcm128_context GCM128_CONTEXT;

GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);

@@ -91,3 +108,28 @@ int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
			size_t len);
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);

typedef struct ccm128_context CCM128_CONTEXT;

void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
	unsigned int M, unsigned int L, void *key,block128_f block);
int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
	const unsigned char *nonce, size_t nlen, size_t mlen);
void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
	const unsigned char *aad, size_t alen);
int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out, size_t len);
int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out, size_t len);
int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out, size_t len,
	ccm128_f stream);
int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
	const unsigned char *inp, unsigned char *out, size_t len,
	ccm128_f stream);
size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);

typedef struct xts128_context XTS128_CONTEXT;

int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
	const unsigned char *inp, unsigned char *out, size_t len, int enc);

crypto/modes/ofb128.c

@@ -48,7 +48,8 @@
 *
 */

#include "modes.h"
#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG

@@ -58,14 +59,6 @@
#endif
#include <assert.h>

#define STRICT_ALIGNMENT
#if defined(__i386) || defined(__i386__) || \
    defined(__x86_64) || defined(__x86_64__) || \
    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
    defined(__s390__) || defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif

/* The input and output encrypted as though 128bit ofb mode is being
 * used.  The extra state information to record how much of the
 * 128bit block we have used is contained in *num;
187	crypto/modes/xts128.c	Normal file

@@ -0,0 +1,187 @@
/* ====================================================================
 * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

#ifndef MODES_DEBUG
# ifndef NDEBUG
#  define NDEBUG
# endif
#endif
#include <assert.h>

int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
	const unsigned char *inp, unsigned char *out,
	size_t len, int enc)
{
	const union { long one; char little; } is_endian = {1};
	union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
	unsigned int i;

	if (len<16) return -1;

	memcpy(tweak.c, iv, 16);

	(*ctx->block2)(tweak.c,tweak.c,ctx->key2);

	if (!enc && (len%16)) len-=16;

	while (len>=16) {
#if defined(STRICT_ALIGNMENT)
		memcpy(scratch.c,inp,16);
		scratch.u[0] ^= tweak.u[0];
		scratch.u[1] ^= tweak.u[1];
#else
		scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
		scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
#endif
		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
#if defined(STRICT_ALIGNMENT)
		scratch.u[0] ^= tweak.u[0];
		scratch.u[1] ^= tweak.u[1];
		memcpy(out,scratch.c,16);
#else
		((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
		((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
#endif
		inp += 16;
		out += 16;
		len -= 16;

		if (len==0) return 0;

		if (is_endian.little) {
			unsigned int carry,res;

			res = 0x87&(((int)tweak.d[3])>>31);
			carry = (unsigned int)(tweak.u[0]>>63);
			tweak.u[0] = (tweak.u[0]<<1)^res;
			tweak.u[1] = (tweak.u[1]<<1)|carry;
		}
		else {
			size_t c;

			for (c=0,i=0;i<16;++i) {
				/*+ substitutes for |, because c is 1 bit */
				c += ((size_t)tweak.c[i])<<1;
				tweak.c[i] = (u8)c;
				c = c>>8;
			}
			tweak.c[0] ^= (u8)(0x87&(0-c));
		}
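		/* Annotation (not part of this commit): both branches above
		 * multiply the tweak by x in GF(2^128) under the XTS
		 * polynomial x^128+x^7+x^2+x+1 - shift left by one bit and,
		 * if a bit carried out of the top, fold it back into the low
		 * byte as 0x87 (x^7+x^2+x+1). */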
	}
	if (enc) {
		for (i=0;i<len;++i) {
			u8 c = inp[i];
			out[i] = scratch.c[i];
			scratch.c[i] = c;
		}
		scratch.u[0] ^= tweak.u[0];
		scratch.u[1] ^= tweak.u[1];
		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
		scratch.u[0] ^= tweak.u[0];
		scratch.u[1] ^= tweak.u[1];
		memcpy(out-16,scratch.c,16);
	}
	else {
		union { u64 u[2]; u8 c[16]; } tweak1;

		if (is_endian.little) {
			unsigned int carry,res;

			res = 0x87&(((int)tweak.d[3])>>31);
			carry = (unsigned int)(tweak.u[0]>>63);
			tweak1.u[0] = (tweak.u[0]<<1)^res;
			tweak1.u[1] = (tweak.u[1]<<1)|carry;
		}
		else {
			size_t c;

			for (c=0,i=0;i<16;++i) {
				/*+ substitutes for |, because c is 1 bit */
				c += ((size_t)tweak.c[i])<<1;
				tweak1.c[i] = (u8)c;
				c = c>>8;
			}
			tweak1.c[0] ^= (u8)(0x87&(0-c));
		}
#if defined(STRICT_ALIGNMENT)
		memcpy(scratch.c,inp,16);
		scratch.u[0] ^= tweak1.u[0];
		scratch.u[1] ^= tweak1.u[1];
#else
		scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
		scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
#endif
		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
		scratch.u[0] ^= tweak1.u[0];
		scratch.u[1] ^= tweak1.u[1];

		for (i=0;i<len;++i) {
			u8 c = inp[16+i];
			out[16+i] = scratch.c[i];
			scratch.c[i] = c;
		}
		scratch.u[0] ^= tweak.u[0];
		scratch.u[1] ^= tweak.u[1];
		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
#if defined(STRICT_ALIGNMENT)
		scratch.u[0] ^= tweak.u[0];
		scratch.u[1] ^= tweak.u[1];
		memcpy (out,scratch.c,16);
#else
		((u64*)out)[0] = scratch.u[0]^tweak.u[0];
		((u64*)out)[1] = scratch.u[1]^tweak.u[1];
#endif
	}

	return 0;
}
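
For reference, a minimal usage sketch of CRYPTO_xts128_encrypt with AES (not part of this commit): the function name and the 256-bit key split are illustrative, the context fields are filled the way the function consumes them, and the struct comes from modes_lcl.h as for in-tree callers.

#include <openssl/aes.h>
#include "modes_lcl.h"		/* struct xts128_context */

int xts_sector_example(const unsigned char key[32],
	const unsigned char iv[16], unsigned char *buf, size_t len)
{
	AES_KEY k1, k2;
	XTS128_CONTEXT ctx;

	AES_set_encrypt_key(key,128,&k1);	/* data key  */
	AES_set_encrypt_key(key+16,128,&k2);	/* tweak key */

	ctx.key1   = &k1;
	ctx.key2   = &k2;
	ctx.block1 = (block128_f)AES_encrypt;	/* data encryption  */
	ctx.block2 = (block128_f)AES_encrypt;	/* tweak encryption */

	/* in-place encryption of one unit; returns -1 if len<16 */
	return CRYPTO_xts128_encrypt(&ctx,iv,buf,buf,len,1);
}

Decryption would pass enc=0 with a decrypt key schedule and AES_decrypt as block1, while block2/key2 stay on the encrypt side, since the tweak is always computed by encrypting the IV.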