19a6e8b32c
I was wrong in conclusions about when addition starts overflowing in combaX routines.
2201 lines
37 KiB
ArmAsm
2201 lines
37 KiB
ArmAsm
/* Identification strings (module version and author credit) emitted
 * into the read-only data section; no code in this file reads them. */
.rdata
|
|
.asciiz "mips3.s, Version 1.1"
|
|
.asciiz "MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
|
|
|
|
/*
|
|
* ====================================================================
|
|
* Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
|
* project.
|
|
*
|
|
* Rights for redistribution and usage in source and binary forms are
|
|
* granted according to the OpenSSL license. Warranty of any kind is
|
|
* disclaimed.
|
|
* ====================================================================
|
|
*/
|
|
|
|
/*
|
|
* This is my modest contribution to the OpenSSL project (see
|
|
* http://www.openssl.org/ for more information about it) and is
|
|
* a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
|
|
* module. For updates see http://fy.chalmers.se/~appro/hpe/.
|
|
*
|
|
* The module is designed to work with either of the "new" MIPS ABI(5),
|
|
* namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
|
|
* IRIX 5.x not only because it doesn't support new ABIs but also
|
|
* because 5.x kernels put R4x00 CPU into 32-bit mode and all those
|
|
* 64-bit instructions (daddu, dmultu, etc.) found below gonna only
|
|
* cause illegal instruction exception:-(
|
|
*
|
|
* In addition the code depends on preprocessor flags set up by MIPSpro
|
|
* compiler driver (either as or cc) and therefore (probably?) can't be
|
|
* compiled by the GNU assembler. GNU C driver manages fine though...
|
|
* I mean as long as -mmips-as is specified or is the default option,
|
|
* because then it simply invokes /usr/bin/as which in turn takes
|
|
* perfect care of the preprocessor definitions. Another neat feature
|
|
* offered by the MIPSpro assembler is an optimization pass. This gave
|
|
* me the opportunity to have the code looking more regular as all those
|
|
* architecture dependent instruction rescheduling details were left to
|
|
* the assembler. Cool, huh?
|
|
*
|
|
* Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
|
|
* goes way over 3 times faster!
|
|
*
|
|
* <appro@fy.chalmers.se>
|
|
*/
|
|
#include <asm.h>
|
|
#include <regdef.h>
|
|
|
|
/*
 * MOVNZ(cond,dst,src): copy src into dst only when cond is non-zero,
 * otherwise leave dst untouched.
 *
 * With a MIPS IV (or later) target this is the single conditional-move
 * instruction movn.  For older ISA levels it is emulated with a
 * branch-likely: bnezl jumps to the instruction following its delay
 * slot, and the "move" sitting in that delay slot is executed only when
 * the branch is taken (cond != 0) and is nullified otherwise.
 */
#if _MIPS_ISA>=4
|
|
#define MOVNZ(cond,dst,src) \
|
|
movn dst,src,cond
|
|
#else
|
|
#define MOVNZ(cond,dst,src) \
|
|
.set noreorder; \
|
|
bnezl cond,.+8; \
|
|
move dst,src; \
|
|
.set reorder
|
|
#endif
|
|
|
|
/* Everything below is code. */
.text
|
|
|
|
/* The routines use the assembler temporary AT explicitly as a scratch
 * register, so the assembler must not use it behind our back. */
.set noat
|
|
/* Default mode: the assembler may reschedule instructions and fill
 * delay slots.  Routines switch to noreorder locally wherever the
 * delay slot contents matter. */
.set reorder
|
|
|
|
/* Register that the word-loop routines load with -4 and use to mask
 * the word count down to a multiple of four. */
#define MINUS4 v1
|
|
|
|
.align 5
|
|
/*
 * bn_mul_add_words
 *
 * For each of a2 64-bit words: adds the 128-bit product (word at a1) * a3
 * plus the running carry to the word at a0, stores the low 64 bits back
 * to a0 and keeps the high part as the new carry.
 *
 *   a0  pointer to the words that are read, accumulated into and rewritten
 *   a1  pointer to the source words that are multiplied
 *   a2  number of words; the routine returns 0 at once when a2 <= 0
 *   a3  64-bit multiplier
 *   v0  returned: final carry word
 *
 * Scratch: AT, t0-t3, ta0-ta3 and the MINUS4 register.
 * The main loop is unrolled four times; a tail handles the remaining
 * one to three words.
 */
LEAF(bn_mul_add_words)
|
|
.set noreorder
|
|
/* Branch-likely: the load in the delay slot (first source word) is
 * executed only when the branch to "proceed" is taken. */
bgtzl a2,.L_bn_mul_add_words_proceed
|
|
ld t0,0(a1)
|
|
/* a2 <= 0: return 0 (the move sits in the jr delay slot). */
jr ra
|
|
move v0,zero
|
|
.set reorder
|
|
|
|
.L_bn_mul_add_words_proceed:
|
|
li MINUS4,-4
|
|
/* ta0 = count rounded down to a multiple of 4; zero means fewer than
 * four words, so go straight to the tail. */
and ta0,a2,MINUS4
|
|
move v0,zero
|
|
beqz ta0,.L_bn_mul_add_words_tail
|
|
|
|
/* Four words per iteration.  On entry t0 already holds the first
 * source word of the group. */
.L_bn_mul_add_words_loop:
|
|
dmultu t0,a3
|
|
ld t1,0(a0)
|
|
ld t2,8(a1)
|
|
ld t3,8(a0)
|
|
ld ta0,16(a1)
|
|
ld ta1,16(a0)
|
|
/* t1 += carry-in; v0 = carry out of that addition. */
daddu t1,v0
|
|
sltu v0,t1,v0 /* All manuals say it "compares 32-bit
|
|
* values", but it seems to work fine
|
|
* even on 64-bit registers. */
|
|
/* Add the low product half to the word, the high half to the carry,
 * then fold in the carry out of the low-half addition. */
mflo AT
|
|
mfhi t0
|
|
daddu t1,AT
|
|
daddu v0,t0
|
|
sltu AT,t1,AT
|
|
sd t1,0(a0)
|
|
daddu v0,AT
|
|
|
|
/* Second word of the group. */
dmultu t2,a3
|
|
ld ta2,24(a1)
|
|
ld ta3,24(a0)
|
|
daddu t3,v0
|
|
sltu v0,t3,v0
|
|
mflo AT
|
|
mfhi t2
|
|
daddu t3,AT
|
|
daddu v0,t2
|
|
sltu AT,t3,AT
|
|
sd t3,8(a0)
|
|
daddu v0,AT
|
|
|
|
/* Third word; the count and both pointers are advanced here, so the
 * remaining stores of this group use negative offsets. */
dmultu ta0,a3
|
|
subu a2,4
|
|
PTR_ADD a0,32
|
|
PTR_ADD a1,32
|
|
daddu ta1,v0
|
|
sltu v0,ta1,v0
|
|
mflo AT
|
|
mfhi ta0
|
|
daddu ta1,AT
|
|
daddu v0,ta0
|
|
sltu AT,ta1,AT
|
|
sd ta1,-16(a0)
|
|
daddu v0,AT
|
|
|
|
|
|
/* Fourth word; ta0 is recomputed as the remaining count rounded down
 * to a multiple of 4 for the loop test below. */
dmultu ta2,a3
|
|
and ta0,a2,MINUS4
|
|
daddu ta3,v0
|
|
sltu v0,ta3,v0
|
|
mflo AT
|
|
mfhi ta2
|
|
daddu ta3,AT
|
|
daddu v0,ta2
|
|
sltu AT,ta3,AT
|
|
sd ta3,-8(a0)
|
|
daddu v0,AT
|
|
.set noreorder
|
|
/* At least four words left: loop, preloading the next source word in
 * the (taken-only) delay slot. */
bgtzl ta0,.L_bn_mul_add_words_loop
|
|
ld t0,0(a1)
|
|
|
|
/* One to three words left: go to the tail, again preloading t0. */
bnezl a2,.L_bn_mul_add_words_tail
|
|
ld t0,0(a1)
|
|
.set reorder
|
|
|
|
.L_bn_mul_add_words_return:
|
|
jr ra
|
|
|
|
/* Tail: up to three single-word steps with the same carry handling as
 * in the loop.  t0 holds the first source word on entry. */
.L_bn_mul_add_words_tail:
|
|
dmultu t0,a3
|
|
ld t1,0(a0)
|
|
subu a2,1
|
|
daddu t1,v0
|
|
sltu v0,t1,v0
|
|
mflo AT
|
|
mfhi t0
|
|
daddu t1,AT
|
|
daddu v0,t0
|
|
sltu AT,t1,AT
|
|
sd t1,0(a0)
|
|
daddu v0,AT
|
|
beqz a2,.L_bn_mul_add_words_return
|
|
|
|
ld t0,8(a1)
|
|
dmultu t0,a3
|
|
ld t1,8(a0)
|
|
subu a2,1
|
|
daddu t1,v0
|
|
sltu v0,t1,v0
|
|
mflo AT
|
|
mfhi t0
|
|
daddu t1,AT
|
|
daddu v0,t0
|
|
sltu AT,t1,AT
|
|
sd t1,8(a0)
|
|
daddu v0,AT
|
|
beqz a2,.L_bn_mul_add_words_return
|
|
|
|
/* Third and last possible tail word; no further count check needed. */
ld t0,16(a1)
|
|
dmultu t0,a3
|
|
ld t1,16(a0)
|
|
daddu t1,v0
|
|
sltu v0,t1,v0
|
|
mflo AT
|
|
mfhi t0
|
|
daddu t1,AT
|
|
daddu v0,t0
|
|
sltu AT,t1,AT
|
|
sd t1,16(a0)
|
|
daddu v0,AT
|
|
jr ra
|
|
END(bn_mul_add_words)
|
|
|
|
.align 5
|
|
/*
 * bn_mul_words
 *
 * For each of a2 64-bit words: multiplies the word at a1 by a3, adds the
 * running carry to the low half of the product, stores that sum at a0
 * and carries the high half (plus any carry out of the addition) forward.
 * Unlike bn_mul_add_words the destination is only written, never read.
 *
 *   a0  destination pointer
 *   a1  source pointer
 *   a2  number of words; the routine returns 0 at once when a2 <= 0
 *   a3  64-bit multiplier
 *   v0  returned: final carry word
 *
 * Scratch: AT, t0-t3, ta0-ta3 and the MINUS4 register.
 */
LEAF(bn_mul_words)
|
|
.set noreorder
|
|
/* Branch-likely: the first source word is preloaded in the delay slot
 * only when the branch is taken. */
bgtzl a2,.L_bn_mul_words_proceed
|
|
ld t0,0(a1)
|
|
/* a2 <= 0: return 0. */
jr ra
|
|
move v0,zero
|
|
.set reorder
|
|
|
|
.L_bn_mul_words_proceed:
|
|
li MINUS4,-4
|
|
/* ta0 = count rounded down to a multiple of 4. */
and ta0,a2,MINUS4
|
|
move v0,zero
|
|
beqz ta0,.L_bn_mul_words_tail
|
|
|
|
/* Four words per iteration; t0 holds the first source word on entry. */
.L_bn_mul_words_loop:
|
|
dmultu t0,a3
|
|
ld t2,8(a1)
|
|
ld ta0,16(a1)
|
|
ld ta2,24(a1)
|
|
/* v0 = carry + low half (stored as the result word);
 * the new carry is the high half plus the carry out of that sum. */
mflo AT
|
|
mfhi t0
|
|
daddu v0,AT
|
|
sltu t1,v0,AT
|
|
sd v0,0(a0)
|
|
daddu v0,t1,t0
|
|
|
|
/* Second word; count and pointers are advanced here, so the remaining
 * stores of this group use negative offsets. */
dmultu t2,a3
|
|
subu a2,4
|
|
PTR_ADD a0,32
|
|
PTR_ADD a1,32
|
|
mflo AT
|
|
mfhi t2
|
|
daddu v0,AT
|
|
sltu t3,v0,AT
|
|
sd v0,-24(a0)
|
|
daddu v0,t3,t2
|
|
|
|
/* Third word. */
dmultu ta0,a3
|
|
mflo AT
|
|
mfhi ta0
|
|
daddu v0,AT
|
|
sltu ta1,v0,AT
|
|
sd v0,-16(a0)
|
|
daddu v0,ta1,ta0
|
|
|
|
|
|
/* Fourth word; ta0 becomes the remaining count rounded down to a
 * multiple of 4 for the loop test. */
dmultu ta2,a3
|
|
and ta0,a2,MINUS4
|
|
mflo AT
|
|
mfhi ta2
|
|
daddu v0,AT
|
|
sltu ta3,v0,AT
|
|
sd v0,-8(a0)
|
|
daddu v0,ta3,ta2
|
|
.set noreorder
|
|
/* Loop while at least four words remain (next word preloaded in the
 * taken-only delay slot). */
bgtzl ta0,.L_bn_mul_words_loop
|
|
ld t0,0(a1)
|
|
|
|
/* One to three words left: handle them in the tail. */
bnezl a2,.L_bn_mul_words_tail
|
|
ld t0,0(a1)
|
|
.set reorder
|
|
|
|
.L_bn_mul_words_return:
|
|
jr ra
|
|
|
|
/* Tail: up to three single-word steps; t0 holds the first word. */
.L_bn_mul_words_tail:
|
|
dmultu t0,a3
|
|
subu a2,1
|
|
mflo AT
|
|
mfhi t0
|
|
daddu v0,AT
|
|
sltu t1,v0,AT
|
|
sd v0,0(a0)
|
|
daddu v0,t1,t0
|
|
beqz a2,.L_bn_mul_words_return
|
|
|
|
ld t0,8(a1)
|
|
dmultu t0,a3
|
|
subu a2,1
|
|
mflo AT
|
|
mfhi t0
|
|
daddu v0,AT
|
|
sltu t1,v0,AT
|
|
sd v0,8(a0)
|
|
daddu v0,t1,t0
|
|
beqz a2,.L_bn_mul_words_return
|
|
|
|
/* Third and last possible tail word. */
ld t0,16(a1)
|
|
dmultu t0,a3
|
|
mflo AT
|
|
mfhi t0
|
|
daddu v0,AT
|
|
sltu t1,v0,AT
|
|
sd v0,16(a0)
|
|
daddu v0,t1,t0
|
|
jr ra
|
|
END(bn_mul_words)
|
|
|
|
.align 5
|
|
/*
 * bn_sqr_words
 *
 * Squares each of a2 64-bit words read from a1 and stores the 128-bit
 * result as two consecutive words (low half first) at a0.  The output
 * is therefore twice as long as the input; no carries cross words.
 *
 *   a0  destination pointer (two words written per input word)
 *   a1  source pointer
 *   a2  number of input words; nothing is stored when a2 <= 0
 *   v0  always zero on return
 *
 * Scratch: t0-t3, ta0-ta3 and the MINUS4 register.
 */
LEAF(bn_sqr_words)
|
|
.set noreorder
|
|
/* Branch-likely: the first input word is preloaded in the delay slot
 * only when the branch is taken. */
bgtzl a2,.L_bn_sqr_words_proceed
|
|
ld t0,0(a1)
|
|
jr ra
|
|
move v0,zero
|
|
.set reorder
|
|
|
|
.L_bn_sqr_words_proceed:
|
|
li MINUS4,-4
|
|
/* ta0 = count rounded down to a multiple of 4. */
and ta0,a2,MINUS4
|
|
/* v0 is cleared here and not modified afterwards, which is why the
 * final tail path can return without setting it again. */
move v0,zero
|
|
beqz ta0,.L_bn_sqr_words_tail
|
|
|
|
/* Four input words (eight output words) per iteration. */
.L_bn_sqr_words_loop:
|
|
dmultu t0,t0
|
|
ld t2,8(a1)
|
|
ld ta0,16(a1)
|
|
ld ta2,24(a1)
|
|
mflo t1
|
|
mfhi t0
|
|
sd t1,0(a0)
|
|
sd t0,8(a0)
|
|
|
|
/* Second word; a0 advances by 64 bytes and a1 by 32 here, so the
 * remaining stores use negative offsets from the new a0. */
dmultu t2,t2
|
|
subu a2,4
|
|
PTR_ADD a0,64
|
|
PTR_ADD a1,32
|
|
mflo t3
|
|
mfhi t2
|
|
sd t3,-48(a0)
|
|
sd t2,-40(a0)
|
|
|
|
dmultu ta0,ta0
|
|
mflo ta1
|
|
mfhi ta0
|
|
sd ta1,-32(a0)
|
|
sd ta0,-24(a0)
|
|
|
|
|
|
/* Fourth word; ta0 becomes the remaining count rounded down to a
 * multiple of 4 for the loop test. */
dmultu ta2,ta2
|
|
and ta0,a2,MINUS4
|
|
mflo ta3
|
|
mfhi ta2
|
|
sd ta3,-16(a0)
|
|
sd ta2,-8(a0)
|
|
|
|
.set noreorder
|
|
bgtzl ta0,.L_bn_sqr_words_loop
|
|
ld t0,0(a1)
|
|
|
|
/* One to three words left: handle them in the tail. */
bnezl a2,.L_bn_sqr_words_tail
|
|
ld t0,0(a1)
|
|
.set reorder
|
|
|
|
.L_bn_sqr_words_return:
|
|
move v0,zero
|
|
jr ra
|
|
|
|
/* Tail: up to three single-word squarings; t0 holds the first word. */
.L_bn_sqr_words_tail:
|
|
dmultu t0,t0
|
|
subu a2,1
|
|
mflo t1
|
|
mfhi t0
|
|
sd t1,0(a0)
|
|
sd t0,8(a0)
|
|
beqz a2,.L_bn_sqr_words_return
|
|
|
|
ld t0,8(a1)
|
|
dmultu t0,t0
|
|
subu a2,1
|
|
mflo t1
|
|
mfhi t0
|
|
sd t1,16(a0)
|
|
sd t0,24(a0)
|
|
beqz a2,.L_bn_sqr_words_return
|
|
|
|
/* Third and last possible tail word. */
ld t0,16(a1)
|
|
dmultu t0,t0
|
|
mflo t1
|
|
mfhi t0
|
|
sd t1,32(a0)
|
|
sd t0,40(a0)
|
|
jr ra
|
|
END(bn_sqr_words)
|
|
|
|
.align 5
|
|
/*
 * bn_add_words
 *
 * Adds two arrays of a3 64-bit words with carry propagation and stores
 * the sums at a0.
 *
 *   a0  destination pointer
 *   a1  first source pointer
 *   a2  second source pointer
 *   a3  number of words; the routine returns 0 at once when a3 <= 0
 *   v0  returned: carry out of the last word
 *
 * Each word is summed in two steps (source + source, then + carry-in);
 * the carry-outs of the two steps are added together to form the next
 * carry.  Scratch: AT, t0-t3, ta0-ta3, t8, t9 and the MINUS4 register.
 */
LEAF(bn_add_words)
|
|
.set noreorder
|
|
/* Branch-likely: the first word from a1 is preloaded in the delay slot
 * only when the branch is taken. */
bgtzl a3,.L_bn_add_words_proceed
|
|
ld t0,0(a1)
|
|
jr ra
|
|
move v0,zero
|
|
.set reorder
|
|
|
|
.L_bn_add_words_proceed:
|
|
li MINUS4,-4
|
|
/* AT = count rounded down to a multiple of 4. */
and AT,a3,MINUS4
|
|
move v0,zero
|
|
beqz AT,.L_bn_add_words_tail
|
|
|
|
/* Four words per iteration.  Loads are interleaved with the count and
 * pointer updates; after the PTR_ADDs the accesses use negative
 * offsets. */
.L_bn_add_words_loop:
|
|
ld ta0,0(a2)
|
|
subu a3,4
|
|
ld t1,8(a1)
|
|
and AT,a3,MINUS4
|
|
ld t2,16(a1)
|
|
PTR_ADD a2,32
|
|
ld t3,24(a1)
|
|
PTR_ADD a0,32
|
|
ld ta1,-24(a2)
|
|
PTR_ADD a1,32
|
|
ld ta2,-16(a2)
|
|
ld ta3,-8(a2)
|
|
/* Word 0: t8 = carry of source+source, v0 = carry of adding the
 * carry-in; their sum is the carry into the next word. */
daddu ta0,t0
|
|
sltu t8,ta0,t0
|
|
daddu t0,ta0,v0
|
|
sltu v0,t0,ta0
|
|
sd t0,-32(a0)
|
|
daddu v0,t8
|
|
|
|
/* Word 1. */
daddu ta1,t1
|
|
sltu t9,ta1,t1
|
|
daddu t1,ta1,v0
|
|
sltu v0,t1,ta1
|
|
sd t1,-24(a0)
|
|
daddu v0,t9
|
|
|
|
/* Word 2. */
daddu ta2,t2
|
|
sltu t8,ta2,t2
|
|
daddu t2,ta2,v0
|
|
sltu v0,t2,ta2
|
|
sd t2,-16(a0)
|
|
daddu v0,t8
|
|
|
|
/* Word 3. */
daddu ta3,t3
|
|
sltu t9,ta3,t3
|
|
daddu t3,ta3,v0
|
|
sltu v0,t3,ta3
|
|
sd t3,-8(a0)
|
|
daddu v0,t9
|
|
|
|
.set noreorder
|
|
/* Loop while at least four words remain (next word preloaded in the
 * taken-only delay slot). */
bgtzl AT,.L_bn_add_words_loop
|
|
ld t0,0(a1)
|
|
|
|
/* One to three words left: handle them in the tail. */
bnezl a3,.L_bn_add_words_tail
|
|
ld t0,0(a1)
|
|
.set reorder
|
|
|
|
.L_bn_add_words_return:
|
|
jr ra
|
|
|
|
/* Tail: up to three single-word additions; t0 holds the first word
 * from a1 on entry. */
.L_bn_add_words_tail:
|
|
ld ta0,0(a2)
|
|
daddu ta0,t0
|
|
subu a3,1
|
|
sltu t8,ta0,t0
|
|
daddu t0,ta0,v0
|
|
sltu v0,t0,ta0
|
|
sd t0,0(a0)
|
|
daddu v0,t8
|
|
beqz a3,.L_bn_add_words_return
|
|
|
|
ld t1,8(a1)
|
|
ld ta1,8(a2)
|
|
daddu ta1,t1
|
|
subu a3,1
|
|
sltu t9,ta1,t1
|
|
daddu t1,ta1,v0
|
|
sltu v0,t1,ta1
|
|
sd t1,8(a0)
|
|
daddu v0,t9
|
|
beqz a3,.L_bn_add_words_return
|
|
|
|
/* Third and last possible tail word. */
ld t2,16(a1)
|
|
ld ta2,16(a2)
|
|
daddu ta2,t2
|
|
sltu t8,ta2,t2
|
|
daddu t2,ta2,v0
|
|
sltu v0,t2,ta2
|
|
sd t2,16(a0)
|
|
daddu v0,t8
|
|
jr ra
|
|
END(bn_add_words)
|
|
|
|
.align 5
|
|
/*
 * bn_sub_words
 *
 * Subtracts the a3 words at a2 from the a3 words at a1 with borrow
 * propagation and stores the differences at a0.
 *
 *   a0  destination pointer
 *   a1  minuend pointer
 *   a2  subtrahend pointer
 *   a3  number of words; the routine returns 0 at once when a3 <= 0
 *   v0  returned: borrow out of the last word
 *
 * Per word: the "source < source" comparison gives the borrow of the
 * plain difference; the incoming borrow is then subtracted and stored.
 * MOVNZ replaces the running borrow with the comparison result only when
 * the plain difference is non-zero; when it is zero the incoming borrow
 * is kept unchanged.
 * Scratch: AT, t0-t3, ta0-ta3, t8, t9 and the MINUS4 register.
 */
LEAF(bn_sub_words)
|
|
.set noreorder
|
|
/* Branch-likely: the first minuend word is preloaded in the delay slot
 * only when the branch is taken. */
bgtzl a3,.L_bn_sub_words_proceed
|
|
ld t0,0(a1)
|
|
jr ra
|
|
move v0,zero
|
|
.set reorder
|
|
|
|
.L_bn_sub_words_proceed:
|
|
li MINUS4,-4
|
|
/* AT = count rounded down to a multiple of 4. */
and AT,a3,MINUS4
|
|
move v0,zero
|
|
beqz AT,.L_bn_sub_words_tail
|
|
|
|
/* Four words per iteration.  Loads are interleaved with the count and
 * pointer updates; after the PTR_ADDs the accesses use negative
 * offsets. */
.L_bn_sub_words_loop:
|
|
ld ta0,0(a2)
|
|
subu a3,4
|
|
ld t1,8(a1)
|
|
and AT,a3,MINUS4
|
|
ld t2,16(a1)
|
|
PTR_ADD a2,32
|
|
ld t3,24(a1)
|
|
PTR_ADD a0,32
|
|
ld ta1,-24(a2)
|
|
PTR_ADD a1,32
|
|
ld ta2,-16(a2)
|
|
ld ta3,-8(a2)
|
|
/* Word 0: t8 = borrow of the plain difference, t0 = plain difference,
 * ta0 = difference minus borrow-in (the stored result). */
sltu t8,t0,ta0
|
|
dsubu t0,ta0
|
|
dsubu ta0,t0,v0
|
|
sd ta0,-32(a0)
|
|
/* If the plain difference is non-zero the new borrow is t8. */
MOVNZ (t0,v0,t8)
|
|
|
|
/* Word 1. */
sltu t9,t1,ta1
|
|
dsubu t1,ta1
|
|
dsubu ta1,t1,v0
|
|
sd ta1,-24(a0)
|
|
MOVNZ (t1,v0,t9)
|
|
|
|
|
|
/* Word 2. */
sltu t8,t2,ta2
|
|
dsubu t2,ta2
|
|
dsubu ta2,t2,v0
|
|
sd ta2,-16(a0)
|
|
MOVNZ (t2,v0,t8)
|
|
|
|
/* Word 3. */
sltu t9,t3,ta3
|
|
dsubu t3,ta3
|
|
dsubu ta3,t3,v0
|
|
sd ta3,-8(a0)
|
|
MOVNZ (t3,v0,t9)
|
|
|
|
.set noreorder
|
|
/* Loop while at least four words remain (next word preloaded in the
 * taken-only delay slot). */
bgtzl AT,.L_bn_sub_words_loop
|
|
ld t0,0(a1)
|
|
|
|
/* One to three words left: handle them in the tail. */
bnezl a3,.L_bn_sub_words_tail
|
|
ld t0,0(a1)
|
|
.set reorder
|
|
|
|
.L_bn_sub_words_return:
|
|
jr ra
|
|
|
|
/* Tail: up to three single-word subtractions; t0 holds the first
 * minuend word on entry. */
.L_bn_sub_words_tail:
|
|
ld ta0,0(a2)
|
|
subu a3,1
|
|
sltu t8,t0,ta0
|
|
dsubu t0,ta0
|
|
dsubu ta0,t0,v0
|
|
MOVNZ (t0,v0,t8)
|
|
sd ta0,0(a0)
|
|
beqz a3,.L_bn_sub_words_return
|
|
|
|
ld t1,8(a1)
|
|
subu a3,1
|
|
ld ta1,8(a2)
|
|
sltu t9,t1,ta1
|
|
dsubu t1,ta1
|
|
dsubu ta1,t1,v0
|
|
MOVNZ (t1,v0,t9)
|
|
sd ta1,8(a0)
|
|
beqz a3,.L_bn_sub_words_return
|
|
|
|
/* Third and last possible tail word. */
ld t2,16(a1)
|
|
ld ta2,16(a2)
|
|
sltu t8,t2,ta2
|
|
dsubu t2,ta2
|
|
dsubu ta2,t2,v0
|
|
MOVNZ (t2,v0,t8)
|
|
sd ta2,16(a0)
|
|
jr ra
|
|
END(bn_sub_words)
|
|
|
|
#undef MINUS4
|
|
|
|
.align 5
|
|
/*
 * bn_div_3_words
 *
 * Produces a quotient estimate from three consecutive dividend words and
 * two divisor words.
 *
 *   a0  pointer to the top dividend word; the words at 0, -8 and -16
 *       bytes from it are read
 *   a1  second divisor word (used only in the correction loop)
 *   a2  top divisor word
 *   v0  returned: quotient estimate, or -1 (all ones) when the top
 *       dividend word equals a2
 *
 * The routine calls bn_div_words on the two upper dividend words and a2
 * and then decrements the returned quotient while a 128-bit comparison
 * against quotient * a1 shows it is still too large.  It relies on
 * bn_div_words leaving a3, ta2 and ta3 untouched, preserving a2 and
 * returning the remainder in v1, so no stack frame is needed.
 */
LEAF(bn_div_3_words)
|
|
.set reorder
|
|
move a3,a0 /* we know that bn_div_words doesn't
|
|
* touch a3, ta2, ta3 and preserves a2
|
|
* so that we can save two arguments
|
|
* and return address in registers
|
|
* instead of stack:-)
|
|
*/
|
|
/* a0 = top dividend word, a1 = next dividend word; the caller's a1 is
 * parked in ta2. */
ld a0,(a3)
|
|
move ta2,a1
|
|
ld a1,-8(a3)
|
|
/* Top dividend word equal to the divisor word: return all ones. */
bne a0,a2,.L_bn_div_3_words_proceed
|
|
li v0,-1
|
|
jr ra
|
|
.L_bn_div_3_words_proceed:
|
|
/* Keep the return address in ta3 across the call. */
move ta3,ra
|
|
bal bn_div_words
|
|
move ra,ta3
|
|
/* t1:t0 = quotient estimate * second divisor word;
 * t2 = third dividend word; v1 = remainder from bn_div_words. */
dmultu ta2,v0
|
|
ld t2,-16(a3)
|
|
move ta0,zero
|
|
mfhi t1
|
|
mflo t0
|
|
/* t8 != 0 ends the loop: high product half is below the remainder. */
sltu t8,t1,v1
|
|
.L_bn_div_3_words_inner_loop:
|
|
bnez t8,.L_bn_div_3_words_inner_loop_done
|
|
/* AT = (t1 == v1) && (t2 >= t0): the product does not exceed
 * remainder:third-word, so the estimate is accepted. */
sgeu AT,t2,t0
|
|
seq t9,t1,v1
|
|
and AT,t9
|
|
/* Prepare the next round: remainder += a2, product -= a1 (128-bit
 * subtract with borrow t3); the loop also ends once the remainder
 * addition overflows (ta0). */
sltu t3,t0,ta2
|
|
daddu v1,a2
|
|
dsubu t1,t3
|
|
dsubu t0,ta2
|
|
sltu t8,t1,v1
|
|
sltu ta0,v1,a2
|
|
or t8,ta0
|
|
.set noreorder
|
|
/* Estimate still too large: decrement it (delay slot runs only when
 * the branch is taken) and try again. */
beqzl AT,.L_bn_div_3_words_inner_loop
|
|
dsubu v0,1
|
|
.set reorder
|
|
.L_bn_div_3_words_inner_loop_done:
|
|
jr ra
|
|
END(bn_div_3_words)
|
|
|
|
.align 5
|
|
/*
 * bn_div_words
 *
 * Divides the 128-bit value a0:a1 (a0 = high word) by the 64-bit
 * divisor a2.
 *
 *   a0  high dividend word
 *   a1  low dividend word
 *   a2  divisor; restored to its original value before returning
 *   v0  returned: 64-bit quotient, or -1 when a2 is zero
 *   v1  remainder on return (zero on the divide-by-zero path is not
 *       guaranteed; v1 is cleared only when a2 != 0)
 *
 * The divisor is first shifted left until its top bit is set (shift
 * count kept in t9) and the dividend is shifted by the same amount;
 * "break 6" is raised if that shift would lose set bits of a0.  The
 * quotient is then produced as two 32-bit digits: each digit is
 * estimated with ddivu by the upper 32 bits of the divisor and
 * decremented in an inner loop until digit * divisor no longer exceeds
 * the current partial dividend.
 *
 * Scratch: AT, t0-t3, t8, t9, ta0 (QT), ta1 (HH).  a3, ta2 and ta3 are
 * not touched, which bn_div_3_words depends on.
 */
LEAF(bn_div_words)
|
|
.set noreorder
|
|
/* Non-zero divisor: continue, clearing v1 in the taken-only delay
 * slot.  Zero divisor: return -1. */
bnezl a2,.L_bn_div_words_proceed
|
|
move v1,zero
|
|
jr ra
|
|
li v0,-1 /* I'd rather signal div-by-zero
|
|
* which can be done with 'break 7' */
|
|
|
|
.L_bn_div_words_proceed:
|
|
/* Top bit already set: no normalisation needed.  The delay slot
 * (always executed) sets the shift count t9 to zero. */
bltz a2,.L_bn_div_words_body
|
|
move t9,v1
|
|
/* Shift the divisor left one bit at a time until its top bit is set,
 * counting the shifts in t9 (the increment sits in the delay slot of
 * the backward branch to the dsll). */
dsll a2,1
|
|
bgtz a2,.-4
|
|
addu t9,1
|
|
|
|
.set reorder
|
|
/* t1 = -t9, used as a shift amount (effectively 64 - t9).
 * t2 = the top t9 bits of a0, i.e. the bits the shift would lose.
 * AT = the top t9 bits of a1, to be moved into a0. */
negu t1,t9
|
|
li t2,-1
|
|
dsll t2,t1
|
|
and t2,a0
|
|
dsrl AT,a1,t1
|
|
.set noreorder
|
|
/* The break in the delay slot runs only when t2 != 0. */
bnezl t2,.+8
|
|
break 6 /* signal overflow */
|
|
.set reorder
|
|
/* Shift the dividend left by t9 bits as a 128-bit quantity. */
dsll a0,t9
|
|
dsll a1,t9
|
|
or a0,AT
|
|
|
|
/* QT = current quotient digit, HH = upper 32 bits of the partial
 * dividend, DH = upper 32 bits of the normalised divisor. */
#define QT ta0
|
|
#define HH ta1
|
|
#define DH v1
|
|
.L_bn_div_words_body:
|
|
dsrl DH,a2,32
|
|
/* If a0 >= divisor, subtract the divisor once (delay slot runs only
 * when the branch is taken). */
sgeu AT,a0,a2
|
|
.set noreorder
|
|
bnezl AT,.+8
|
|
dsubu a0,a2
|
|
.set reorder
|
|
|
|
/* First (upper) quotient digit: default estimate 0xffffffff, replaced
 * by a0 / DH unless the upper halves are equal. */
li QT,-1
|
|
dsrl HH,a0,32
|
|
dsrl QT,32 /* q=0xffffffff */
|
|
beq DH,HH,.L_bn_div_words_skip_div1
|
|
ddivu zero,a0,DH
|
|
mflo QT
|
|
.L_bn_div_words_skip_div1:
|
|
/* t1:t0 = divisor * QT; t3 = next 64 bits of the partial dividend
 * (low half of a0 followed by high half of a1). */
dmultu a2,QT
|
|
dsll t3,a0,32
|
|
dsrl AT,a1,32
|
|
or t3,AT
|
|
mflo t0
|
|
mfhi t1
|
|
/* While HH:t3 < t1:t0, subtract the divisor from the product and
 * decrement QT.  v0 serves as the borrow of the 128-bit subtraction. */
.L_bn_div_words_inner_loop1:
|
|
sltu t2,t3,t0
|
|
seq t8,HH,t1
|
|
sltu AT,HH,t1
|
|
and t2,t8
|
|
sltu v0,t0,a2
|
|
or AT,t2
|
|
.set noreorder
|
|
/* The dsubu in the beqz delay slot executes on both paths. */
beqz AT,.L_bn_div_words_inner_loop1_done
|
|
dsubu t1,v0
|
|
dsubu t0,a2
|
|
b .L_bn_div_words_inner_loop1
|
|
dsubu QT,1
|
|
.set reorder
|
|
.L_bn_div_words_inner_loop1_done:
|
|
|
|
/* New partial dividend in a0, remaining low bits moved up in a1, and
 * the first digit placed in the upper half of v0. */
dsll a1,32
|
|
dsubu a0,t3,t0
|
|
dsll v0,QT,32
|
|
|
|
/* Second (lower) quotient digit, estimated the same way. */
li QT,-1
|
|
dsrl HH,a0,32
|
|
dsrl QT,32 /* q=0xffffffff */
|
|
beq DH,HH,.L_bn_div_words_skip_div2
|
|
ddivu zero,a0,DH
|
|
mflo QT
|
|
.L_bn_div_words_skip_div2:
|
|
/* DH is no longer needed; v1 is reused as the borrow temporary in the
 * second inner loop because v0 now holds the upper digit. */
#undef DH
|
|
dmultu a2,QT
|
|
dsll t3,a0,32
|
|
dsrl AT,a1,32
|
|
or t3,AT
|
|
mflo t0
|
|
mfhi t1
|
|
.L_bn_div_words_inner_loop2:
|
|
sltu t2,t3,t0
|
|
seq t8,HH,t1
|
|
sltu AT,HH,t1
|
|
and t2,t8
|
|
sltu v1,t0,a2
|
|
or AT,t2
|
|
.set noreorder
|
|
beqz AT,.L_bn_div_words_inner_loop2_done
|
|
dsubu t1,v1
|
|
dsubu t0,a2
|
|
b .L_bn_div_words_inner_loop2
|
|
dsubu QT,1
|
|
.set reorder
|
|
.L_bn_div_words_inner_loop2_done:
|
|
#undef HH
|
|
|
|
/* Combine the two digits into v0 and undo the normalisation shift on
 * the remainder and on the divisor. */
dsubu a0,t3,t0
|
|
or v0,QT
|
|
dsrl v1,a0,t9 /* v1 contains remainder if anybody wants it */
|
|
dsrl a2,t9 /* restore a2 */
|
|
jr ra
|
|
#undef QT
|
|
END(bn_div_words)
|
|
|
|
/*
 * Register aliases used by the comba routines below.
 * a_N / b_N hold word N of the two input arrays, t_1/t_2 receive the
 * low/high halves of each product, and c_1..c_3 are the three rotating
 * column accumulators.
 */
#define a_0 t0
|
|
#define a_1 t1
|
|
#define a_2 t2
|
|
#define a_3 t3
|
|
#define b_0 ta0
|
|
#define b_1 ta1
|
|
#define b_2 ta2
|
|
#define b_3 ta3
|
|
|
|
/* Words 4..6 live in s-registers, which bn_mul_comba8 saves on entry
 * and restores before returning; word 7 reuses the pointer register
 * it was loaded through. */
#define a_4 s0
|
|
#define a_5 s2
|
|
#define a_6 s4
|
|
#define a_7 a1 /* once we load a[7] we don't need a anymore */
|
|
#define b_4 s1
|
|
#define b_5 s3
|
|
#define b_6 s5
|
|
#define b_7 a2 /* once we load b[7] we don't need b anymore */
|
|
|
|
#define t_1 t8
|
|
#define t_2 t9
|
|
|
|
#define c_1 v0
|
|
#define c_2 v1
|
|
#define c_3 a3
|
|
|
|
/* Bytes of stack used by bn_mul_comba8 to save s0-s5 (6 * 8). */
#define FRAME_SIZE 48
|
|
|
|
.align 5
|
|
/*
 * bn_mul_comba8
 *
 * 8 x 8 word multiplication: reads eight 64-bit words a[0..7] through a1
 * and eight words b[0..7] through a2 and stores the sixteen-word product
 * r[0..15] at a0.
 *
 *   a0  result pointer (16 words written)
 *   a1  pointer to a[]; overwritten with a[7] once the other words are loaded
 *   a2  pointer to b[]; overwritten with b[7] once the other words are loaded
 *
 * The product is built one result column at a time.  For every partial
 * product the low half (t_1) is added to the first accumulator named in
 * the mul_add_c comment, the resulting carry is folded into the high
 * half (t_2), which is added to the second accumulator, and the carry
 * out of that addition is recorded in the third accumulator (set with
 * sltu where the accumulator is being started, added otherwise).  After
 * each column the finished accumulator is stored and the roles rotate.
 *
 * s0-s5 hold a[4..6] and b[4..6]; they are saved in a FRAME_SIZE-byte
 * stack frame on entry and reloaded before the frame is released.
 * Also uses AT, t_1, t_2 and c_1..c_3 (v0, v1, a3).
 */
LEAF(bn_mul_comba8)
|
|
.set noreorder
|
|
PTR_SUB sp,FRAME_SIZE
|
|
/* The declared frame size must match the amount subtracted from sp
 * above and added back at the end of the routine. */
.frame sp,FRAME_SIZE,ra
|
|
.set reorder
|
|
ld a_0,0(a1) /* If compiled with -mips3 option on
|
|
* R5000 box assembler barks on this
|
|
* line with "shouldn't have mult/div
|
|
* as last instruction in bb (R10K
|
|
* bug)" warning. If anybody out there
|
|
* has a clue about how to circumvent
|
|
* this do send me a note.
|
|
* <appro@fy.chalmers.se>
|
|
*/
|
|
ld b_0,0(a2)
|
|
ld a_1,8(a1)
|
|
ld a_2,16(a1)
|
|
ld a_3,24(a1)
|
|
ld b_1,8(a2)
|
|
ld b_2,16(a2)
|
|
ld b_3,24(a2)
|
|
dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
|
|
/* Save the s-registers that are about to receive a[4..6], b[4..6]. */
sd s0,0(sp)
|
|
sd s1,8(sp)
|
|
sd s2,16(sp)
|
|
sd s3,24(sp)
|
|
sd s4,32(sp)
|
|
sd s5,40(sp)
|
|
mflo c_1
|
|
mfhi c_2
|
|
|
|
dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
|
|
/* Load the upper input words; a[7] replaces the a pointer in a1. */
ld a_4,32(a1)
|
|
ld a_5,40(a1)
|
|
ld a_6,48(a1)
|
|
ld a_7,56(a1)
|
|
ld b_4,32(a2)
|
|
ld b_5,40(a2)
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu c_3,t_2,AT
|
|
dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
|
|
/* b[7] replaces the b pointer in a2; it is the last load through a2. */
ld b_6,48(a2)
|
|
ld b_7,56(a2)
|
|
sd c_1,0(a0) /* r[0]=c1; */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
sd c_2,8(a0) /* r[1]=c2; */
|
|
|
|
dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu c_2,c_1,t_2
|
|
dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
sd c_3,16(a0) /* r[2]=c3; */
|
|
|
|
dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu c_3,c_2,t_2
|
|
dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
sd c_1,24(a0) /* r[3]=c1; */
|
|
|
|
dmultu a_4,b_0 /* mul_add_c(a[4],b[0],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_0,b_4 /* mul_add_c(a[0],b[4],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
sd c_2,32(a0) /* r[4]=c2; */
|
|
|
|
dmultu a_0,b_5 /* mul_add_c(a[0],b[5],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu c_2,c_1,t_2
|
|
dmultu a_1,b_4 /* mul_add_c(a[1],b[4],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_4,b_1 /* mul_add_c(a[4],b[1],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_5,b_0 /* mul_add_c(a[5],b[0],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
sd c_3,40(a0) /* r[5]=c3; */
|
|
|
|
dmultu a_6,b_0 /* mul_add_c(a[6],b[0],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu c_3,c_2,t_2
|
|
dmultu a_5,b_1 /* mul_add_c(a[5],b[1],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_4,b_2 /* mul_add_c(a[4],b[2],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_2,b_4 /* mul_add_c(a[2],b[4],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_1,b_5 /* mul_add_c(a[1],b[5],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_0,b_6 /* mul_add_c(a[0],b[6],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
sd c_1,48(a0) /* r[6]=c1; */
|
|
|
|
dmultu a_0,b_7 /* mul_add_c(a[0],b[7],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
dmultu a_1,b_6 /* mul_add_c(a[1],b[6],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_2,b_5 /* mul_add_c(a[2],b[5],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_3,b_4 /* mul_add_c(a[3],b[4],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_4,b_3 /* mul_add_c(a[4],b[3],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_5,b_2 /* mul_add_c(a[5],b[2],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_6,b_1 /* mul_add_c(a[6],b[1],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_7,b_0 /* mul_add_c(a[7],b[0],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
sd c_2,56(a0) /* r[7]=c2; */
|
|
|
|
dmultu a_7,b_1 /* mul_add_c(a[7],b[1],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu c_2,c_1,t_2
|
|
dmultu a_6,b_2 /* mul_add_c(a[6],b[2],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_5,b_3 /* mul_add_c(a[5],b[3],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_4,b_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_3,b_5 /* mul_add_c(a[3],b[5],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_2,b_6 /* mul_add_c(a[2],b[6],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_1,b_7 /* mul_add_c(a[1],b[7],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
sd c_3,64(a0) /* r[8]=c3; */
|
|
|
|
dmultu a_2,b_7 /* mul_add_c(a[2],b[7],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu c_3,c_2,t_2
|
|
dmultu a_3,b_6 /* mul_add_c(a[3],b[6],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_4,b_5 /* mul_add_c(a[4],b[5],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_5,b_4 /* mul_add_c(a[5],b[4],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_6,b_3 /* mul_add_c(a[6],b[3],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_7,b_2 /* mul_add_c(a[7],b[2],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
sd c_1,72(a0) /* r[9]=c1; */
|
|
|
|
dmultu a_7,b_3 /* mul_add_c(a[7],b[3],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
dmultu a_6,b_4 /* mul_add_c(a[6],b[4],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_5,b_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_4,b_6 /* mul_add_c(a[4],b[6],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_3,b_7 /* mul_add_c(a[3],b[7],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
sd c_2,80(a0) /* r[10]=c2; */
|
|
|
|
dmultu a_4,b_7 /* mul_add_c(a[4],b[7],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu c_2,c_1,t_2
|
|
dmultu a_5,b_6 /* mul_add_c(a[5],b[6],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_6,b_5 /* mul_add_c(a[6],b[5],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
dmultu a_7,b_4 /* mul_add_c(a[7],b[4],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
sd c_3,88(a0) /* r[11]=c3; */
|
|
|
|
dmultu a_7,b_5 /* mul_add_c(a[7],b[5],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu c_3,c_2,t_2
|
|
dmultu a_6,b_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_5,b_7 /* mul_add_c(a[5],b[7],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
sd c_1,96(a0) /* r[12]=c1; */
|
|
|
|
dmultu a_6,b_7 /* mul_add_c(a[6],b[7],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
dmultu a_7,b_6 /* mul_add_c(a[7],b[6],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
sd c_2,104(a0) /* r[13]=c2; */
|
|
|
|
dmultu a_7,b_7 /* mul_add_c(a[7],b[7],c3,c1,c2); */
|
|
/* a[4..6] and b[4..6] are no longer needed: restore the caller's
 * s-registers while the last multiplication completes. */
ld s0,0(sp)
|
|
ld s1,8(sp)
|
|
ld s2,16(sp)
|
|
ld s3,24(sp)
|
|
ld s4,32(sp)
|
|
ld s5,40(sp)
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sd c_3,112(a0) /* r[14]=c3; */
|
|
sd c_1,120(a0) /* r[15]=c1; */
|
|
|
|
/* Release the register save area. */
PTR_ADD sp,FRAME_SIZE
|
|
|
|
jr ra
|
|
END(bn_mul_comba8)
|
|
|
|
.align 5
|
|
/*
 * bn_mul_comba4
 *
 * 4 x 4 word multiplication: reads four 64-bit words a[0..3] through a1
 * and four words b[0..3] through a2 and stores the eight-word product
 * r[0..7] at a0.
 *
 *   a0  result pointer (8 words written)
 *   a1  pointer to a[]
 *   a2  pointer to b[]
 *
 * The product is built one result column at a time.  For every partial
 * product the low half (t_1) is added to the first accumulator named in
 * the mul_add_c comment, the resulting carry is folded into the high
 * half (t_2), which is added to the second accumulator, and the carry
 * out of that addition is recorded in the third accumulator (set with
 * sltu where the accumulator is being started, added otherwise).  After
 * each column the finished accumulator is stored and the roles rotate.
 *
 * Only a_0..a_3, b_0..b_3, t_1, t_2, c_1..c_3 and AT are used, so no
 * registers are saved and no stack frame is set up.
 */
LEAF(bn_mul_comba4)
|
|
.set reorder
|
|
ld a_0,0(a1)
|
|
ld b_0,0(a2)
|
|
ld a_1,8(a1)
|
|
ld a_2,16(a1)
|
|
dmultu a_0,b_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */
|
|
ld a_3,24(a1)
|
|
ld b_1,8(a2)
|
|
ld b_2,16(a2)
|
|
ld b_3,24(a2)
|
|
mflo c_1
|
|
mfhi c_2
|
|
/* r[0] = c1 */
sd c_1,0(a0)
|
|
|
|
dmultu a_0,b_1 /* mul_add_c(a[0],b[1],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu c_3,t_2,AT
|
|
dmultu a_1,b_0 /* mul_add_c(a[1],b[0],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
/* r[1] = c2 */
sd c_2,8(a0)
|
|
|
|
dmultu a_2,b_0 /* mul_add_c(a[2],b[0],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
dmultu a_1,b_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu c_2,c_1,t_2
|
|
dmultu a_0,b_2 /* mul_add_c(a[0],b[2],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
/* r[2] = c3 */
sd c_3,16(a0)
|
|
|
|
dmultu a_0,b_3 /* mul_add_c(a[0],b[3],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu c_3,c_2,t_2
|
|
dmultu a_1,b_2 /* mul_add_c(a[1],b[2],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_2,b_1 /* mul_add_c(a[2],b[1],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
dmultu a_3,b_0 /* mul_add_c(a[3],b[0],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
sltu AT,c_2,t_2
|
|
daddu c_3,AT
|
|
/* r[3] = c1 */
sd c_1,24(a0)
|
|
|
|
dmultu a_3,b_1 /* mul_add_c(a[3],b[1],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu c_1,c_3,t_2
|
|
dmultu a_2,b_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
dmultu a_1,b_3 /* mul_add_c(a[1],b[3],c2,c3,c1); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_2,t_1
|
|
sltu AT,c_2,t_1
|
|
daddu t_2,AT
|
|
daddu c_3,t_2
|
|
sltu AT,c_3,t_2
|
|
daddu c_1,AT
|
|
/* r[4] = c2 */
sd c_2,32(a0)
|
|
|
|
dmultu a_2,b_3 /* mul_add_c(a[2],b[3],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu c_2,c_1,t_2
|
|
dmultu a_3,b_2 /* mul_add_c(a[3],b[2],c3,c1,c2); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_3,t_1
|
|
sltu AT,c_3,t_1
|
|
daddu t_2,AT
|
|
daddu c_1,t_2
|
|
sltu AT,c_1,t_2
|
|
daddu c_2,AT
|
|
/* r[5] = c3 */
sd c_3,40(a0)
|
|
|
|
dmultu a_3,b_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */
|
|
mflo t_1
|
|
mfhi t_2
|
|
daddu c_1,t_1
|
|
sltu AT,c_1,t_1
|
|
daddu t_2,AT
|
|
daddu c_2,t_2
|
|
/* r[6] = c1, r[7] = c2 */
sd c_1,48(a0)
|
|
sd c_2,56(a0)
|
|
|
|
jr ra
|
|
END(bn_mul_comba4)
|
|
|
|
/*
 * Re-map a[4..7] for the squaring routine that follows: it loads only
 * one input array, so the registers previously named b_0..b_3 are free
 * to hold a[4..7] instead of the s-registers and a1 used by
 * bn_mul_comba8.
 */
#undef a_4
|
|
#undef a_5
|
|
#undef a_6
|
|
#undef a_7
|
|
#define a_4 b_0
|
|
#define a_5 b_1
|
|
#define a_6 b_2
|
|
#define a_7 b_3
|
|
|
|
.align	5
/*
 * bn_sqr_comba8
 *
 * In:   a0 -> result; sixteen 64-bit words are stored at 0(a0)..120(a0)
 *       a1 -> input;  eight 64-bit words are loaded from 0(a1)..56(a1)
 * Uses: a_0..a_7 (input words), c_1/c_2/c_3 (running column sum),
 *       t_1/t_2 (LO/HI of the current product), AT and a2 (scratch).
 *
 * The square is produced column by column (comba method).  For every
 * column c_1, c_2, c_3 rotate through the roles "low word", "high word"
 * and "top (carry) word"; the comment on each dmultu names the roles in
 * that order.
 *
 *   mul_add_c (x,y,lo,hi,top):  top:hi:lo += x*y
 *   mul_add_c2(x,y,lo,hi,top):  top:hi:lo += 2*x*y
 *
 * mul_add_c2 doubles HI:LO in place: the bit shifted out of HI goes to
 * the top word, the bit shifted out of LO is moved into bit 0 of t_2.
 *
 * Carry handling in mul_add_c2: after the doubling t_2 may be all ones
 * (e.g. x*y = 2^127-2), so adding the carry from the low-word addition
 * (AT) can wrap t_2 to zero.  That wrap is a carry into the top word
 * and is captured with "sltu a2,t_2,AT" right after "daddu t_2,AT".
 * Without it the following "sltu AT,hi,t_2" sees t_2 == 0 and the carry
 * is silently lost.  In mul_add_c t_2 is the raw HI word, which never
 * exceeds 2^64-2, so no such check is needed there.
 */
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	/* column 0 */
	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	/* column 1 */
	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* bit shifted out of HI starts the top word */
	dsll	t_2,1
	slt	a2,t_1,zero	/* bit shifted out of LO ... */
	daddu	t_2,a2		/* ... becomes bit 0 of the doubled HI */
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sltu	AT,c_3,AT	/* 1 iff t_2+AT wrapped around */
	daddu	c_1,AT
	sd	c_2,8(a0)

	/* column 2 */
	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* 1 iff t_2+AT wrapped around */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	/* column 3 */
	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	/* column 4 */
	dmultu	a_4,a_0		/* mul_add_c2(a[4],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	/* column 5 */
	dmultu	a_0,a_5		/* mul_add_c2(a[0],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_4		/* mul_add_c2(a[1],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	/* column 6 */
	dmultu	a_6,a_0		/* mul_add_c2(a[6],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,a_1		/* mul_add_c2(a[5],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_2		/* mul_add_c2(a[4],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)

	/* column 7 */
	dmultu	a_0,a_7		/* mul_add_c2(a[0],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,a_6		/* mul_add_c2(a[1],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_5		/* mul_add_c2(a[2],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,a_4		/* mul_add_c2(a[3],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)

	/* column 8 */
	dmultu	a_7,a_1		/* mul_add_c2(a[7],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,a_2		/* mul_add_c2(a[6],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_3		/* mul_add_c2(a[5],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,a_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)

	/* column 9 */
	dmultu	a_2,a_7		/* mul_add_c2(a[2],b[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,a_6		/* mul_add_c2(a[3],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,a_5		/* mul_add_c2(a[4],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)

	/* column 10 */
	dmultu	a_7,a_3		/* mul_add_c2(a[7],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,a_4		/* mul_add_c2(a[6],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_1,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,a_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)

	/* column 11 */
	dmultu	a_4,a_7		/* mul_add_c2(a[4],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,a_6		/* mul_add_c2(a[5],b[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_2,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)

	/* column 12 */
	dmultu	a_7,a_5		/* mul_add_c2(a[7],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,a_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)

	/* column 13 */
	dmultu	a_6,a_7		/* mul_add_c2(a[6],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)

	/* column 14: last product, only two result words remain */
	dmultu	a_7,a_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)
	sd	c_1,120(a0)

	jr	ra
END(bn_sqr_comba8)
|
|
|
|
.align	5
/*
 * bn_sqr_comba4
 *
 * In:   a0 -> result; eight 64-bit words are stored at 0(a0)..56(a0)
 *       a1 -> input;  four 64-bit words are loaded from 0(a1)..24(a1)
 * Uses: a_0..a_3 (input words), c_1/c_2/c_3 (running column sum),
 *       t_1/t_2 (LO/HI of the current product), AT and a2 (scratch).
 *
 * The square is produced column by column (comba method).  For every
 * column c_1, c_2, c_3 rotate through the roles "low word", "high word"
 * and "top (carry) word"; the comment on each dmultu names the roles in
 * that order.
 *
 *   mul_add_c (x,y,lo,hi,top):  top:hi:lo += x*y
 *   mul_add_c2(x,y,lo,hi,top):  top:hi:lo += 2*x*y
 *
 * mul_add_c2 doubles HI:LO in place: the bit shifted out of HI goes to
 * the top word, the bit shifted out of LO is moved into bit 0 of t_2.
 *
 * Carry handling in mul_add_c2: after the doubling t_2 may be all ones
 * (e.g. x*y = 2^127-2), so adding the carry from the low-word addition
 * (AT) can wrap t_2 to zero.  That wrap is a carry into the top word
 * and is captured with "sltu a2,t_2,AT" right after "daddu t_2,AT".
 * Without it the following "sltu AT,hi,t_2" sees t_2 == 0 and the carry
 * is silently lost.  In mul_add_c t_2 is the raw HI word, which never
 * exceeds 2^64-2, so no such check is needed there.
 */
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	/* column 0 */
	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	/* column 1 */
	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* bit shifted out of HI starts the top word */
	dsll	t_2,1
	slt	a2,t_1,zero	/* bit shifted out of LO ... */
	daddu	t_2,a2		/* ... becomes bit 0 of the doubled HI */
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	sltu	AT,c_3,AT	/* 1 iff t_2+AT wrapped around */
	daddu	c_1,AT
	sd	c_2,8(a0)

	/* column 2 */
	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* 1 iff t_2+AT wrapped around */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	/* column 3 */
	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	/* column 4 */
	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	/* column 5 */
	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	/* column 6: last product, only two result words remain */
	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)
	sd	c_2,56(a0)

	jr	ra
END(bn_sqr_comba4)
|