bn/asm/x86_64-mont.pl: minor optimization [for Decoded ICache].

This commit is contained in:
Author: Andy Polyakov — 2013-10-25 10:12:17 +02:00
Parent commit: ed77017b59
This commit:  4eeb750d20

View file

@ -1685,6 +1685,7 @@ bn_mulx4x_mont:
push %r15
shl \$3,${num}d # convert $num to bytes
.byte 0x67
xor %r10,%r10
mov %rsp,%r11 # put aside %rsp
sub $num,%r10 # -$num
@ -1725,15 +1726,14 @@ $code.=<<___;
mov ($bp),%rdx # b[0], $bp==%rdx actually
lea 64+32(%rsp),$tptr
mov %rdx,$bi
xor $zero,$zero # of=0,cf=0
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
adcx %rax,%r11
add %rax,%r11
mov $bptr,8(%rsp) # off-load &b[i]
mulx 2*8($aptr),%r12,%r13 # ...
adcx %r14,%r12
adcx $zero,%r13
adc %r14,%r12
adc \$0,%r13
mov $mi,$bptr # borrow $bptr
imulq 24(%rsp),$mi # "t[0]"*n0
@ -1751,13 +1751,12 @@ $code.=<<___;
mulx 1*8($nptr),%rax,%r11
adcx %rax,%r10
adox %r12,%r11
mulx 2*8($nptr),%rax,%r12
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
mov 48(%rsp),$bptr # counter value
mov %r10,-4*8($tptr)
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
.byte 0x66,0x66
mov $bi,%rdx
mov %r11,-3*8($tptr)
adcx %rax,%r12
@ -1765,7 +1764,7 @@ $code.=<<___;
lea 4*8($nptr),$nptr
mov %r12,-2*8($tptr)
#jmp .Lmulx4x_1st
jmp .Lmulx4x_1st
.align 32
.Lmulx4x_1st:
@ -1863,7 +1862,6 @@ $code.=<<___;
adox %r12,%r11
mulx 2*8($nptr),%rax,%r12
mov %r10,-4*8($tptr)
mov 0*8($tptr),%r10
adcx %rax,%r11
adox %r13,%r12
mulx 3*8($nptr),%rax,%r15
@ -1872,23 +1870,22 @@ $code.=<<___;
adcx %rax,%r12
adox $zero,%r15 # of=0
mov 48(%rsp),$bptr # counter value
.byte 0x66,0x3e
mov %r12,-2*8($tptr)
.byte 0x66
lea 4*8($nptr),$nptr
jmp .Lmulx4x_inner
#jmp .Lmulx4x_inner
.align 32
.Lmulx4x_inner:
adcx $zero,%r15 # cf=0, modulo-scheduled
adox %r10,%r14
adox 0*8($tptr),%r14
mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
mov 1*8($tptr),%r13
adcx %r14,%r10
mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
adox %rax,%r11
mulx 2*8($aptr),%r12,%rax # ...
adcx %r13,%r11
adcx 1*8($tptr),%r11
adox %r14,%r12
mulx 3*8($aptr),%r13,%r14
mov $mi,%rdx
@ -1896,8 +1893,8 @@ $code.=<<___;
adox %rax,%r13
adcx 3*8($tptr),%r13
adox $zero,%r14 # of=0
.byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
.byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr
lea 4*8($aptr),$aptr
lea 4*8($tptr),$tptr
adcx $zero,%r14 # cf=0
adox %r15,%r10
@ -1909,7 +1906,6 @@ $code.=<<___;
adox %r15,%r12
mulx 2*8($nptr),%rax,%r15
mov %r10,-5*8($tptr)
mov 0*8($tptr),%r10
adcx %rax,%r12
adox %r15,%r13
mulx 3*8($nptr),%rax,%r15
@ -1927,7 +1923,7 @@ $code.=<<___;
mov 0(%rsp),$num # load num
mov 8(%rsp),$bptr # re-load &b[i]
adc $zero,%r15 # modulo-scheduled
sub %r10,$zero # pull top-most carry
sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14
sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr)
@ -1936,10 +1932,10 @@ $code.=<<___;
jne .Lmulx4x_outer
neg $num
xor %rdx,%rdx
mov 32(%rsp),$rptr # restore rp
lea 64(%rsp),$tptr
xor %rdx,%rdx
pxor %xmm0,%xmm0
mov 0*8($nptr,$num),%r8
mov 1*8($nptr,$num),%r9
@ -2022,6 +2018,7 @@ bn_sqrx8x_mont:
push %r15
shl \$3,${num}d # convert $num to bytes
.byte 0x67
xor %r10,%r10
mov %rsp,%r11 # put aside %rsp
sub $num,%r10 # -$num
@ -2043,6 +2040,12 @@ bn_sqrx8x_mont:
movq %r10, %xmm3 # -$num
movq %r11, %xmm4 # save original %rsp
mov $n0, 32(%rsp)
___
$code.=<<___ if ($win64);
jmp .Lsqrx8x_body
.align 32
___
$code.=<<___;
.Lsqrx8x_body:
##################################################################
# Squaring part:
@ -2096,12 +2099,15 @@ $code.=<<___;
mov $aaptr,8(%rsp) # save end of $aptr
jmp .Lsqr8x_zero_start
.align 32
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
.byte 0x3e
movdqa %xmm0,0*8($tptr)
movdqa %xmm0,2*8($tptr)
movdqa %xmm0,4*8($tptr)
movdqa %xmm0,6*8($tptr)
.Lsqr8x_zero_start:
.Lsqr8x_zero_start: # aligned at 32
movdqa %xmm0,8*8($tptr)
movdqa %xmm0,10*8($tptr)
movdqa %xmm0,12*8($tptr)
@ -2111,47 +2117,47 @@ $code.=<<___;
jnz .Lsqrx8x_zero
mov 0*8($aptr),%rdx # a[0], modulo-scheduled
xor %r8,%r8
xor %r9,%r9
#xor %r9,%r9 # t[1], ex-$num, zero already
xor %r10,%r10
xor %r11,%r11
xor %r12,%r12
xor %r13,%r13
xor %r14,%r14
xor %r15,%r15
lea 48(%rsp),$tptr
xor $zero,$zero # cf=0, cf=0
jmp .Lsqrx8x_outer_loop
.align 32
.Lsqrx8x_outer_loop:
mulx 1*8($aptr),%rax,%rbx # a[1]*a[0]
adcx %rax,%r8 # a[1]*a[0]+=t[1]
adox %rbx,%r9
mulx 2*8($aptr),%rax,%rbx # a[2]*a[0]
adcx %rax,%r9
adox %rbx,%r10
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%rax,%rbx # ...
adcx %rax,%r10
adox %rbx,%r11
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%rax,%rbx
adcx %rax,%r11
adox %rbx,%r12
mulx 5*8($aptr),%rax,%rbx
adcx %rax,%r12
adox %rbx,%r13
mulx 6*8($aptr),%rax,%rbx
adcx %rax,%r13
adox %rbx,%r14
mulx 7*8($aptr),%rax,%r15
mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
adcx %r9,%r8 # a[1]*a[0]+=t[1]
adox %rax,%r10
mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
adcx %r10,%r9
adox %rax,%r11
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
adcx %r11,%r10
adox %rax,%r12
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
adcx %r12,%r11
adox %rax,%r13
mulx 5*8($aptr),%r12,%rax
adcx %r13,%r12
adox %rax,%r14
mulx 6*8($aptr),%r13,%rax
adcx %r14,%r13
adox %r15,%rax
mulx 7*8($aptr),%r14,%r15
mov 1*8($aptr),%rdx # a[1]
adcx %rax,%r14
adox $zero,%r15
adc 8*8($tptr),%r15
mov %r8,1*8($tptr) # t[1]
mov %r9,2*8($tptr) # t[2]
sbb $carry,$carry # mov %cf,$carry
xor $zero,$zero # cf=0, of=0
mov %r8,1*8($tptr) # t[1]
mov %r9,2*8($tptr) # t[2]
mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
@ -2193,13 +2199,12 @@ $code.=<<___;
adcx %rbx,%r11
adox %rax,%r12
adcx %r14,%r12
mov %r8,5*8($tptr) # t[5]
mov %r9,6*8($tptr) # t[6]
mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
adox $zero,%r13 # of=0
adcx $zero,%r13 # cf=0
mov %r8,5*8($tptr) # t[5]
mov %r9,6*8($tptr) # t[6]
mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
adcx %r10,%r8
adox %rax,%r9
@ -2239,9 +2244,9 @@ $code.=<<___;
adcx %r14,%r11
adox %rbx,%r12
adcx %rax,%r12
.byte 0x66,0x66
adox $zero,%r13
.byte 0x67,0x67
mulx %r8,%r8,%r14 # a[7]*a[6]
adcx %r8,%r13
adcx $zero,%r14
@ -2250,26 +2255,26 @@ $code.=<<___;
je .Lsqrx8x_outer_break
neg $carry # mov $carry,%cf
mov \$-8,%rcx
mov $zero,%r15
mov 8*8($tptr),%r8
adc 9*8($tptr),%r9 # +=t[9]
adc 10*8($tptr),%r10 # ...
adc 11*8($tptr),%r11
adcx 9*8($tptr),%r9 # +=t[9]
adcx 10*8($tptr),%r10 # ...
adcx 11*8($tptr),%r11
adc 12*8($tptr),%r12
adc 13*8($tptr),%r13
adc 14*8($tptr),%r14
adc 15*8($tptr),%r15
lea 8*8($tptr),$tptr
sbb $carry,$carry # mov %cf,$carry
lea ($aptr),$aaptr
lea 2*8*8($tptr),$tptr
sbb %rax,%rax # mov %cf,$carry
mov -64($aptr),%rdx # a[0]
lea ($aptr),$aaptr
mov $carry,16(%rsp) # offload $carry
mov %rax,16(%rsp) # offload $carry
mov $tptr,24(%rsp)
lea 8*8($tptr),$tptr
#lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
xor %eax,%eax # cf=0, of=0
mov \$-8,%rcx
jmp .Lsqrx8x_loop
.align 32
@ -2311,17 +2316,20 @@ $code.=<<___;
adox %rbx,%r15 # %rbx is 0, of=0
adcx %rbx,%r15 # cf=0
.byte 0x67
inc %rcx # of=0
jnz .Lsqrx8x_loop
lea 8*8($aaptr),$aaptr
mov \$-8,%rcx
cmp 8(%rsp),$aaptr # done?
je .Lsqrx8x_break
sub 16(%rsp),%rbx # mov 16(%rsp),%cf
.byte 0x66
mov -64($aptr),%rdx
adc 0*8($tptr),%r8
adc 1*8($tptr),%r9
adcx 0*8($tptr),%r8
adcx 1*8($tptr),%r9
adc 2*8($tptr),%r10
adc 3*8($tptr),%r11
adc 4*8($tptr),%r12
@ -2329,35 +2337,37 @@ $code.=<<___;
adc 6*8($tptr),%r14
adc 7*8($tptr),%r15
lea 8*8($tptr),$tptr
sbb %rbx,%rbx # mov %cf,%rbx
xor %eax,%eax # cf=0, of=0
mov %rbx,16(%rsp) # offload carry
mov \$-8,%rcx
.byte 0x67
sbb %rax,%rax # mov %cf,%rax
xor %ebx,%ebx # cf=0, of=0
mov %rax,16(%rsp) # offload carry
jmp .Lsqrx8x_loop
.align 32
.Lsqrx8x_break:
sub 16(%rsp),%r8 # consume last carry
mov 24(%rsp),$aaptr # initial $tptr
mov 24(%rsp),$carry # initial $tptr, borrow $carry
mov 0*8($aptr),%rdx # a[8], modulo-scheduled
xor %ebp,%ebp # xor $zero,$zero
mov %r8,0*8($tptr)
lea 8*8($aaptr),$aaptr
cmp $carry,$tptr # cf=0, of=0
je .Lsqrx8x_outer_loop
mov %r9,1*8($tptr)
mov 1*8($aaptr),%r8 # potentially forwarded store
mov 1*8($carry),%r9
mov %r10,2*8($tptr)
mov 2*8($aaptr),%r9 # ...
mov 2*8($carry),%r10
mov %r11,3*8($tptr)
mov 3*8($aaptr),%r10
mov 3*8($carry),%r11
mov %r12,4*8($tptr)
mov 4*8($aaptr),%r11
mov 4*8($carry),%r12
mov %r13,5*8($tptr)
mov 5*8($aaptr),%r12
mov 5*8($carry),%r13
mov %r14,6*8($tptr)
mov 6*8($aaptr),%r13
mov 6*8($carry),%r14
mov %r15,7*8($tptr)
mov 7*8($aaptr),%r14
mov $aaptr,$tptr
xor $zero,$zero # cf=0, cf=0
mov 7*8($carry),%r15
mov $carry,$tptr
jmp .Lsqrx8x_outer_loop
.align 32
@ -2373,13 +2383,12 @@ ___
} {
my $i="%rcx";
$code.=<<___;
mov (%rsp),$num # restore $num
lea 48(%rsp),$tptr
mov ($aptr,$i),%rdx # a[0]
mov 8($tptr),$A0[1] # t[1]
xor $A0[0],$A0[0] # t[0], of=0, cf=0
mov (%rsp),$num # restore $num
adox $A0[1],$A0[1]
mov 16($tptr),$A1[0] # t[2] # prefetch
mov 24($tptr),$A1[1] # t[3] # prefetch
@ -2440,9 +2449,9 @@ $code.=<<___;
.align 32
.Lsqrx4x_shift_n_add_break:
adcx $A1[1],%rbx
.byte 0x48,0x89,0x87,0x30,0x00,0x00,0x00 # mov %rax,48($tptr)
.byte 0x48,0x89,0x9f,0x38,0x00,0x00,0x00 # mov %rbx,56($tptr)
.byte 0x48,0x8d,0xbf,0x40,0x00,0x00,0x00 # lea 64($tptr),$tptr
mov %rax,48($tptr)
mov %rbx,56($tptr)
lea 64($tptr),$tptr # end of t[] buffer
___
}
######################################################################
@ -2456,17 +2465,16 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
$code.=<<___;
movq %xmm2,$nptr
xor %eax,%eax # initial top-most carry bit
mov 32(%rsp),%rbx # n0
mov 48(%rsp),%rdx # "%r8", 8*0($tptr)
lea ($nptr,$num),%rax # end of n[]
lea -64($nptr,$num),%rcx # end of n[]
#lea 48(%rsp,$num,2),$tptr # end of t[] buffer
mov %rax, 0(%rsp) # save end of n[]
mov %rcx, 0(%rsp) # save end of n[]
mov $tptr,8(%rsp) # save end of t[]
lea 48(%rsp),$tptr # initial t[] window
xor %rax,%rax
nop
#jmp .Lsqrx8x_reduction_loop
jmp .Lsqrx8x_reduction_loop
.align 32
.Lsqrx8x_reduction_loop:
@ -2529,29 +2537,31 @@ $code.=<<___;
adox $carry,%r15 # $carry is 0
adcx $carry,%r15 # cf=0
.byte 0x67
inc %rcx # of=0
jnz .Lsqrx8x_reduce
lea 8*8($nptr),$nptr
xor %rax,%rax
.byte 0x66,0x67
mov $carry,%rax # xor %rax,%rax
cmp 0(%rsp),$nptr # end of n[]?
jae .Lsqrx8x_no_tail
mov 48(%rsp),%rdx # pull n0*a[0]
add 8*0($tptr),%r8
adcx 8*1($tptr),%r9
adcx 8*2($tptr),%r10
adcx 8*3($tptr),%r11
adcx 8*4($tptr),%r12
adcx 8*5($tptr),%r13
adcx 8*6($tptr),%r14
adcx 8*7($tptr),%r15
lea 8*8($tptr),$tptr
sbb $carry,$carry # top carry
lea 8*8($nptr),$nptr
mov \$-8,%rcx
mov $carry,16(%rsp)
adc 8*1($tptr),%r9
adc 8*2($tptr),%r10
adc 8*3($tptr),%r11
adc 8*4($tptr),%r12
adc 8*5($tptr),%r13
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
lea 8*8($tptr),$tptr
sbb %rax,%rax # top carry
xor $carry,$carry # of=0, cf=0
mov %rax,16(%rsp)
jmp .Lsqrx8x_tail
.align 32
@ -2588,7 +2598,7 @@ $code.=<<___;
mulx 8*7($nptr),%rax,%r15
mov 48+72(%rsp,%rcx,8),%rdx # pull n0*a[i]
adcx %rax,%r14
.byte 0x66
.byte 0x67
adox $carry,%r15
mov %rbx,($tptr,%rcx,8) # save result
mov %r8,%rbx
@ -2597,35 +2607,35 @@ $code.=<<___;
inc %rcx # of=0
jnz .Lsqrx8x_tail
lea 8*8($nptr),$nptr
cmp 0(%rsp),$nptr # end of n[]?
jae .Lsqrx8x_tail_done # break out of loop
sub 16(%rsp),$carry # neg $carry
sub 16(%rsp),$carry # mov 16(%rsp),%cf
mov 48(%rsp),%rdx # pull n0*a[0]
adcx 8*0($tptr),%r8
adcx 8*1($tptr),%r9
adcx 8*2($tptr),%r10
adcx 8*3($tptr),%r11
adcx 8*4($tptr),%r12
adcx 8*5($tptr),%r13
adcx 8*6($tptr),%r14
adcx 8*7($tptr),%r15
lea 8*8($nptr),$nptr
adc 8*0($tptr),%r8
adc 8*1($tptr),%r9
adc 8*2($tptr),%r10
adc 8*3($tptr),%r11
adc 8*4($tptr),%r12
adc 8*5($tptr),%r13
adc 8*6($tptr),%r14
adc 8*7($tptr),%r15
lea 8*8($tptr),$tptr
sbb $carry,$carry
mov \$-8,%rcx
mov $carry,16(%rsp)
sbb %rax,%rax
xor $carry,$carry # of=0, cf=0
mov %rax,16(%rsp)
jmp .Lsqrx8x_tail
.align 32
.Lsqrx8x_tail_done:
add 24(%rsp),%r8 # can this overflow?
xor %rax,%rax
mov $carry,%rax # xor %rax,%rax
sub 16(%rsp),$carry # neg $carry
.Lsqrx8x_no_tail: # carry flag is 0
sub 16(%rsp),$carry # mov 16(%rsp),%cf
.Lsqrx8x_no_tail: # %cf is 0 if jumped here
adc 8*0($tptr),%r8
movq %xmm3,%rcx
adc 8*1($tptr),%r9
@ -2639,24 +2649,24 @@ $code.=<<___;
adc 8*7($tptr),%r15
adc %rax,%rax # top-most carry
cmp 8(%rsp),$carry # end of t[]?
mov 32(%rsp),%rbx # n0
mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
mov %r8,-8*8($carry) # store top 512 bits
mov %r9,-8*7($carry)
mov %r10,-8*6($carry)
mov %r11,-8*5($carry)
mov %r12,-8*4($carry)
mov %r13,-8*3($carry)
mov %r14,-8*2($carry)
mov %r15,-8*1($carry)
mov %r8,8*0($tptr) # store top 512 bits
mov %r9,8*1($tptr)
mov %r10,8*2($tptr)
mov %r11,8*3($tptr)
mov %r12,8*4($tptr)
mov %r13,8*5($tptr)
mov %r14,8*6($tptr)
mov %r15,8*7($tptr)
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
cmp 8(%rsp),$carry # end of t[]?
jb .Lsqrx8x_reduction_loop
mov %rcx,$num
neg $num # restore $num
mov %rcx,%rdx # -$num
jmp .Lsqrx8x_post
___
}
##############################################################
@ -2667,24 +2677,28 @@ my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___;
lea ($nptr,$num),$nptr # end of $nptr
lea 48(%rsp,$num),$lptr # end of lower half of t[2*num]
lea 48(%rsp,$num),$tptr
.align 32
.Lsqrx8x_post:
neg %rdx # restore $num
neg %rax # top-most carry as mask
mov 0*8($nptr),%r8
mov 1*8($nptr),%r9
lea ($nptr,%rdx),$nptr # end of $nptr
lea 48(%rsp,%rdx),$lptr # end of lower half of t[2*num]
lea 48(%rsp,%rdx),$tptr
.byte 0x67
xor %rdx,%rdx
movq %xmm1,$rptr # restore $rptr
mov 0*8($nptr,$i),%r8
mov 1*8($nptr,$i),%r9
neg %r8
jmp .Lsqrx8x_sub_entry
.align 32
.byte 0x66,0x66,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_sub:
mov 0*8($nptr,$i),%r8
mov 1*8($nptr,$i),%r9
not %r8
.Lsqrx8x_sub_entry:
.Lsqrx8x_sub_entry: # aligned at 32
mov 2*8($nptr,$i),%r10
not %r9
and %rax,%r8
@ -2709,28 +2723,27 @@ $code.=<<___;
movdqa %xmm0,2*8($lptr,$i)
and %rax,%r15
neg %rdx # mov %rdx,%cf
neg %edx # mov %edx,%cf
movdqa %xmm0,4*8($lptr,$i)
adc 0*8($tptr),%r8
mov %r8,0*8($rptr) # result
adc 1*8($tptr),%r9
movdqa %xmm0,6*8($lptr,$i)
adc 2*8($tptr),%r10
mov %r9,1*8($rptr)
adc 3*8($tptr),%r11
movdqa %xmm0,0*8($tptr) # zap upper half
adc 4*8($tptr),%r12
mov %r10,2*8($rptr)
adc 5*8($tptr),%r13
movdqa %xmm0,2*8($tptr)
adc 6*8($tptr),%r14
mov %r11,3*8($rptr)
adc 7*8($tptr),%r15
sbb %edx,%edx # mov %cf,%edx
movdqa %xmm0,4*8($tptr)
sbb %rdx,%rdx # mov %cf,%rdx
movdqa %xmm0,6*8($tptr)
lea 8*8($tptr),$tptr
mov %r8,0*8($rptr)
mov %r9,1*8($rptr)
mov %r10,2*8($rptr)
mov %r11,3*8($rptr)
mov %r12,4*8($rptr)
mov %r13,5*8($rptr)
mov %r14,6*8($rptr)