Leaked source code of Windows Server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

567 lines
13 KiB

//
// _memcpy_ita -- entry point and strategy dispatch.
//
// Register use visible in this routine:
//   r32 = destination address (stores go through it; also copied to r8)
//   r33 = source address (loads go through it)
//   r34 = number of bytes still to copy
//   r18 = copy of pr taken on entry, r27 = copy of ar.lc taken on entry
//   r10 = prefetch cursor, starting at the source rounded down to 32 bytes
// r8 is presumably the return-value register of the calling convention --
// confirm against the IA-64 software conventions.
//
// This section returns immediately for the trivial cases, issues a few
// source prefetches, and then picks a copy strategy from the byte count
// and the low three bits of both addresses.
//
.section .text
.file "memcpy.s"
.proc _memcpy_ita#
.global _memcpy_ita#
.align 32
.prologue
_memcpy_ita:
{ .mii
// r10 = r33 & -32; p7 = (r34 == 0), p9 = (r34 != 0).
and r10 = -32, r33
.save pr, r18
mov r18 = pr
cmp.eq p7, p9 = r0, r34
} { .mmi
// p8 = (r34 > 24), unsigned compare.
cmp.ltu p8 = 24, r34
;;
(p9) lfetch [r10], 32 //0
.save ar.lc, r27
mov.i r27 = ar.lc
} { .mib
// p7 also becomes true when destination == source. In either case
// (nothing to copy, or copying a buffer onto itself) return with r8 = r32.
mov r8 = r32
cmp.eq.or p7 = r32, r33
(p7) br.ret.spnt b0
;;
} { .mii
// r2 = destination & 7, r3 = source & 7 (offsets within an 8-byte word).
(p8) lfetch [r10], 32 //32 //(p8)
and r2 = 7, r32
and r3 = 7, r33
;;
} { .mii
// r9 == 0 only when both addresses are 8-byte aligned.
// p14 = (r34 < 64), p13 = (r34 >= 64).
(p8) lfetch [r10], 32 //64 //(p8)
or r9 = r2, r3
cmp.gtu p14, p13 = 64, r34
;;
} { .mii
// p12 = (r34 > 127); p10 = (r34 >= 8), p11 = (r34 < 8).
(p13) lfetch [r10], 32 //96 //(p13)
cmp.ltu p12 = 127, r34
cmp.leu p10, p11 = 8, r34
} { .mbb
// p9 = (r34 < 64) and both addresses aligned. The first branch handles
// r34 < 8; the second sends short aligned copies to the plain qword loop.
(p14) cmp.eq.unc p9 = 0, r9 // This bundle is just a shortcut
(p11) br.cond.spnt ByteMoveUpLoop_ita
(p9) br.cond.spnt QwordMoveUpLoop_ita
;;
} { .mii
// From here on r34 >= 8. p9 = both addresses 8-byte aligned;
// p11 = both addresses have the same offset within an 8-byte word.
(p12) lfetch [r10], 32 //128
(p10) cmp.eq.unc p9 = 0, r9
(p10) cmp.eq.unc p11 = r2, r3
} { .bbb
// Branch priority: already aligned, then same misalignment, then
// differing misalignment with more than 24 bytes. If none is taken
// (differing misalignment, 8 <= r34 <= 24) execution falls through
// into ByteMoveUpLoop_ita.
(p9) br.cond.sptk QwordMoveUp_ita
(p11) br.cond.spnt AlignedMove_ita
(p8) br.cond.sptk UnalignedMove_ita
;;
}
//
// ByteMoveUpLoop_ita -- copy the remaining r34 bytes from [r33] to [r32],
// up to four bytes per pass, then return.
//
// The first byte of each pass is loaded and stored unconditionally, so
// this code expects r34 >= 1 on entry. Bytes 2..4 of a pass are guarded:
//   p6 = (r34 >= 2), p7 = (r34 >= 3), p8 = (r34 >= 4)
// Two address pairs are interleaved: r33/r20 on the source side and
// r32/r21 on the destination side.
//
ByteMoveUpLoop_ita:
{ .mii
// r20 = address of the third source byte of this pass.
add r20 = 2, r33
cmp.leu p6 = 2, r34
cmp.leu p7 = 3, r34
} { .mmi
// Byte 0 -> r19; r33 then points at byte 1.
ld1 r19 = [r33], 1
;;
// Byte 1 -> r2; r33 skips to byte 3. r21 = destination of byte 1.
(p6) ld1 r2 = [r33], 2
add r21 = 1, r32
} { .mii
// Byte 2 -> r3 (through r20).
(p7) ld1 r3 = [r20]
cmp.leu p8 = 4, r34
;;
// Account for a full pass. r34 may wrap here when fewer than four bytes
// were left; it is only examined again under p8.
add r34 = -4, r34
} { .mmi
// Byte 3 -> r22; r33 ends the pass four bytes further on.
(p8) ld1 r22 = [r33], 1
;;
st1 [r32] = r19, 2
// p9 = a full four bytes were copied and more remain. The .unc form
// clears p9 when p8 is false.
(p8) cmp.ltu.unc p9 = r0, r34
} { .mmi
(p6) st1 [r21] = r2, 2
;;
(p7) st1 [r32] = r3, 2
nop.i 0
} { .mbb
(p8) st1 [r21] = r22
(p9) br.cond.dpnt ByteMoveUpLoop_ita
br.ret.dptk b0
;;
}
//
// src & dest have same alignment, 0 != (align mod 8)
//
// AlignedMove_ita copies the leading 8 - r2 bytes one at a time (r2 is
// the shared offset within an 8-byte word, computed at entry), which
// brings both r32 and r33 to an 8-byte boundary. It then returns, hands
// a short tail to ByteMoveUpLoop_ita, or falls through into
// QwordMoveUp_ita.
//
AlignedMove_ita:
{ .mmi
// r11 = second prefetch cursor, 64 bytes beyond r10.
// r31 = number of leading bytes to copy (1..7).
add r11 = 64, r10
(p12) lfetch [r10], 32 //160
sub r31 = 8, r2
;;
} { .mmi
// r34 = bytes that will remain once the leading bytes are copied.
(p12) lfetch [r10], 64 //192
(p12) lfetch [r11], 96 //224
sub r34 = r34, r31
;;
}
AlignedMoveByteLoop_ita:
{ .mii
// One byte per pass; r31 counts down to zero.
// p14 = (r34 < 64) is refreshed for the reduced count; QwordMoveUp_ita
// tests it.
ld1 r19 = [r33], 1
add r31 = -1, r31
cmp.gtu p14 = 64, r34
;;
} { .mib
st1 [r32] = r19, 1
cmp.ne p7 = r0, r31
(p7) br.cond.sptk AlignedMoveByteLoop_ita
;;
} { .mii
// p6 = nothing left, p8 = fewer than 8 bytes left.
(p12) lfetch [r10], 32 //256
cmp.eq.unc p6 = r0, r34
cmp.gtu p8 = 8, r34
} { .mbb
(p12) lfetch [r11], 128 //320
(p6) br.ret.spnt b0
(p8) br.cond.sptk ByteMoveUpLoop_ita
;;
}
//
// both src & dest are now 8-byte aligned
//
// QwordMoveUp_ita sets up the second address pair and the prefetch
// cursors used by UnrolledQwordMoveUpLoop_ita, then selects a loop:
//   r16 = r33 + 8, r17 = r32 + 8   (odd qwords of each 16-byte pair)
//   r10 = r33 + 128, r11 = r33 + 288   (prefetch cursors)
// p14 = (r34 < 64) was set earlier by the entry code or AlignedMove_ita
// and routes short copies to QwordMoveUpLoop_ita. Otherwise execution
// falls through into UnrolledQwordMoveUpLoop_ita.
//
QwordMoveUp_ita:
#if defined (USE_HIGH_FP_REGISTERS)
{ .mii
add r16 = 8, r33
add r10 = 128, r33
add r11 = 288, r33
} { .mmi
// r19 = threshold (1536 bytes) for the FP-register copy path.
mov r19 = 1536
;;
add r17 = 8, r32
// p6 = bit 3 of the source address is set, i.e. the source is 8-byte
// but not 16-byte aligned. LargeAlignedUp_ita consumes this.
tbit.nz p6 = r33, 3
} { .mbb
// p9 = (r34 >= 1536).
cmp.leu p9 = r19, r34
(p9) br.cond.spnt LargeAlignedUp_ita
(p14) br.cond.spnt QwordMoveUpLoop_ita
;;
}
#else
{ .mii
add r16 = 8, r33
add r10 = 128, r33
add r11 = 288, r33
} { .mfb
add r17 = 8, r32
nop.f 0
(p14) br.cond.spnt QwordMoveUpLoop_ita
;;
}
#endif
//
// UnrolledQwordMoveUpLoop_ita -- copy 64 bytes per pass between 8-byte
// aligned addresses.
//
// Each pass performs eight ld8 followed by eight st8. r33/r16 read the
// even/odd qwords and r32/r17 write them; every access post-increments
// its pointer by 16. Each pass moves 64 bytes unconditionally, so the
// loop expects r34 >= 64 on entry. r34 is reduced by 64 at the top of
// the pass and the predicates below describe what remains after it:
//   p9 = (r34 >= 128)  issue two prefetches through r10 and r11
//   p7 = (r34 >= 64)   run another pass
//   p8 = (r34 < 8)     tail is bytes only
//
UnrolledQwordMoveUpLoop_ita:
{ .mmi
ld8 r19 = [r33], 16
ld8 r20 = [r16], 16
add r34 = -64, r34
;;
} { .mmi
ld8 r21 = [r33], 16
ld8 r22 = [r16], 16
cmp.leu p9 = 128, r34
;;
} { .mmi
ld8 r30 = [r33], 16
ld8 r29 = [r16], 16
cmp.gtu p8 = 8, r34
;;
} { .mmi
ld8 r25 = [r33], 16
ld8 r26 = [r16], 16
cmp.leu p7 = 64, r34
;;
} { .mmi
(p9) lfetch [r10], 64
(p9) lfetch [r11], 64
nop.i 0
;;
} { .mmi
// Stores mirror the loads, in the same order.
st8 [r32] = r19, 16
st8 [r17] = r20, 16
nop.i 0
;;
} { .mmi
st8 [r32] = r21, 16
st8 [r17] = r22, 16
nop.i 0
;;
} { .mmi
st8 [r32] = r30, 16
st8 [r17] = r29, 16
nop.i 0
;;
} { .mmb
st8 [r32] = r25, 16
st8 [r17] = r26, 16
(p7) br.cond.dptk UnrolledQwordMoveUpLoop_ita
;;
} { .mbb
// Fewer than 64 bytes remain: return if none, hand fewer than 8 to the
// byte loop, otherwise fall through into QwordMoveUpLoop_ita.
cmp.eq p6 = r0, r34
(p6) br.ret.spnt b0
(p8) br.cond.spnt ByteMoveUpLoop_ita
;;
}
//
// QwordMoveUpLoop_ita -- copy 8 bytes per pass from [r33] to [r32].
// Each pass moves one qword unconditionally, so the loop expects
// r34 >= 8 on entry. After each pass:
//   p7 = (r34 >= 8)  another qword follows
//   p6 = (r34 != 0)  a tail of 1..7 bytes goes to ByteMoveUpLoop_ita
// With neither set the copy is complete and the routine returns.
//
QwordMoveUpLoop_ita:
{ .mii
ld8 r19 = [r33], 8
add r34 = -8, r34
nop.i 0
;;
} { .mmi
st8 [r32] = r19, 8
cmp.leu p7 = 8, r34
cmp.ne p6 = r0, r34
} { .bbb
(p7) br.cond.sptk QwordMoveUpLoop_ita
(p6) br.cond.spnt ByteMoveUpLoop_ita
br.ret.sptk b0
;;
}
#if defined (USE_HIGH_FP_REGISTERS)
.align 32
//
// Copy large aligned region -- we can use FP registers for that
// NOTE: still use unrolled loop for *very* large blocks,
// as there are good chances that data is not in cache.
//
// Reached from QwordMoveUp_ita when r34 >= 1536. Copies of more than
// 48*1024 bytes are sent back to UnrolledQwordMoveUpLoop_ita. Otherwise
// the bulk is moved 32 bytes per pass by a software-pipelined loop
// (br.ctop, rotating predicates and rotating FP registers), followed by
// a qword tail loop and a byte tail loop.
//
LargeAlignedUp_ita:
{ .mmi
// r20 = upper size limit for this path; r31 = count & 7 (final byte
// tail); ar.ec = 23 epilogue stages for the pipelined loop.
mov r20 = 48*1024
and r31 = 7, r34
mov.i ar.ec = 23
;;
} { .mbb
// p8 = (r34 > 48K). brp is a branch-prediction hint for the loop below.
cmp.ltu p8 = r20, r34
(p8) br.cond.spnt UnrolledQwordMoveUpLoop_ita
brp.sptk.imp Move32UpLoop_ita, Move32UpLoopE_ita
;;
} { .mii
// p6 was set in QwordMoveUp_ita when bit 3 of the source address is set.
// In that case one qword is copied first -- presumably so that the ldfp8
// pair loads below start on a 16-byte boundary (confirm against the
// ldfp8 alignment rules).
// pr.rot = 1<<16 leaves only p16 set among the rotating predicates.
(p6) ld8 r9 = [r33], 8
mov pr.rot = 1<<16
(p6) add r34 = -8, r34
;;
} { .mii
// r30 = number of 32-byte chunks; r10 / r11 = source cursors at +0, +16.
(p6) st8 [r32] = r9, 8
shr.u r30 = r34, 5
mov r10 = r33
} { .mmi
add r11 = 16, r33
;;
// r20 / r21 = destination cursors at +0 and +8.
mov r20 = r32
add r30 = -1, r30
} { .mii
// r9 = bytes left over after the 32-byte chunks (0..31).
and r9 = 31, r34
add r21 = 8, r32
;;
// ar.lc = chunk count - 1, the loop count consumed by br.ctop.
mov.i ar.lc = r30
}
//
// Pipelined 32-byte loop. The stage under p16 loads two 16-byte pairs;
// the stage under p38 stores them. Each store operand (f54, f77, f100,
// f123) is numbered 22 above the matching load target (f32, f55, f78,
// f101), the same distance as between p16 and p38.
//
Move32UpLoop_ita:
{ .mmi
(p16) ldfp8 f32, f55 = [r10]
(p16) ldfp8 f78, f101 = [r11]
(p16) add r10 = 32, r10
;;
} { .mmi
// Bytes 0..7 through r20, bytes 8..15 through r21.
(p38) stf8 [r20] = f54, 16
(p38) stf8 [r21] = f77, 16
(p16) add r11 = 32, r11
;;
} { .mmb
Move32UpLoopE_ita:
// Bytes 16..23 through r20, bytes 24..31 through r21.
(p38) stf8 [r20] = f100, 16
(p38) stf8 [r21] = f123, 16
br.ctop.sptk.many Move32UpLoop_ita
;;
} { .mii
// Loop finished: restore the predicates saved at entry.
nop.m 0
mov pr = r18
nop.i 0
;;
} { .mii
// p6 = no leftover bytes; p8 = fewer than 8 left (signed compare on a
// value in 0..31); ar.lc is restored from the entry copy.
cmp.eq p6 = r0, r9
mov.i ar.lc = r27
cmp.gt p8 = 8, r9
} { .mbb
// p9 = the count was a multiple of 8, so no byte tail will follow.
cmp.eq p9 = r0, r31
(p6) br.ret.spnt b0
(p8) br.cond.spnt LargeByteDoneUpLoop_ita
;;
}
//
// Qword tail: 8 bytes per pass through r10 / r20 while r9 >= 8.
//
LargeMoveUpLoop_ita:
{ .mii
ld8 r19 = [r10], 8
add r9 = -8, r9
;;
cmp.le p7 = 8, r9
} { .mbb
st8 [r20] = r19, 8
(p7) br.cond.sptk LargeMoveUpLoop_ita
(p9) br.ret.spnt b0
;;
}
//
// Byte tail: one byte per pass until r9 reaches zero. Entered only with
// r9 != 0 (either by the p8 branch above after the p6 return was not
// taken, or by fall-through when p9 is false).
//
LargeByteDoneUpLoop_ita:
{ .mii
ld1 r19 = [r10], 1
add r9 = -1, r9
;;
cmp.ne p7 = r0, r9
} { .mbb
st1 [r20] = r19, 1
(p7) br.cond.sptk LargeByteDoneUpLoop_ita
br.ret.spnt b0
;;
}
#endif
//
// Copy long unaligned region
//
// Reached from the entry dispatch when r34 > 24 and the two addresses
// have different offsets within an 8-byte word. Plan:
//   1. Copy 16 - (source & 7) bytes one at a time. This brings the source
//      to an 8-byte boundary and gathers the last eight bytes copied
//      in r10.
//   2. Set up a software-pipelined loop that loads aligned source qwords,
//      merges neighbouring qwords with shrp, and stores aligned
//      destination qwords. One loop variant exists per destination
//      offset (SpecialLoop1_ita .. SpecialLoop7_ita).
//
.align 32
UnalignedMove_ita:
{ .mii
// New frame: 3 inputs, 29 locals, 0 outputs, 32 rotating registers
// (r32..r63). The previous ar.pfs value is written to r26.
// ar.ec = 32 epilogue stages. r3 (source & 7 on entry) becomes the
// number of bytes for the byte loop, 9..16.
.regstk 3, 29, 0, 32
alloc r26 = ar.pfs, 3, 29, 0, 32
mov.i ar.ec = 32
sub r3 = 16, r3
;;
}
.body
UnalignedMoveByteLoop_ita:
{ .mmi
// p6 = more bytes follow this one. pr.rot = 3<<16 sets p16 and p17 and
// clears the other rotating predicates; repeating it on every pass does
// not change the result.
ld1 r19 = [r33], 1
cmp.ne p6 = 1, r3
mov pr.rot = 3<<16
;;
} { .mib
// Shift the new byte into the top of r10, dropping the oldest byte from
// the bottom. The loop runs at least nine times, so whatever r10 held
// before is fully shifted out.
add r3 = -1, r3
shrp r10 = r19, r10, 8
nop.b 0
} { .mib
st1 [r32] = r19, 1
add r34 = -1, r34
(p6) br.cond.sptk UnalignedMoveByteLoop_ita
;;
} { .mmi
// r3 takes over as source cursor because r33 is inside the rotating
// region. r2 = current destination offset within its 8-byte word.
// r33 is seeded with the eight bytes gathered in r10; p17, set above,
// marks that seed as already loaded for the pipelined loops.
mov r3 = r33
and r2 = 7, r32
mov r33 = r10
;;
} { .mmi
// r11 = destination rounded down to 8 bytes; r9 = bytes to produce
// counted from that boundary.
add r9 = r34, r2
sub r11 = r32, r2
cmp.eq p6 = 2, r2
;;
} { .mii
// r19 = number of whole destination qwords.
cmp.eq p9 = 4, r2
shr r19 = r9, 3
cmp.eq p11 = 6, r2
;;
} { .mii
// ar.lc = qword count - 1; r9 = byte tail (0..7) handled after the loop.
add r19 = -1, r19
and r9 = 7, r9
;;
mov.i ar.lc = r19
} { .bbb
// Dispatch on the destination offset r2. Offsets 2..7 branch; anything
// else falls through into SpecialLoop1_ita. r2 should never be 0 here:
// the entry dispatch sends equal-offset copies elsewhere, and the byte
// loop advanced the destination by 16 - (source & 7).
(p6) br.cond.spnt SpecialLoop2_ita
(p9) br.cond.spnt SpecialLoop4_ita
(p11) br.cond.spnt SpecialLoop6_ita
;;
} { .mii
cmp.eq p7 = 3, r2
cmp.eq p10 = 5, r2
cmp.eq p12 = 7, r2
} { .bbb
(p7) br.cond.spnt SpecialLoop3_ita
(p10) br.cond.spnt SpecialLoop5_ita
(p12) br.cond.spnt SpecialLoop7_ita
;;
}
//
// SpecialLoop1_ita .. SpecialLoop7_ita -- one pipelined merge loop per
// destination offset N (the value of r2, 1..7). All seven share this
// shape and differ only in the shrp shift count, which is 64 - 8*N:
//
//   (p16) ld8  r32 = [r3], 8         load the next aligned source qword
//   (p47) shrp r10 = r62, r63, K     merge two neighbouring source qwords
//   (p48) st8  [r11] = r10, 8        store one aligned destination qword
//
// r62 / r63 are the rotating-register names that a value loaded into r32
// carries 30 and 31 rotations later. The st8 appears before the shrp
// with no stop in between, so it writes the r10 value produced by the
// shrp of the previous pass. br.ctop drives the loop from ar.lc and
// ar.ec. brp is a branch-prediction hint.
//
// After the loop each variant moves r3 back by r2, restores pr from r18
// and continues at UnalignedByteDone_ita (SpecialLoop7_ita falls through).
//
.align 32
SpecialLoop1_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop1_ita, SpecialLoop1E_ita
} { .mib
SpecialLoop1E_ita:
// Destination offset 1: shift by 56 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 56
br.ctop.sptk.many SpecialLoop1_ita
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone_ita
;;
}
.align 32
SpecialLoop2_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop2_ita, SpecialLoop2E_ita
} { .mib
SpecialLoop2E_ita:
// Destination offset 2: shift by 48 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 48
br.ctop.sptk.many SpecialLoop2_ita
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone_ita
;;
}
.align 32
SpecialLoop3_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop3_ita, SpecialLoop3E_ita
} { .mib
SpecialLoop3E_ita:
// Destination offset 3: shift by 40 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 40
br.ctop.sptk.many SpecialLoop3_ita
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone_ita
;;
}
.align 32
SpecialLoop4_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop4_ita, SpecialLoop4E_ita
} { .mib
SpecialLoop4E_ita:
// Destination offset 4: shift by 32 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 32
br.ctop.sptk.many SpecialLoop4_ita
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone_ita
;;
}
.align 32
SpecialLoop5_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop5_ita, SpecialLoop5E_ita
} { .mib
SpecialLoop5E_ita:
// Destination offset 5: shift by 24 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 24
br.ctop.sptk.many SpecialLoop5_ita
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone_ita
;;
}
.align 32
SpecialLoop6_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop6_ita, SpecialLoop6E_ita
} { .mib
SpecialLoop6E_ita:
// Destination offset 6: shift by 16 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 16
br.ctop.sptk.many SpecialLoop6_ita
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone_ita
;;
}
.align 32
SpecialLoop7_ita:
{ .mfb
(p16) ld8 r32 = [r3], 8
nop.f 0
brp.sptk.imp SpecialLoop7_ita, SpecialLoop7E_ita
} { .mib
SpecialLoop7E_ita:
// Destination offset 7: shift by 8 bits.
(p48) st8 [r11] = r10, 8
(p47) shrp r10 = r62, r63, 8
br.ctop.sptk.many SpecialLoop7_ita
;;
} { .mii
// No branch here: the last variant falls through into
// UnalignedByteDone_ita.
sub r3 = r3, r2
mov pr = r18
nop.i 0
;;
}
//
// UnalignedByteDone_ita -- common exit of the SpecialLoopN_ita variants.
// Restores ar.lc from r27 (the copy taken at entry), returns if the byte
// tail r9 is zero, and otherwise copies r9 bytes one at a time from [r3]
// to [r11] before returning.
//
UnalignedByteDone_ita:
{ .mib
cmp.eq p6 = r0, r9
mov.i ar.lc = r27
(p6) br.ret.spnt b0
;;
}
UnAlignedByteDoneLoop_ita:
{ .mii
// One byte per pass; p7 = more bytes remain after this one.
ld1 r19 = [r3], 1
add r9 = -1, r9
;;
cmp.ne p7 = r0, r9
} { .mbb
st1 [r11] = r19, 1
(p7) br.cond.sptk UnAlignedByteDoneLoop_ita
br.ret.spnt b0
;;
}
.endp _memcpy_ita#