Leaked source code of windows server 2003
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

433 lines
9.5 KiB

//
// memcpy -- IA-64 (Itanium) block copy, entry point and path selection.
//
// Register roles as used by the code below:
//   r32  destination pointer (advanced as bytes are stored)
//   r33  source pointer      (advanced as bytes are loaded)
//   r34  byte count          (decremented as bytes are copied)
//   r8   copy of the incoming r32, written once and never modified
//        afterwards (r8 is the IA-64 integer return register)
//   r10  source prefetch pointer, starts at r33 + 0x80
//   r11  destination prefetch pointer, starts at r32 + 0x80
//   r2   r32 & 7, r3 = r33 & 7 (low three address bits)
//   r9   r2 | r3, zero only when both pointers are 8-byte aligned
//   r31  8 - r2, byte count consumed by AlignedMove
//   r3   later recomputed as 0x10 - (r33 & 7) for UnalignedMove
//
// Path selection, in the order the branches are tested:
//   len <= 0 (signed compare) or r32 == r33  -> return immediately
//   len < 8                                  -> ByteMoveUp
//   len < 0x40 and r9 == 0                   -> QwordMoveUpLoop
//   len >= 8   and r9 == 0                   -> QwordMoveUp
//   len >= 8   and r2 == r3                  -> AlignedMove
//   len >= 0x18                              -> UnalignedMove
//   otherwise                                -> falls through to ByteMoveUp
//
// Every path copies in ascending address order; no overlap handling is
// visible in this routine.
//
.section .text
.proc memcpy#
.global memcpy#
.align 64
.prologue
memcpy:
{ .mmi
// Prefetch pointers start 0x80 bytes ahead of the source and destination.
add r10 = 0x80, r33
add r11 = 0x80, r32
and r3 = 7, r33
} { .mmi
// p9 = (len > 0), p7 = !(len > 0); the compare is signed.
cmp.gt p9, p7 = r34, r0
mov r8 = r32
and r2 = 7, r32
;;
} { .mmi
(p9) lfetch [r10], 0x40
// p14 = (len < 0x40), p15 = (len >= 0x80).
cmp.gt p14 = 0x40, r34
cmp.le p15 = 0x80, r34
} { .mmb
or r9 = r2, r3
// When len > 0, p7 is replaced by (r32 == r33); otherwise it keeps the
// "len <= 0" value from the first compare. Either way p7 means "nothing
// to copy" and the routine returns.
(p9) cmp.eq p7 = r32, r33
(p7) br.ret.spnt b0
;;
} { .mmi
lfetch [r10], 0x40
lfetch.excl.nt1 [r11], 0x80
// p10 = (len >= 8), p11 = (len < 8).
cmp.le p10, p11 = 8, r34
} {
.mbb
// p9 = (len < 0x40) && (r9 == 0); the .unc form clears p9 when p14 is 0.
(p14) cmp.eq.unc p9 = 0, r9
(p11) br.cond.spnt ByteMoveUp // len < 8
(p9) br.cond.spnt QwordMoveUpLoop // len < 64 and both src and dst 8-byte aligned
;;
} { .mmi
// Additional prefetches are issued only when len >= 0x80 (p15).
(p15) lfetch [r10], 0x40
(p15) lfetch.excl.nt1 [r11], 0x80
sub r31 = 8, r2 // for AlignedMove
} { .mmi
// From here on len >= 8 (the len < 8 case branched away above), so p10 is
// set: p9 = (r9 == 0), p11 = (r2 == r3), p8 = (len >= 0x18).
(p10) cmp.eq.unc p9 = 0, r9
(p10) cmp.eq.unc p11 = r2, r3
cmp.le p8 = 0x18, r34
;;
} { .mmi
(p15) lfetch [r10], 0x40
(p15) lfetch.excl.nt1 [r11], 0x80
sub r3 = 0x10, r3 // for UnalignedMove
} { .bbb
(p9) br.cond.sptk QwordMoveUp // len >= 8 and src and dst are 8-byte aligned
(p11) br.cond.spnt AlignedMove // len >= 8 and src and dst have same alignment
(p8) br.cond.sptk UnalignedMove // len >= 24 (p8 comes from cmp.le 0x18, len)
;;
}
//
// ByteMoveUp: byte-at-a-time copy, up to four bytes per loop pass.
//
// Reached by branch when len < 8 at entry, when fewer than 8 bytes remain
// after AlignedMove or the qword loops, and by fall-through from the entry
// dispatch when no other path was selected (so len is not limited to 7).
// Byte 0 of each pass is loaded and stored unconditionally, so r34 must be
// at least 1 on entry.
//
// Two pointer pairs are used, each stepping by 2:
//   r33 -> r32  bytes 0 and 2 of the pass
//   r20 -> r21  bytes 1 and 3 of the pass
//
ByteMoveUp:
{ .mmi
add r20 = 1, r33
add r21 = 1, r32
// p6 = (len >= 2): byte 1 exists.
cmp.le p6 = 2, r34
;;
}
ByteMoveUpLoop:
{ .mmi
ld1 r2 = [r33], 2
(p6) ld1 r3 = [r20], 2
nop.i 0
} { .mmi
// p7 = (len >= 3), p10 = (len < 3), p8 = (len >= 4).
cmp.le p7,p10 = 3, r34
cmp.le p8 = 4, r34
nop.i 0
;;
} { .mmi
(p7) ld1 r28 = [r33], 2
(p8) ld1 r29 = [r20], 2
// p9 = (len > 4): another pass is needed. Cleared when p8 is 0.
(p8) cmp.lt.unc p9 = 4, r34
} { .mmb
st1 [r32] = r2, 2
(p6) st1 [r21] = r3, 2
// One or two bytes remained: they are stored above, so return.
(p10) br.ret.dptk b0
;;
} { .mmi
(p7) st1 [r32] = r28, 2
(p8) st1 [r21] = r29, 2
// p6 for the next pass: (len >= 6) now equals (len - 4 >= 2).
cmp.le p6 = 6, r34
} { .mbb
add r34 = -4, r34
(p9) br.cond.dpnt ByteMoveUpLoop
br.ret.dptk b0
;;
}
//
// AlignedMove: src & dest have same alignment (r32 & 7 == r33 & 7), but
// are not 8-byte aligned.
//
// Copies r31 bytes one at a time. The entry code sets r31 = 8 - (r32 & 7),
// so the byte loop ends with both pointers 8-byte aligned. Afterwards:
//   len == 0  -> return
//   len <  8  -> ByteMoveUp
//   otherwise -> falls through to QwordMoveUp
//
AlignedMove:
AlignedMoveByteLoop:
{ .mmi
ld1 r19 = [r33], 1
add r31 = -1, r31
add r34 = -1, r34
;;
} { .mmb
st1 [r32] = r19, 1
// Loop until the alignment byte count reaches zero.
cmp.ne p7 = r0, r31
(p7) br.cond.sptk AlignedMoveByteLoop
} { .mmi
// p6 = (len == 0), p8 = (len < 8), p15 = (len >= 0x80).
// p15 is recomputed here because QwordMoveUp uses it to gate prefetches.
cmp.eq.unc p6 = r0, r34
cmp.gt p8 = 8, r34
cmp.le p15 = 0x80, r34
} { .mbb
nop.m 0
(p6) br.ret.spnt b0
(p8) br.cond.sptk ByteMoveUp
;;
}
//
// QwordMoveUp: both src & dest are 8-byte aligned and len >= 8.
//
// len <  0x80 -> QwordMoveUpLoop (one 8-byte word per pass)
// len >= 0x80 -> UnrolledQwordMoveUpLoop (0x40 bytes per pass), then
//                QwordMoveUpLoop / ByteMoveUp for whatever is left
//
QwordMoveUp:
{ .mmi
(p15) lfetch [r10], 0x40
;;
(p15) lfetch [r10], 0x40
// p14 = (len < 0x80); the first target is p0, so that result is discarded.
cmp.le p0, p14 = 0x80, r34
} { .mmb
// Second pointer pair, 8 bytes ahead of r32/r33, for the unrolled loop.
add r22 = 8, r32
add r25 = 8, r33
(p14) br.cond.spnt QwordMoveUpLoop
;;
}
.align 32
//
// UnrolledQwordMoveUpLoop: each pass moves 0x40 bytes as eight ld8/st8
// pairs. Two interleaved pointer pairs are used, each stepping by 0x10:
//   r33 -> r32  offsets 0x00, 0x10, 0x20, 0x30
//   r25 -> r22  offsets 0x08, 0x18, 0x28, 0x38
// r20/r21 and r30/r31 alternate as temporaries so that the stores of one
// pair of words are issued alongside the loads of the next pair.
//
UnrolledQwordMoveUpLoop:
{ .mmi
ld8 r20 = [r25], 0x10
ld8 r30 = [r33], 0x10
add r34 = -0x40, r34
;;
} { .mmi
ld8 r21 = [r25], 0x10
ld8 r31 = [r33], 0x10
// p9 = (remaining len >= 0x40): run another unrolled pass.
cmp.le p9 = 0x40, r34
} { .mmi
st8 [r22] = r20, 0x10
st8 [r32] = r30, 0x10
// p8 = (remaining len < 8): used after the loop to select ByteMoveUp.
cmp.gt p8 = 8, r34
;;
} { .mmi
ld8 r20 = [r25], 0x10
ld8 r30 = [r33], 0x10
// p15 = (bit 6 of r10 is zero). r10 advances by 0x40 per pass, so p15
// alternates between passes; it gates the destination prefetch below,
// which steps by 0x80.
tbit.z p15 = r10, 6
} { .mmi
st8 [r22] = r21, 0x10
st8 [r32] = r31, 0x10
nop.i 0
;;
} { .mmi
ld8 r21 = [r25], 0x10
ld8 r31 = [r33], 0x10
nop.i 0
} { .mmi
st8 [r22] = r20, 0x10
st8 [r32] = r30, 0x10
nop.i 0
;;
} { .mmi
lfetch [r10], 0x40
(p15) lfetch.excl.nt1 [r11], 0x80
nop.i 0
} { .mmb
st8 [r22] = r21, 0x10
st8 [r32] = r31, 0x10
(p9) br.cond.sptk UnrolledQwordMoveUpLoop
;;
} { .mbb
// Fewer than 0x40 bytes remain: return on 0, use ByteMoveUp for 1..7,
// otherwise fall through to the one-word-per-pass loop.
cmp.eq p6 = r0, r34
(p6) br.ret.spnt b0
(p8) br.cond.spnt ByteMoveUp
;;
}
//
// QwordMoveUpLoop: copies one 8-byte word per pass from [r33] to [r32].
// Requires len >= 8 on entry (the first word is moved unconditionally).
//
QwordMoveUpLoop:
{ .mii
ld8 r19 = [r33], 8
add r34 = -8, r34
nop.i 0
;;
} { .mmi
st8 [r32] = r19, 8
// p7 = (remaining len >= 8, unsigned compare), p6 = (remaining len != 0).
cmp.leu p7 = 8, r34
cmp.ne p6 = r0, r34
} { .bbb
(p7) br.cond.sptk QwordMoveUpLoop
(p6) br.cond.spnt ByteMoveUp
br.ret.sptk b0
;;
}
//
// Copy long unaligned region
//
// UnalignedMove is entered with len >= 0x18 and (r32 & 7) != (r33 & 7).
// On entry r3 = 0x10 - (r33 & 7), computed by the entry code.
//
// Steps:
//   1. Copy r3 bytes one at a time. This leaves the source pointer 8-byte
//      aligned and r28 holding the last 8 bytes copied.
//   2. Round the destination down to an 8-byte boundary (r29) and branch
//      to the SpecialLoopN that matches the destination offset
//      N = r32 & 7. Each loop merges pairs of aligned source words with
//      shrp and stores aligned destination words.
//   3. UnalignedByteDone copies the remaining (len + N) & 7 bytes.
//
// Pipeline configuration. The trailing values (40, p55, p56, r70, r71)
// appear to be the matching settings for 40 rotating registers -- TODO
// confirm.
//   RP1/RP2  stage predicates for the shrp and the st8
//   RR1/RR2  rotating registers holding the two source words to merge
//
NUMBER_OF_ROTATING_REGISTERS = 24 //40
RP1 = p39 //p55
RP2 = p40 //p56
RR1 = r54 //r70
RR2 = r55 //r71
UnalignedMove:
{ .mmi
// New register frame: 3 inputs, NUMBER_OF_ROTATING_REGISTERS - 3 locals,
// 0 outputs, NUMBER_OF_ROTATING_REGISTERS rotating. r26 receives ar.pfs.
.regstk 3, NUMBER_OF_ROTATING_REGISTERS - 3, 0, NUMBER_OF_ROTATING_REGISTERS
alloc r26 = ar.pfs, 3, NUMBER_OF_ROTATING_REGISTERS - 3, 0, NUMBER_OF_ROTATING_REGISTERS
// NOTE(review): p13 is not written anywhere in the visible code, so these
// three prefetches are gated on whatever p13 held at entry. They are only
// hints; confirm whether another predicate (e.g. p15) was intended.
(p13) lfetch [r10], 0x40
// Predicates are saved in r18 and restored after the pipelined loop.
.save pr, r18
mov r18 = pr
;;
} { .mmi
(p13) lfetch [r10], 0x40
(p13) lfetch.excl.nt1 [r11], 0x80
// ar.lc is saved in r27 and restored in UnalignedByteDone.
.save ar.lc, r27
mov.i r27 = ar.lc
} { .mmi
// r28 accumulates the most recently copied bytes; start from zero.
mov r28 = r0
;;
}
.body
//
// Byte loop: runs r3 times (9..16, since r3 = 0x10 - (src & 7)).
//
UnalignedMoveByteLoop:
{ .mmi
ld1 r19 = [r33], 1
// p6 = (r3 != 1): at least one more pass after this one.
cmp.ne p6 = 1, r3
// Sets p16 and p17 and clears the other rotating predicates. The value is
// constant, so repeating it on every pass has no additional effect.
mov pr.rot = 3<<0x10
;;
} { .mib
add r3 = -1, r3
// Shift the new byte into the top of r28 and drop the oldest byte. The
// loop runs more than 8 times, so on exit r28 holds the last 8 bytes
// copied, newest in the most significant byte.
shrp r28 = r19, r28, 8
nop.b 0
} { .mib
st1 [r32] = r19, 1
add r34 = -1, r34
(p6) br.cond.sptk UnalignedMoveByteLoop
;;
} { .mmi
// r3 = 8-byte aligned source pointer for the pipelined loads.
mov r3 = r33
// r2 = current destination offset within its 8-byte word.
and r2 = 7, r32
// r33 is now used as a rotating data register: seed it with the last 8
// bytes copied so it acts as the source word preceding the first load.
mov r33 = r28
;;
} { .mmi
// r9 = bytes still to be written, measured from the aligned address r29.
add r9 = r34, r2
// r29 = destination rounded down to an 8-byte boundary.
sub r29 = r32, r2
cmp.eq p6 = 2, r2
;;
} { .mii
cmp.eq p9 = 4, r2
// r19 = number of whole 8-byte words to store.
shr r19 = r9, 3
cmp.eq p11 = 6, r2
;;
} { .mii
// ar.lc takes (word count - 1); r9 keeps the 0..7 leftover bytes.
add r19 = -1, r19
and r9 = 7, r9
mov.i ar.ec = NUMBER_OF_ROTATING_REGISTERS
;;
} { .mmi
lfetch [r10], 0x40
lfetch.excl.nt1 [r11], 0x40
mov.i ar.lc = r19
} { .bbb
// Select the loop by destination offset r2. r2 == 1 falls through to
// SpecialLoop1 below; r2 == 0 is not expected here because equal
// source/destination alignment was dispatched to other paths at entry.
(p6) br.cond.spnt SpecialLoop2
(p9) br.cond.spnt SpecialLoop4
(p11) br.cond.spnt SpecialLoop6
;;
} { .mii
cmp.eq p7 = 3, r2
cmp.eq p10 = 5, r2
cmp.eq p12 = 7, r2
} { .bbb
(p7) br.cond.spnt SpecialLoop3
(p10) br.cond.spnt SpecialLoop5
(p12) br.cond.spnt SpecialLoop7
;;
}
//
// SpecialLoop1 .. SpecialLoop7: software-pipelined merge loops, one per
// destination offset N = r32 & 7 as selected by the UnalignedMove code.
// They are identical except for the shrp count, which is 8 * (8 - N):
// 0x38 for N = 1 down to 0x08 for N = 7.
//
// Registers on entry, as set up by UnalignedMove:
//   r3     8-byte aligned source pointer
//   r29    destination rounded down to an 8-byte boundary
//   r2     N, the destination offset
//   r33    last 8 bytes already copied (r32/r33 are rotating data
//          registers here, no longer the routine's pointers)
//   r18    saved predicates
//   ar.lc / ar.ec   loop and epilog counts for br.ctop
//
// Per pass, each stage gated by its rotating predicate:
//   (p16) load the next aligned source word into r32
//   (RP1) r28 = the word pair RR1:RR2 shifted right by the count
//   (RP2) store r28 to [r29] and advance r29 by 8
// The st8 and the shrp sit in the same instruction group, so the st8
// writes the r28 value produced on the previous pass.
//
// After the loop r3 is moved back by N so that it addresses the source
// byte corresponding to r29, the predicates are restored from r18, and
// control goes to UnalignedByteDone. SpecialLoop7 reaches it by
// fall-through; the others branch.
//
.align 32
// Destination offset 1: shift count 0x38.
SpecialLoop1:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x38
} { .mib
br.ctop.sptk.many SpecialLoop1
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone
;;
}
.align 32
// Destination offset 2: shift count 0x30.
SpecialLoop2:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x30
} { .mib
br.ctop.sptk.many SpecialLoop2
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone
;;
}
.align 32
// Destination offset 3: shift count 0x28.
SpecialLoop3:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x28
} { .mib
br.ctop.sptk.many SpecialLoop3
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone
;;
}
.align 32
// Destination offset 4: shift count 0x20.
SpecialLoop4:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x20
} { .mib
br.ctop.sptk.many SpecialLoop4
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone
;;
}
.align 32
// Destination offset 5: shift count 0x18.
SpecialLoop5:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x18
} { .mib
br.ctop.sptk.many SpecialLoop5
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone
;;
}
.align 32
// Destination offset 6: shift count 0x10.
SpecialLoop6:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x10
} { .mib
br.ctop.sptk.many SpecialLoop6
;;
} { .mib
sub r3 = r3, r2
mov pr = r18
br UnalignedByteDone
;;
}
.align 32
// Destination offset 7: shift count 0x8.
SpecialLoop7:
{ .mmi
(p16) ld8 r32 = [r3], 8
(RP2) st8 [r29] = r28, 8
(RP1) shrp r28 = RR1, RR2, 0x8
} { .mib
br.ctop.sptk.many SpecialLoop7
;;
} { .mii
// No branch here: execution falls through into UnalignedByteDone.
sub r3 = r3, r2
mov pr = r18
nop.i 0
;;
}
//
// UnalignedByteDone: tail of the unaligned path.
//
// On entry:
//   r9   number of bytes still to copy (0..7, masked with 7 by the caller)
//   r3   source address of the first remaining byte
//   r29  destination address of the first remaining byte
//   r27  ar.lc value saved at the start of UnalignedMove
// Restores ar.lc, returns if nothing is left, otherwise copies r9 bytes
// one at a time and returns.
//
UnalignedByteDone:
{ .mib
// p6 = (r9 == 0): no tail bytes.
cmp.eq p6 = r0, r9
mov.i ar.lc = r27
(p6) br.ret.spnt b0
;;
}
UnAlignedByteDoneLoop:
{ .mii
ld1 r19 = [r3], 1
add r9 = -1, r9
;;
// p7 = (r9 != 0) after the decrement: more bytes to copy.
cmp.ne p7 = r0, r9
} { .mbb
st1 [r29] = r19, 1
(p7) br.cond.sptk UnAlignedByteDoneLoop
br.ret.spnt b0
;;
}
.endp memcpy#