You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
433 lines
9.5 KiB
433 lines
9.5 KiB
.section .text
|
|
.proc memcpy#
|
|
.global memcpy#
|
|
.align 64
|
|
|
|
.prologue
|
|
|
|
memcpy:
|
|
|
|
{ .mmi
|
|
add r10 = 0x80, r33
|
|
add r11 = 0x80, r32
|
|
and r3 = 7, r33
|
|
} { .mmi
|
|
cmp.gt p9, p7 = r34, r0
|
|
mov r8 = r32
|
|
and r2 = 7, r32
|
|
;;
|
|
} { .mmi
|
|
(p9) lfetch [r10], 0x40
|
|
cmp.gt p14 = 0x40, r34
|
|
cmp.le p15 = 0x80, r34
|
|
} { .mmb
|
|
or r9 = r2, r3
|
|
(p9) cmp.eq p7 = r32, r33
|
|
(p7) br.ret.spnt b0
|
|
;;
|
|
} { .mmi
|
|
lfetch [r10], 0x40
|
|
lfetch.excl.nt1 [r11], 0x80
|
|
cmp.le p10, p11 = 8, r34
|
|
} {
|
|
.mbb
|
|
(p14) cmp.eq.unc p9 = 0, r9
|
|
(p11) br.cond.spnt ByteMoveUp // len < 8
|
|
(p9) br.cond.spnt QwordMoveUpLoop // len < 64 and both src and dst 8-byte aligned
|
|
;;
|
|
} { .mmi
|
|
(p15) lfetch [r10], 0x40
|
|
(p15) lfetch.excl.nt1 [r11], 0x80
|
|
sub r31 = 8, r2 // for AlignedMove
|
|
} { .mmi
|
|
(p10) cmp.eq.unc p9 = 0, r9
|
|
(p10) cmp.eq.unc p11 = r2, r3
|
|
cmp.le p8 = 0x18, r34
|
|
;;
|
|
} { .mmi
|
|
(p15) lfetch [r10], 0x40
|
|
(p15) lfetch.excl.nt1 [r11], 0x80
|
|
sub r3 = 0x10, r3 // for UnalignedMove
|
|
} { .bbb
|
|
(p9) br.cond.sptk QwordMoveUp // len >= 8 and src and dst are 8-byte aligned
|
|
(p11) br.cond.spnt AlignedMove // len >= 8 and src and dst have same alignment
|
|
(p8) br.cond.sptk UnalignedMove // len > 24
|
|
;;
|
|
}
|
|
|
|
// len <=7
|
|
ByteMoveUp:
|
|
{ .mmi
|
|
add r20 = 1, r33
|
|
add r21 = 1, r32
|
|
cmp.le p6 = 2, r34
|
|
;;
|
|
}
|
|
ByteMoveUpLoop:
|
|
{ .mmi
|
|
ld1 r2 = [r33], 2
|
|
(p6) ld1 r3 = [r20], 2
|
|
nop.i 0
|
|
} { .mmi
|
|
cmp.le p7,p10 = 3, r34
|
|
cmp.le p8 = 4, r34
|
|
nop.i 0
|
|
;;
|
|
} { .mmi
|
|
(p7) ld1 r28 = [r33], 2
|
|
(p8) ld1 r29 = [r20], 2
|
|
(p8) cmp.lt.unc p9 = 4, r34
|
|
} { .mmb
|
|
st1 [r32] = r2, 2
|
|
(p6) st1 [r21] = r3, 2
|
|
(p10) br.ret.dptk b0
|
|
;;
|
|
} { .mmi
|
|
(p7) st1 [r32] = r28, 2
|
|
(p8) st1 [r21] = r29, 2
|
|
cmp.le p6 = 6, r34
|
|
} { .mbb
|
|
add r34 = -4, r34
|
|
(p9) br.cond.dpnt ByteMoveUpLoop
|
|
br.ret.dptk b0
|
|
;;
|
|
}
|
|
|
|
//
|
|
// src & dest have same alignment
|
|
//
|
|
|
|
AlignedMove:
|
|
|
|
AlignedMoveByteLoop:
|
|
{ .mmi
|
|
ld1 r19 = [r33], 1
|
|
add r31 = -1, r31
|
|
add r34 = -1, r34
|
|
;;
|
|
|
|
} { .mmb
|
|
st1 [r32] = r19, 1
|
|
cmp.ne p7 = r0, r31
|
|
(p7) br.cond.sptk AlignedMoveByteLoop
|
|
} { .mmi
|
|
cmp.eq.unc p6 = r0, r34
|
|
cmp.gt p8 = 8, r34
|
|
cmp.le p15 = 0x80, r34
|
|
} { .mbb
|
|
nop.m 0
|
|
(p6) br.ret.spnt b0
|
|
(p8) br.cond.sptk ByteMoveUp
|
|
;;
|
|
}
|
|
|
|
// both src & dest are 8-byte aligned
|
|
|
|
QwordMoveUp:
|
|
|
|
{ .mmi
|
|
(p15) lfetch [r10], 0x40
|
|
;;
|
|
(p15) lfetch [r10], 0x40
|
|
cmp.le p0, p14 = 0x80, r34
|
|
|
|
} { .mmb
|
|
add r22 = 8, r32
|
|
add r25 = 8, r33
|
|
(p14) br.cond.spnt QwordMoveUpLoop
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
UnrolledQwordMoveUpLoop:
|
|
{ .mmi
|
|
ld8 r20 = [r25], 0x10
|
|
ld8 r30 = [r33], 0x10
|
|
add r34 = -0x40, r34
|
|
;;
|
|
} { .mmi
|
|
ld8 r21 = [r25], 0x10
|
|
ld8 r31 = [r33], 0x10
|
|
cmp.le p9 = 0x40, r34
|
|
} { .mmi
|
|
st8 [r22] = r20, 0x10
|
|
st8 [r32] = r30, 0x10
|
|
cmp.gt p8 = 8, r34
|
|
;;
|
|
} { .mmi
|
|
ld8 r20 = [r25], 0x10
|
|
ld8 r30 = [r33], 0x10
|
|
tbit.z p15 = r10, 6
|
|
|
|
} { .mmi
|
|
st8 [r22] = r21, 0x10
|
|
st8 [r32] = r31, 0x10
|
|
nop.i 0
|
|
;;
|
|
} { .mmi
|
|
ld8 r21 = [r25], 0x10
|
|
ld8 r31 = [r33], 0x10
|
|
nop.i 0
|
|
|
|
} { .mmi
|
|
st8 [r22] = r20, 0x10
|
|
st8 [r32] = r30, 0x10
|
|
nop.i 0
|
|
;;
|
|
} { .mmi
|
|
lfetch [r10], 0x40
|
|
(p15) lfetch.excl.nt1 [r11], 0x80
|
|
nop.i 0
|
|
} { .mmb
|
|
st8 [r22] = r21, 0x10
|
|
st8 [r32] = r31, 0x10
|
|
(p9) br.cond.sptk UnrolledQwordMoveUpLoop
|
|
;;
|
|
} { .mbb
|
|
cmp.eq p6 = r0, r34
|
|
(p6) br.ret.spnt b0
|
|
(p8) br.cond.spnt ByteMoveUp
|
|
;;
|
|
}
|
|
|
|
QwordMoveUpLoop:
|
|
{ .mii
|
|
ld8 r19 = [r33], 8
|
|
add r34 = -8, r34
|
|
nop.i 0
|
|
;;
|
|
} { .mmi
|
|
st8 [r32] = r19, 8
|
|
cmp.leu p7 = 8, r34
|
|
cmp.ne p6 = r0, r34
|
|
} { .bbb
|
|
(p7) br.cond.sptk QwordMoveUpLoop
|
|
(p6) br.cond.spnt ByteMoveUp
|
|
br.ret.sptk b0
|
|
;;
|
|
}
|
|
|
|
//
|
|
// Copy long unaligned region
|
|
//
|
|
NUMBER_OF_ROTATING_REGISTERS = 24 //40
|
|
RP1 = p39 //p55
|
|
RP2 = p40 //p56
|
|
RR1 = r54 //r70
|
|
RR2 = r55 //r71
|
|
UnalignedMove:
|
|
{ .mmi
|
|
.regstk 3, NUMBER_OF_ROTATING_REGISTERS - 3, 0, NUMBER_OF_ROTATING_REGISTERS
|
|
alloc r26 = ar.pfs, 3, NUMBER_OF_ROTATING_REGISTERS - 3, 0, NUMBER_OF_ROTATING_REGISTERS
|
|
(p13) lfetch [r10], 0x40
|
|
.save pr, r18
|
|
mov r18 = pr
|
|
;;
|
|
} { .mmi
|
|
(p13) lfetch [r10], 0x40
|
|
(p13) lfetch.excl.nt1 [r11], 0x80
|
|
.save ar.lc, r27
|
|
mov.i r27 = ar.lc
|
|
} { .mmi
|
|
mov r28 = r0
|
|
;;
|
|
}
|
|
|
|
.body
|
|
|
|
UnalignedMoveByteLoop:
|
|
{ .mmi
|
|
ld1 r19 = [r33], 1
|
|
cmp.ne p6 = 1, r3
|
|
mov pr.rot = 3<<0x10
|
|
;;
|
|
} { .mib
|
|
add r3 = -1, r3
|
|
shrp r28 = r19, r28, 8
|
|
nop.b 0
|
|
} { .mib
|
|
st1 [r32] = r19, 1
|
|
add r34 = -1, r34
|
|
(p6) br.cond.sptk UnalignedMoveByteLoop
|
|
;;
|
|
} { .mmi
|
|
mov r3 = r33
|
|
and r2 = 7, r32
|
|
mov r33 = r28
|
|
;;
|
|
} { .mmi
|
|
add r9 = r34, r2
|
|
sub r29 = r32, r2
|
|
cmp.eq p6 = 2, r2
|
|
;;
|
|
} { .mii
|
|
cmp.eq p9 = 4, r2
|
|
shr r19 = r9, 3
|
|
cmp.eq p11 = 6, r2
|
|
;;
|
|
} { .mii
|
|
add r19 = -1, r19
|
|
and r9 = 7, r9
|
|
mov.i ar.ec = NUMBER_OF_ROTATING_REGISTERS
|
|
;;
|
|
} { .mmi
|
|
lfetch [r10], 0x40
|
|
lfetch.excl.nt1 [r11], 0x40
|
|
mov.i ar.lc = r19
|
|
} { .bbb
|
|
(p6) br.cond.spnt SpecialLoop2
|
|
(p9) br.cond.spnt SpecialLoop4
|
|
(p11) br.cond.spnt SpecialLoop6
|
|
;;
|
|
} { .mii
|
|
cmp.eq p7 = 3, r2
|
|
cmp.eq p10 = 5, r2
|
|
cmp.eq p12 = 7, r2
|
|
} { .bbb
|
|
(p7) br.cond.spnt SpecialLoop3
|
|
(p10) br.cond.spnt SpecialLoop5
|
|
(p12) br.cond.spnt SpecialLoop7
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop1:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x38
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop1
|
|
;;
|
|
} { .mib
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
br UnalignedByteDone
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop2:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x30
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop2
|
|
;;
|
|
} { .mib
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
br UnalignedByteDone
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop3:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x28
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop3
|
|
;;
|
|
} { .mib
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
br UnalignedByteDone
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop4:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x20
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop4
|
|
;;
|
|
} { .mib
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
br UnalignedByteDone
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop5:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x18
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop5
|
|
;;
|
|
} { .mib
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
br UnalignedByteDone
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop6:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x10
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop6
|
|
;;
|
|
} { .mib
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
br UnalignedByteDone
|
|
;;
|
|
}
|
|
|
|
.align 32
|
|
|
|
SpecialLoop7:
|
|
{ .mmi
|
|
(p16) ld8 r32 = [r3], 8
|
|
(RP2) st8 [r29] = r28, 8
|
|
(RP1) shrp r28 = RR1, RR2, 0x8
|
|
} { .mib
|
|
br.ctop.sptk.many SpecialLoop7
|
|
;;
|
|
} { .mii
|
|
sub r3 = r3, r2
|
|
mov pr = r18
|
|
nop.i 0
|
|
;;
|
|
}
|
|
|
|
UnalignedByteDone:
|
|
{ .mib
|
|
cmp.eq p6 = r0, r9
|
|
mov.i ar.lc = r27
|
|
(p6) br.ret.spnt b0
|
|
;;
|
|
}
|
|
|
|
UnAlignedByteDoneLoop:
|
|
{ .mii
|
|
ld1 r19 = [r3], 1
|
|
add r9 = -1, r9
|
|
;;
|
|
cmp.ne p7 = r0, r9
|
|
} { .mbb
|
|
st1 [r29] = r19, 1
|
|
(p7) br.cond.sptk UnAlignedByteDoneLoop
|
|
br.ret.spnt b0
|
|
;;
|
|
}
|
|
|
|
.endp memcpy#
|