Leaked source code of windows server 2003
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

176 lines
4.3 KiB

  1. // strcat.s: function to concatenate 2 strings
  2. // Copyright (c) 2000, Intel Corporation
  3. // All rights reserved.
  4. //
  5. // WARRANTY DISCLAIMER
  6. //
  7. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  8. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  9. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  10. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  11. // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  12. // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  13. // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  14. // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  15. // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  16. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  17. // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  18. //
  19. // Intel Corporation is the author of this code, and requests that all
  20. // problem reports or change requests be submitted to it directly at
  21. // http://developer.intel.com/opensource.
  22. //
  23. .file "strcat.s"
  24. .section .text
  25. // -- Begin strcat
      //
      // strcat: append the NUL-terminated string addressed by the second
      // argument (r33 on entry) to the end of the NUL-terminated string
      // addressed by the first argument (r32 on entry).
      //
      // Register use visible in this routine:
      //   r32, r33  incoming arguments (alloc declares 2 inputs); r32-r39 are
      //             also the 8 rotating registers used by the pipelined loops
      //   r8        copy of the first argument, never modified afterwards
      //             (presumably the return value -- confirm against the ABI)
      //   r9        read cursor into the second string
      //   r14       receives ar.pfs from alloc, then is reused as the cursor
      //             into the first string (scan cursor, later write cursor)
      //   r11       saved predicate registers, restored before each br.ret
      //   r15       0xffffffff00000000, comparison operand for pcmp1.eq
      //   r16       zero-byte detection mask, later zero-byte index
      //
      // Phases:
      //   .bs1len / .bws1   pipelined byte scan for the NUL ending the first string
      //   .bcat / .bwcat    4-bytes-per-iteration copy, taken only when both
      //                     cursors are 4-byte aligned; followed by a 0-3 byte
      //                     tail and the terminating NUL
      //   .b_notaligned     byte-per-iteration copy otherwise
      //   .natfault1_0 / .natfault2_0
      //                     recovery code reached from chk.s: redo the load
      //                     non-speculatively and re-enter the loop
      //
  26. .proc strcat#
  27. .global strcat#
  28. .align 32
  29. .prologue
  30. strcat:
  31. { .mib
  32. alloc r14=ar.pfs,2,6,0,8 // 2 inputs, 6 locals, 0 outputs, 8 rotating registers
  33. .save pr, r11
  34. mov r11=pr //Save predicate register file
  35. brp.loop.imp .bs1len, .bws1 // Put loop backedge target in TAR
  36. }
  37. .body
  38. // Setup for doing software pipelined loops
  39. { .mib
  40. mov r9=r33 // r9 = cursor into the second string (the one being appended)
  41. mov pr.rot=0x30000 // p16=p17=1
  42. nop.b 0
  43. };;
  44. { .mib
  45. mov r14=r32 // r14 = scan cursor into the first string (overwrites the alloc result)
  46. mov ar.ec=0 // epilog count = 0
  47. nop.b 0
  48. } { .mib // Extra bundle to align bs1len.
  49. mov r8=r32 // keep the original first argument in r8; r8 is not written again
  50. nop.i 0
  51. brp.loop.imp .bcat, .bwcat ;; // Put loop backedge target in TAR
  52. }
      // Loop 1: find the NUL that terminates the first string.
      // A byte is loaded speculatively into r37 and is tested two rotations
      // later, when it is named r39.
  53. .bs1len:
  54. { .mii
  55. ld1.s r37=[r14],1 // *s (r37,r38,r39); speculative load, r14 post-incremented by 1
  56. nop.i 0
  57. (p19) chk.s r39,.natfault1_0 // byte about to be tested had its fault deferred: go redo the load
  58. }
  59. .bws1:
  60. { .mfb
  61. (p19) cmp4.ne p17,p0=r39,r0 // *s==0 (p16,p17,p18); p17 = (byte != 0)
  62. nop.f 0
  63. (p17) br.wtop.dptk .bs1len ;; // keep scanning while p17 is set
  64. }
  65. //
  66. // Now concatenate s2 into the end of s1
  67. // (s1 = first argument, cursor r14; s2 = second argument, cursor r9)
  68. { .mib
  69. add r14=-3,r14 // Since ld1.s is 2 stages ahead: step r14 back onto the NUL just found
  70. dep r15=1,r0,32,32 // rb = 0xffffffff00000000
  71. clrrrb ;; // reset the rotating register bases before the next loop
  72. } { .mii
  73. // Setup for doing software pipelined loops
  74. or r32=r14,r9 // combine both cursors so one test covers both alignments
  75. mov pr.rot=0x30000 ;; // p16=p17=1
  76. and r32=3,r32 ;; // low two address bits of either cursor
  77. } { .mib
  78. cmp4.ne p10,p0=r32,r0 // p10 = at least one cursor is not 4-byte aligned
  79. mov ar.ec=0 // epilog count = 0
  80. (p10) br.spnt .b_notaligned ;; // unaligned: use the byte-at-a-time copy
  81. }
      // Loop 2: aligned copy, 4 bytes per iteration.
      // A word is loaded into r32, checked for a zero byte one rotation later
      // (as r33) and stored one rotation after that (as r34), so a word that
      // contains the terminator is never stored by st4.
  82. .bcat:
  83. { .mii
  84. ld4.s r32=[r9],4 // speculative 4-byte load from the second string (r32,r33,r34)
  85. (p18) chk.s r33,.natfault2_0 // word about to be tested had its fault deferred: go redo the load
  86. (p18) pcmp1.eq r16=r33,r15 ;; // r16 !=0 only if a zero byte is found
  87. }
  88. .bwcat:
  89. { .mib
  90. (p19) st4 [r14]=r34,4 // store a fully checked word to the end of the first string
  91. (p18) cmp4.eq p17,p0=r16,r0 // zero byte found? p17 = no, keep copying
  92. (p17) br.wtop.dptk .bcat ;; // loop while no zero byte has been seen
  93. }
      // Tail: r33 is the word in which the zero byte was detected. Store the
      // 0-3 bytes that precede the zero byte, then the terminator.
      // NOTE(review): the byte order of the stores below assumes the low-order
      // byte of r33 is the lowest-addressed byte (little-endian data) -- confirm.
  94. { .mfi
  95. nop.m 0
  96. nop.f 0
  97. czx1.r r16 = r33 // r16 = index of the lowest-order zero byte in r33
  98. } ;;
  99. { .mfi
  100. cmp.leu p2, p0 = 2, r16 // p2 = (index >= 2): byte 1 must be stored
  101. nop.f 0
  102. shr.u r35 = r33, 8 // r35 low byte = byte 1 of the word
  103. }
  104. { .mfi
  105. cmp.eq p4, p0 = 3, r16 // p4 = (index == 3): byte 2 must be stored
  106. nop.f 0
  107. cmp.ne p5, p0 = r0, r16 // p5 = (index != 0): byte 0 must be stored
  108. } ;;
  109. { .mfi
  110. (p5)st1 [r14] = r33, 1 // store byte 0
  111. nop.f 0
  112. shr.u r36 = r33, 16 // r36 low byte = byte 2 of the word
  113. };;
  114. { .mfi
  115. (p2)st1 [r14] = r35,1 // store byte 1
  116. nop.f 0
  117. nop.i 0
  118. } ;;
  119. { .mfi
  120. (p4)st1 [r14] = r36,1 // store byte 2
  121. nop.f 0
  122. nop.i 0
  123. };;
  124. { .mib
  125. (p0) st1 [r14] = r0 // write the terminating NUL (p0 is always true)
  126. nop.i 0
  127. clrrrb // reset the rotating register bases before returning
  128. } ;;
  129. { .mib
  130. nop.m 0
  131. mov pr=r11,0x1003e // restore p1-p5 and the rotating predicates p16-p63 from r11
  132. br.ret.sptk.many b0 ;;
  133. }
      // Unaligned path: copy one byte per iteration, terminator included
      // (the byte is stored before it is tested).
  134. .b_notaligned:
  135. { .mmi
  136. ld1 r32=[r9],1 ;; // 2 cycle load causes 1 cycle stall
  137. st1 [r14]=r32,1 // 3 cycles between st1 to avoid flush
  138. cmp4.ne.unc p7,p0=r32,r0 ;; // Extra stop bit to force 3 cycles
  139. } { .mib
  140. nop.m 0
  141. nop.i 0
  142. (p7) br.cond.dptk .b_notaligned ;; // p7 = byte just copied was not NUL
  143. } { .mib
  144. nop.m 0
  145. mov pr=r11,0x1003e // restore p1-p5 and the rotating predicates p16-p63 from r11
  146. br.ret.sptk.many b0 ;;
  147. }
      // Recovery for chk.s in loop 1: r14 has been post-incremented three
      // times since the byte now named r39 was loaded.
  148. .natfault1_0:
  149. { .mmi
  150. add r39=-3,r14 ;; // address the byte that r39 was loaded from
  151. ld1 r39=[r39] // Redo the load
  152. nop.i 0
  153. } { .mib
  154. nop.m 0
  155. nop.i 0
  156. br.sptk .bws1 ;; // resume at the compare bundle of loop 1
  157. }
      // Recovery for chk.s in loop 2: r9 has been post-incremented twice
      // (by 4 each time) since the word now named r33 was loaded.
  158. .natfault2_0:
  159. { .mmi
  160. add r33=-8,r9 ;; // address the word that r33 was loaded from
  161. ld4 r33=[r33] // redo the load non-speculatively (r32,r33,r34)
  162. nop.i 0;;
  163. } { .mib
  164. nop.m 0
  165. (p18) pcmp1.eq r16=r33,r15 // r16 !=0 only if a zero byte is found
  166. br.sptk .bwcat ;; // resume at the store/compare bundle of loop 2
  167. }
  168. _2_1_2auto_size == 0x0 // NOTE(review): looks like a tool-generated symbol assignment; not referenced in the visible code
  169. // -- End strcat
  170. .endp strcat#
  171. // mark_proc_addr_taken strcat;
  172. // End