Counter Strike : Global Offensive Source Code
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1000 lines
34 KiB

  1. /*****************************************************************************
  2. * Copyright (C) 2000-2001 Andre McCurdy <[email protected]>
  3. *
  4. * This program is free software; you can redistribute it and/or modify
  5. * it under the terms of the GNU General Public License as published by
  6. * the Free Software Foundation; either version 2 of the License, or
  7. * (at your option) any later version.
  8. *
  9. * This program is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. * GNU General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU General Public License
  15. * along with this program; if not, write to the Free Software
  16. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  17. *
  18. *****************************************************************************
  19. *
  20. * Notes:
  21. *
  22. *
  23. *****************************************************************************
  24. *
  25. * $Id: imdct_l_arm.S,v 1.7 2001/03/25 20:03:34 rob Rel $
  26. *
  27. * 2001/03/24: Andre McCurdy <[email protected]>
  28. * - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
  29. *
  30. * 2000/09/20: Robert Leslie <[email protected]>
  31. * - Added a global symbol with leading underscore per suggestion of
  32. * Simon Burge to support linking with the a.out format.
  33. *
  34. * 2000/09/15: Robert Leslie <[email protected]>
  35. * - Fixed a small bug where flags were changed before a conditional branch.
  36. *
  37. * 2000/09/15: Andre McCurdy <[email protected]>
  38. * - Applied Nicolas Pitre's rounding optimisation in all remaining places.
  39. *
  40. * 2000/09/09: Nicolas Pitre <[email protected]>
  41. * - Optimized rounding + scaling operations.
  42. *
  43. * 2000/08/09: Andre McCurdy <[email protected]>
  44. * - Original created.
  45. *
  46. ****************************************************************************/
  47. /*
  48. On entry:
  49. r0 = pointer to 18 element input array
  50. r1 = pointer to 36 element output array
  51. r2 = windowing block type
  52. Stack frame created during execution of the function:
  53. Initial Holds:
  54. Stack
  55. pointer
  56. minus:
  57. 0
  58. 4 lr
  59. 8 r11
  60. 12 r10
  61. 16 r9
  62. 20 r8
  63. 24 r7
  64. 28 r6
  65. 32 r5
  66. 36 r4
  67. 40 r2 : windowing block type
  68. 44 ct00 high
  69. 48 ct00 low
  70. 52 ct01 high
  71. 56 ct01 low
  72. 60 ct04 high
  73. 64 ct04 low
  74. 68 ct06 high
  75. 72 ct06 low
  76. 76 ct05 high
  77. 80 ct05 low
  78. 84 ct03 high
  79. 88 ct03 low
  80. 92 -ct05 high
  81. 96 -ct05 low
  82. 100 -ct07 high
  83. 104 -ct07 low
  84. 108 ct07 high
  85. 112 ct07 low
  86. 116 ct02 high
  87. 120 ct02 low
  88. */
  /* Values of the windowing block type argument (r2 on entry). */
  89. #define BLOCK_MODE_NORMAL 0
  90. #define BLOCK_MODE_START 1
  91. #define BLOCK_MODE_STOP 3
  /* Byte offsets of the 18 input words, used as [r0, #Xn]. */
  92. #define X0 0x00
  93. #define X1 0x04
  94. #define X2 0x08
  95. #define X3 0x0C
  96. #define X4 0x10
  97. #define X5 0x14
  98. #define X6 0x18
  99. #define X7 0x1c
  100. #define X8 0x20
  101. #define X9 0x24
  102. #define X10 0x28
  103. #define X11 0x2c
  104. #define X12 0x30
  105. #define X13 0x34
  106. #define X14 0x38
  107. #define X15 0x3c
  108. #define X16 0x40
  109. #define X17 0x44
  /* Byte offsets of the 36 output words, used as [r1, #xn]. */
  110. #define x0 0x00
  111. #define x1 0x04
  112. #define x2 0x08
  113. #define x3 0x0C
  114. #define x4 0x10
  115. #define x5 0x14
  116. #define x6 0x18
  117. #define x7 0x1c
  118. #define x8 0x20
  119. #define x9 0x24
  120. #define x10 0x28
  121. #define x11 0x2c
  122. #define x12 0x30
  123. #define x13 0x34
  124. #define x14 0x38
  125. #define x15 0x3c
  126. #define x16 0x40
  127. #define x17 0x44
  128. #define x18 0x48
  129. #define x19 0x4c
  130. #define x20 0x50
  131. #define x21 0x54
  132. #define x22 0x58
  133. #define x23 0x5c
  134. #define x24 0x60
  135. #define x25 0x64
  136. #define x26 0x68
  137. #define x27 0x6c
  138. #define x28 0x70
  139. #define x29 0x74
  140. #define x30 0x78
  141. #define x31 0x7c
  142. #define x32 0x80
  143. #define x33 0x84
  144. #define x34 0x88
  145. #define x35 0x8c
  /*
   * Multiplier constants. Products are reduced by keeping bits[59..28] of
   * the 64-bit result, so these are presumably fixed-point values with 28
   * fractional bits -- TODO confirm against the C reference implementation.
   * minus_K02 is the 32-bit negation of K02.
   */
  146. #define K00 0x0ffc19fd
  147. #define K01 0x00b2aa3e
  148. #define K02 0x0fdcf549
  149. #define K03 0x0216a2a2
  150. #define K04 0x0f9ee890
  151. #define K05 0x03768962
  152. #define K06 0x0f426cb5
  153. #define K07 0x04cfb0e2
  154. #define K08 0x0ec835e8
  155. #define K09 0x061f78aa
  156. #define K10 0x0e313245
  157. #define K11 0x07635284
  158. #define K12 0x0d7e8807
  159. #define K13 0x0898c779
  160. #define K14 0x0cb19346
  161. #define K15 0x09bd7ca0
  162. #define K16 0x0bcbe352
  163. #define K17 0x0acf37ad
  164. #define minus_K02 0xf0230ab7
  /* WLn is the value the windowing code uses as window_l[n]. */
  165. #define WL0 0x00b2aa3e
  166. #define WL1 0x0216a2a2
  167. #define WL2 0x03768962
  168. #define WL3 0x04cfb0e2
  169. #define WL4 0x061f78aa
  170. #define WL5 0x07635284
  171. #define WL6 0x0898c779
  172. #define WL7 0x09bd7ca0
  173. #define WL8 0x0acf37ad
  174. #define WL9 0x0bcbe352
  175. #define WL10 0x0cb19346
  176. #define WL11 0x0d7e8807
  177. #define WL12 0x0e313245
  178. #define WL13 0x0ec835e8
  179. #define WL14 0x0f426cb5
  180. #define WL15 0x0f9ee890
  181. #define WL16 0x0fdcf549
  182. #define WL17 0x0ffc19fd
  183. @*****************************************************************************
  184. .text
  185. .align
  186. .global III_imdct_l
  187. .global _III_imdct_l
  188. III_imdct_l:
  189. _III_imdct_l:
  @ Entry stage of III_imdct_l.
  @ In (per the header comment of this file):
  @   r0 = pointer to 18 element input array
  @   r1 = pointer to 36 element output array
  @   r2 = windowing block type
  @ Saves r2, r4-r11 and lr, then builds the 64-bit terms ct00..ct07
  @ (plus negated copies of ct05 and ct07) and pushes them on the stack
  @ for the table driven loop that follows. Outputs x22, x4, x7, x1, x25
  @ and x19 are computed and stored directly by this stage.
  @ Each 64-bit sum is reduced with a movs (lsr #28) / adc (lsl #4) pair:
  @ this keeps bits[59..28] and adds bit 27, which the movs shifts out
  @ into carry, back in as rounding.
  @ sp is not restored here; the frame is released at the two exits of
  @ the windowing code.
  190. stmdb sp!, { r2, r4 - r11, lr } @ all callee saved regs, plus arg3
  191. ldr r4, =K08 @ r4 = K08
  192. ldr r5, =K09 @ r5 = K09
  193. ldr r8, [r0, #X4] @ r8 = X4
  194. ldr r9, [r0, #X13] @ r9 = X13
  195. rsb r6, r4, #0 @ r6 = -K08
  196. rsb r7, r5, #0 @ r7 = -K09
  197. smull r2, r3, r4, r8 @ r2..r3 = (X4 * K08)
  198. smlal r2, r3, r5, r9 @ r2..r3 = (X4 * K08) + (X13 * K09) = ct01
  199. smull r10, lr, r8, r5 @ r10..lr = (X4 * K09)
  200. smlal r10, lr, r9, r6 @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00
  201. ldr r8, [r0, #X7] @ r8 = X7
  202. ldr r9, [r0, #X16] @ r9 = X16
  203. stmdb sp!, { r2, r3, r10, lr } @ stack ct00_h, ct00_l, ct01_h, ct01_l
  204. add r8, r8, r9 @ r8 = (X7 + X16)
  205. ldr r9, [r0, #X1] @ r9 = X1
  206. smlal r2, r3, r6, r8 @ r2..r3 = ct01 + ((X7 + X16) * -K08)
  207. smlal r2, r3, r7, r9 @ r2..r3 += (X1 * -K09)
  208. ldr r7, [r0, #X10] @ r7 = X10
  209. rsbs r10, r10, #0
  210. rsc lr, lr, #0 @ r10..lr = -ct00
  211. smlal r2, r3, r5, r7 @ r2..r3 += (X10 * K09) = ct06
  212. smlal r10, lr, r9, r6 @ r10..lr = -ct00 + ( X1 * -K08)
  213. smlal r10, lr, r8, r5 @ r10..lr += ((X7 + X16) * K09)
  214. smlal r10, lr, r7, r4 @ r10..lr += ( X10 * K08) = ct04
  215. stmdb sp!, { r2, r3, r10, lr } @ stack ct04_h, ct04_l, ct06_h, ct06_l
  216. @----
  217. ldr r7, [r0, #X0]
  218. ldr r8, [r0, #X11]
  219. ldr r9, [r0, #X12]
  220. sub r7, r7, r8
  221. sub r7, r7, r9 @ r7 = (X0 - X11 -X12) = ct14
  222. ldr r9, [r0, #X3]
  223. ldr r8, [r0, #X8]
  224. ldr r11, [r0, #X15]
  225. sub r8, r8, r9
  226. add r8, r8, r11 @ r8 = (X8 - X3 + X15) = ct16
  227. add r11, r7, r8 @ r11 = ct14 + ct16 = ct18
  228. smlal r2, r3, r6, r11 @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)
  229. ldr r6, [r0, #X2]
  230. ldr r9, [r0, #X9]
  231. ldr r12, [r0, #X14]
  232. sub r6, r6, r9
  233. sub r6, r6, r12 @ r6 = (X2 - X9 - X14) = ct15
  234. ldr r9, [r0, #X5]
  235. ldr r12, [r0, #X6]
  236. sub r9, r9, r12
  237. ldr r12, [r0, #X17]
  238. sub r9, r9, r12 @ r9 = (X5 - X6 - X17) = ct17
  239. add r12, r9, r6 @ r12 = ct15 + ct17 = ct19
  240. smlal r2, r3, r5, r12 @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)
  241. smlal r10, lr, r11, r5 @ r10..lr = ct04 + (ct18 * K09)
  242. smlal r10, lr, r12, r4 @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)
  243. movs r2, r2, lsr #28
  244. adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
  245. str r2, [r1, #x22] @ store result x22
  246. movs r10, r10, lsr #28
  247. adc r10, r10, lr, lsl #4 @ r10 = bits[59..28] of r10..lr
  248. str r10, [r1, #x4] @ store result x4
  249. @----
  250. ldmia sp, { r2, r3, r4, r5 } @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
  251. @ r2..r3 = ct06
  252. @ r4..r5 = ct04
  253. @ r6 = ct15
  254. @ r7 = ct14
  255. @ r8 = ct16
  256. @ r9 = ct17
  257. @ r10 = .
  258. @ r11 = .
  259. @ r12 = .
  260. @ lr = .
  261. ldr r10, =K03 @ r10 = K03
  262. ldr lr, =K15 @ lr = K15
  263. smlal r2, r3, r10, r7 @ r2..r3 = ct06 + (ct14 * K03)
  264. smlal r4, r5, lr, r7 @ r4..r5 = ct04 + (ct14 * K15)
  265. ldr r12, =K14 @ r12 = K14
  266. rsb r10, r10, #0 @ r10 = -K03
  267. smlal r2, r3, lr, r6 @ r2..r3 += (ct15 * K15)
  268. smlal r4, r5, r10, r6 @ r4..r5 += (ct15 * -K03)
  269. smlal r2, r3, r12, r8 @ r2..r3 += (ct16 * K14)
  270. ldr r11, =minus_K02 @ r11 = -K02
  271. rsb r12, r12, #0 @ r12 = -K14
  272. smlal r4, r5, r12, r9 @ r4..r5 += (ct17 * -K14)
  273. smlal r2, r3, r11, r9 @ r2..r3 += (ct17 * -K02)
  274. smlal r4, r5, r11, r8 @ r4..r5 += (ct16 * -K02)
  275. movs r2, r2, lsr #28
  276. adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
  277. str r2, [r1, #x7] @ store result x7
  278. movs r4, r4, lsr #28
  279. adc r4, r4, r5, lsl #4 @ r4 = bits[59..28] of r4..r5
  280. str r4, [r1, #x1] @ store result x1
  281. @----
  282. ldmia sp, { r2, r3, r4, r5 } @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
  283. @ r2..r3 = ct06
  284. @ r4..r5 = ct04
  285. @ r6 = ct15
  286. @ r7 = ct14
  287. @ r8 = ct16
  288. @ r9 = ct17
  289. @ r10 = -K03
  290. @ r11 = -K02
  291. @ r12 = -K14
  292. @ lr = K15
  293. rsbs r2, r2, #0
  294. rsc r3, r3, #0 @ r2..r3 = -ct06
  295. smlal r2, r3, r12, r7 @ r2..r3 = -ct06 + (ct14 * -K14)
  296. smlal r2, r3, r10, r8 @ r2..r3 += (ct16 * -K03)
  297. smlal r4, r5, r12, r6 @ r4..r5 = ct04 + (ct15 * -K14)
  298. smlal r4, r5, r10, r9 @ r4..r5 += (ct17 * -K03)
  299. smlal r4, r5, lr, r8 @ r4..r5 += (ct16 * K15)
  300. smlal r4, r5, r11, r7 @ r4..r5 += (ct14 * -K02)
  301. rsb lr, lr, #0 @ lr = -K15
  302. rsb r11, r11, #0 @ r11 = K02
  303. smlal r2, r3, lr, r9 @ r2..r3 += (ct17 * -K15)
  304. smlal r2, r3, r11, r6 @ r2..r3 += (ct15 * K02)
  305. movs r4, r4, lsr #28
  306. adc r4, r4, r5, lsl #4 @ r4 = bits[59..28] of r4..r5
  307. str r4, [r1, #x25] @ store result x25
  308. movs r2, r2, lsr #28
  309. adc r2, r2, r3, lsl #4 @ r2 = bits[59..28] of r2..r3
  310. str r2, [r1, #x19] @ store result x19
  311. @----
  312. ldr r2, [sp, #16] @ r2 = ct01_l
  313. ldr r3, [sp, #20] @ r3 = ct01_h
  314. ldr r6, [r0, #X1]
  315. ldr r8, [r0, #X7]
  316. ldr r9, [r0, #X10]
  317. ldr r7, [r0, #X16]
  318. rsbs r2, r2, #0
  319. rsc r3, r3, #0 @ r2..r3 = -ct01
  320. mov r4, r2
  321. mov r5, r3 @ r4..r5 = -ct01
  322. @ r2..r3 = -ct01
  323. @ r4..r5 = -ct01
  324. @ r6 = X1
  325. @ r7 = X16
  326. @ r8 = X7
  327. @ r9 = X10
  328. @ r10 = -K03
  329. @ r11 = K02
  330. @ r12 = -K14
  331. @ lr = -K15
  332. smlal r4, r5, r12, r7 @ r4..r5 = -ct01 + (X16 * -K14)
  333. smlal r2, r3, lr, r9 @ r2..r3 = -ct01 + (X10 * -K15)
  334. smlal r4, r5, r10, r8 @ r4..r5 += (X7 * -K03)
  335. smlal r2, r3, r10, r7 @ r2..r3 += (X16 * -K03)
  336. smlal r4, r5, r11, r9 @ r4..r5 += (X10 * K02)
  337. smlal r2, r3, r12, r8 @ r2..r3 += (X7 * -K14)
  338. rsb lr, lr, #0 @ lr = K15
  339. rsb r11, r11, #0 @ r11 = -K02
  340. smlal r4, r5, lr, r6 @ r4..r5 += (X1 * K15) = ct05
  341. smlal r2, r3, r11, r6 @ r2..r3 += (X1 * -K02) = ct03
  342. stmdb sp!, { r2, r3, r4, r5 } @ stack ct05_h, ct05_l, ct03_h, ct03_l
  343. rsbs r4, r4, #0
  344. rsc r5, r5, #0 @ r4..r5 = -ct05
  345. stmdb sp!, { r4, r5 } @ stack -ct05_h, -ct05_l
  346. ldr r2, [sp, #48] @ r2 = ct00_l
  347. ldr r3, [sp, #52] @ r3 = ct00_h
  348. rsb r10, r10, #0 @ r10 = K03
  349. rsbs r4, r2, #0
  350. rsc r5, r3, #0 @ r4..r5 = -ct00
  351. @ r2..r3 = ct00
  352. @ r4..r5 = -ct00
  353. @ r6 = X1
  354. @ r7 = X16
  355. @ r8 = X7
  356. @ r9 = X10
  357. @ r10 = K03
  358. @ r11 = -K02
  359. @ r12 = -K14
  360. @ lr = K15
  361. smlal r4, r5, r10, r6 @ r4..r5 = -ct00 + (X1 * K03)
  362. smlal r2, r3, r10, r9 @ r2..r3 = ct00 + (X10 * K03)
  363. smlal r4, r5, r12, r9 @ r4..r5 += (X10 * -K14)
  364. smlal r2, r3, r12, r6 @ r2..r3 += (X1 * -K14)
  365. smlal r4, r5, r11, r7 @ r4..r5 += (X16 * -K02)
  366. smlal r4, r5, lr, r8 @ r4..r5 += (X7 * K15) = ct07
  367. rsb lr, lr, #0 @ lr = -K15
  368. rsb r11, r11, #0 @ r11 = K02
  369. smlal r2, r3, r11, r8 @ r2..r3 += (X7 * K02)
  370. smlal r2, r3, lr, r7 @ r2..r3 += (X16 * -K15) = ct02
  371. rsbs r6, r4, #0
  372. rsc r7, r5, #0 @ r6..r7 = -ct07
  373. stmdb sp!, { r2 - r7 } @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l
  374. @----
  @ Table driven stage: each pass consumes one 13 word record from
  @ imdct36_long_karray. Words 0..11 multiply X0, X2, X3, X5, X6, X8, X9,
  @ X11, X12, X14, X15 and X17 (in that order). Word 12 is a control word:
  @   bits[31..16] = byte offset from sp of the 64-bit ctxx term to add
  @   bits[15..8]  = byte offset into the output array for the result
  @   bits[7..0]   = non-zero only in the final record (ends the loop)
  @ r0 (input), r1 (output) and sp are left unchanged; r2 walks the table.
  375. add r2, pc, #(imdct36_long_karray-.-8) @ r2 = base address of Knn array; pc relative (the -8 allows for pc reading 8 bytes ahead), so no absolute address is used
  376. loop:
  377. ldr r12, [r0, #X0]
  378. ldmia r2!, { r5 - r11 } @ first 7 words from Karray element
  379. smull r3, r4, r5, r12 @ sum = (Kxx * X0)
  380. ldr r12, [r0, #X2]
  381. ldr r5, [r0, #X3]
  382. smlal r3, r4, r6, r12 @ sum += (Kxx * X2)
  383. ldr r12, [r0, #X5]
  384. ldr r6, [r0, #X6]
  385. smlal r3, r4, r7, r5 @ sum += (Kxx * X3)
  386. smlal r3, r4, r8, r12 @ sum += (Kxx * X5)
  387. ldr r12, [r0, #X8]
  388. ldr r5, [r0, #X9]
  389. smlal r3, r4, r9, r6 @ sum += (Kxx * X6)
  390. smlal r3, r4, r10, r12 @ sum += (Kxx * X8)
  391. smlal r3, r4, r11, r5 @ sum += (Kxx * X9)
  392. ldmia r2!, { r5 - r10 } @ final 6 words from Karray element (r10 = control word)
  393. ldr r11, [r0, #X11]
  394. ldr r12, [r0, #X12]
  395. smlal r3, r4, r5, r11 @ sum += (Kxx * X11)
  396. ldr r11, [r0, #X14]
  397. ldr r5, [r0, #X15]
  398. smlal r3, r4, r6, r12 @ sum += (Kxx * X12)
  399. smlal r3, r4, r7, r11 @ sum += (Kxx * X14)
  400. ldr r11, [r0, #X17]
  401. smlal r3, r4, r8, r5 @ sum += (Kxx * X15)
  402. smlal r3, r4, r9, r11 @ sum += (Kxx * X17)
  403. add r5, sp, r10, lsr #16 @ create index back into stack for required ctxx
  404. ldmia r5, { r6, r7 } @ r6..r7 = ctxx
  405. mov r8, r10, lsl #16 @ push ctxx index off the top end
  406. adds r3, r3, r6 @ add low words
  407. adc r4, r4, r7 @ add high words, with carry
  408. movs r3, r3, lsr #28
  409. adc r3, r3, r4, lsl #4 @ r3 = bits[59..28] of r3..r4
  410. str r3, [r1, r8, lsr #24] @ store result at the output offset (lsr #24 also drops the completion flag bits)
  411. movs r8, r8, lsl #8 @ push result location index off the top end, leaving only the completion flag
  412. beq loop @ loop back if completion flag not set
  413. b imdct_l_windowing @ branch to windowing stage if looping finished
  @ Coefficient table read by the loop that precedes it. Each line is one
  @ record: 12 multipliers followed by a control word laid out as
  @   (stack offset of ctxx << 16) | (output offset << 8) | last-record flag
  @ The records produce, in order, outputs x0, x2, x3, x5, x6, x8, x18,
  @ x20, x21, x23, x24 and x26; only the final record has the flag set.
  @ The table lives in .text so that it can be addressed pc relative.
  414. imdct36_long_karray:
  415. .word K17, -K13, K10, -K06, -K05, K01, -K00, K04, -K07, K11, K12, -K16, 0x00000000
  416. .word K13, K07, K16, K01, K10, -K05, K04, -K11, K00, -K17, K06, -K12, 0x00200800
  417. .word K11, K17, K05, K12, -K01, K06, -K07, K00, -K13, K04, -K16, K10, 0x00200c00
  418. .word K07, K00, -K12, K05, -K16, -K10, K11, -K17, K04, K13, K01, K06, 0x00001400
  419. .word K05, K10, -K00, -K17, K07, -K13, K12, K06, -K16, K01, -K11, -K04, 0x00181800
  420. .word K01, K05, -K07, -K11, K13, K17, -K16, -K12, K10, K06, -K04, -K00, 0x00102000
  421. .word -K16, K12, -K11, K07, K04, -K00, -K01, K05, -K06, K10, K13, -K17, 0x00284800
  422. .word -K12, K06, K17, -K00, -K11, K04, K05, -K10, K01, K16, -K07, -K13, 0x00085000
  423. .word -K10, K16, K04, -K13, -K00, K07, K06, -K01, -K12, -K05, K17, K11, 0x00105400
  424. .word -K06, -K01, K13, K04, K17, -K11, -K10, -K16, -K05, K12, K00, K07, 0x00185c00
  425. .word -K04, -K11, -K01, K16, K06, K12, K13, -K07, -K17, -K00, -K10, -K05, 0x00006000
  426. .word -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801
  427. @----
  428. @-------------------------------------------------------------------------
  429. @----
  430. imdct_l_windowing:
  @ Windowing stage entry: reloads the block type and x[0]..x[8], then
  @ dispatches on the block type. A stop block branches away; normal and
  @ start blocks fall through to the code that follows.
  431. ldr r11, [sp, #80] @ fetch function parameter 3 from out of the stack (saved r2 sits above the 20 words pushed since entry)
  432. ldmia r1!, { r0, r2 - r9 } @ load 9 words from x0, update pointer
  433. @ r0 = x0
  434. @ r1 = &x[9]
  435. @ r2 = x1
  436. @ r3 = x2
  437. @ r4 = x3
  438. @ r5 = x4
  439. @ r6 = x5
  440. @ r7 = x6
  441. @ r8 = x7
  442. @ r9 = x8
  443. @ r10 = .
  444. @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
  445. @ r12 = .
  446. @ lr = .
  447. cmp r11, #BLOCK_MODE_STOP @ setup flags
  448. rsb r10, r0, #0 @ r10 = -x0 (DONT change flags !!)
  449. beq stop_block_x0_to_x17
  450. @ start and normal blocks are treated the same for x[0]..x[17]
  451. normal_block_x0_to_x17:
  @ Writes windowed x[9]..x[17], formed as window_l[9+i] * -x[8-i] for
  @ i = 0..8, then windows x[0]..x[8] in place with window_l[0]..window_l[8].
  @ Leaves r1 = &x[0]. A start block then branches to its own x[18]..x[35]
  @ code; a normal block falls through.
  452. ldr r12, =WL9 @ r12 = window_l[9]
  453. rsb r0, r9, #0 @ r0 = -x8
  454. rsb r9, r2, #0 @ r9 = -x1
  455. rsb r2, r8, #0 @ r2 = -x7
  456. rsb r8, r3, #0 @ r8 = -x2
  457. rsb r3, r7, #0 @ r3 = -x6
  458. rsb r7, r4, #0 @ r7 = -x3
  459. rsb r4, r6, #0 @ r4 = -x5
  460. rsb r6, r5, #0 @ r6 = -x4
  461. @ r0 = -x8
  462. @ r1 = &x[9]
  463. @ r2 = -x7
  464. @ r3 = -x6
  465. @ r4 = -x5
  466. @ r5 = .
  467. @ r6 = -x4
  468. @ r7 = -x3
  469. @ r8 = -x2
  470. @ r9 = -x1
  471. @ r10 = -x0
  472. @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
  473. @ r12 = window_l[9]
  474. @ lr = .
  475. smull r5, lr, r12, r0 @ r5..lr = (window_l[9] * (x[9] == -x[8]))
  476. ldr r12, =WL10 @ r12 = window_l[10]
  477. movs r5, r5, lsr #28
  478. adc r0, r5, lr, lsl #4 @ r0 = bits[59..28] of windowed x9
  479. smull r5, lr, r12, r2 @ r5..lr = (window_l[10] * (x[10] == -x[7]))
  480. ldr r12, =WL11 @ r12 = window_l[11]
  481. movs r5, r5, lsr #28
  482. adc r2, r5, lr, lsl #4 @ r2 = bits[59..28] of windowed x10
  483. smull r5, lr, r12, r3 @ r5..lr = (window_l[11] * (x[11] == -x[6]))
  484. ldr r12, =WL12 @ r12 = window_l[12]
  485. movs r5, r5, lsr #28
  486. adc r3, r5, lr, lsl #4 @ r3 = bits[59..28] of windowed x11
  487. smull r5, lr, r12, r4 @ r5..lr = (window_l[12] * (x[12] == -x[5]))
  488. ldr r12, =WL13 @ r12 = window_l[13]
  489. movs r5, r5, lsr #28
  490. adc r4, r5, lr, lsl #4 @ r4 = bits[59..28] of windowed x12
  491. smull r5, lr, r12, r6 @ r5..lr = (window_l[13] * (x[13] == -x[4]))
  492. ldr r12, =WL14 @ r12 = window_l[14]
  493. movs r5, r5, lsr #28
  494. adc r6, r5, lr, lsl #4 @ r6 = bits[59..28] of windowed x13
  495. smull r5, lr, r12, r7 @ r5..lr = (window_l[14] * (x[14] == -x[3]))
  496. ldr r12, =WL15 @ r12 = window_l[15]
  497. movs r5, r5, lsr #28
  498. adc r7, r5, lr, lsl #4 @ r7 = bits[59..28] of windowed x14
  499. smull r5, lr, r12, r8 @ r5..lr = (window_l[15] * (x[15] == -x[2]))
  500. ldr r12, =WL16 @ r12 = window_l[16]
  501. movs r5, r5, lsr #28
  502. adc r8, r5, lr, lsl #4 @ r8 = bits[59..28] of windowed x15
  503. smull r5, lr, r12, r9 @ r5..lr = (window_l[16] * (x[16] == -x[1]))
  504. ldr r12, =WL17 @ r12 = window_l[17]
  505. movs r5, r5, lsr #28
  506. adc r9, r5, lr, lsl #4 @ r9 = bits[59..28] of windowed x16
  507. smull r5, lr, r12, r10 @ r5..lr = (window_l[17] * (x[17] == -x[0]))
  508. ldr r12, =WL0 @ r12 = window_l[0]
  509. movs r5, r5, lsr #28
  510. adc r10, r5, lr, lsl #4 @ r10 = bits[59..28] of windowed x17
  511. stmia r1, { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
  512. ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x0
  513. smull r10, lr, r12, r0 @ r10..lr = (window_l[0] * x[0])
  514. ldr r12, =WL1 @ r12 = window_l[1]
  515. movs r10, r10, lsr #28
  516. adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x0
  517. smull r10, lr, r12, r2 @ r10..lr = (window_l[1] * x[1])
  518. ldr r12, =WL2 @ r12 = window_l[2]
  519. movs r10, r10, lsr #28
  520. adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x1
  521. smull r10, lr, r12, r3 @ r10..lr = (window_l[2] * x[2])
  522. ldr r12, =WL3 @ r12 = window_l[3]
  523. movs r10, r10, lsr #28
  524. adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x2
  525. smull r10, lr, r12, r4 @ r10..lr = (window_l[3] * x[3])
  526. ldr r12, =WL4 @ r12 = window_l[4]
  527. movs r10, r10, lsr #28
  528. adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x3
  529. smull r10, lr, r12, r5 @ r10..lr = (window_l[4] * x[4])
  530. ldr r12, =WL5 @ r12 = window_l[5]
  531. movs r10, r10, lsr #28
  532. adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x4
  533. smull r10, lr, r12, r6 @ r10..lr = (window_l[5] * x[5])
  534. ldr r12, =WL6 @ r12 = window_l[6]
  535. movs r10, r10, lsr #28
  536. adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x5
  537. smull r10, lr, r12, r7 @ r10..lr = (window_l[6] * x[6])
  538. ldr r12, =WL7 @ r12 = window_l[7]
  539. movs r10, r10, lsr #28
  540. adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x6
  541. smull r10, lr, r12, r8 @ r10..lr = (window_l[7] * x[7])
  542. ldr r12, =WL8 @ r12 = window_l[8]
  543. movs r10, r10, lsr #28
  544. adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x7
  545. smull r10, lr, r12, r9 @ r10..lr = (window_l[8] * x[8])
  546. movs r10, r10, lsr #28
  547. adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x8
  548. stmia r1, { r0, r2 - r9 } @ store windowed x[0] .. x[8]
  549. cmp r11, #BLOCK_MODE_START
  550. beq start_block_x18_to_x35
  551. @----
  552. normal_block_x18_to_x35:
  @ Expects r1 = &x[0]. Writes x[27]..x[35] as the mirror image of
  @ x[26]..x[18] scaled by window_l[8]..window_l[0], then scales
  @ x[18]..x[26] in place by window_l[17]..window_l[9]. Finally releases
  @ the stack frame and returns. Also reached from the stop block path.
  553. ldr r11, =WL3 @ r11 = window_l[3]
  554. ldr r12, =WL4 @ r12 = window_l[4]
  555. add r1, r1, #(18*4) @ r1 = &x[18]
  556. ldmia r1!, { r0, r2 - r4, r6 - r10 } @ load 9 words from x18, update pointer
  557. @ r0 = x18
  558. @ r1 = &x[27]
  559. @ r2 = x19
  560. @ r3 = x20
  561. @ r4 = x21
  562. @ r5 = .
  563. @ r6 = x22
  564. @ r7 = x23
  565. @ r8 = x24
  566. @ r9 = x25
  567. @ r10 = x26
  568. @ r11 = window_l[3]
  569. @ r12 = window_l[4]
  570. @ lr = .
  571. smull r5, lr, r12, r6 @ r5..lr = (window_l[4] * (x[22] == x[31]))
  572. movs r5, r5, lsr #28
  573. adc r5, r5, lr, lsl #4 @ r5 = bits[59..28] of windowed x31
  574. smull r6, lr, r11, r4 @ r6..lr = (window_l[3] * (x[21] == x[32]))
  575. ldr r12, =WL5 @ r12 = window_l[5]
  576. movs r6, r6, lsr #28
  577. adc r6, r6, lr, lsl #4 @ r6 = bits[59..28] of windowed x32
  578. smull r4, lr, r12, r7 @ r4..lr = (window_l[5] * (x[23] == x[30]))
  579. ldr r11, =WL1 @ r11 = window_l[1]
  580. ldr r12, =WL2 @ r12 = window_l[2]
  581. movs r4, r4, lsr #28
  582. adc r4, r4, lr, lsl #4 @ r4 = bits[59..28] of windowed x30
  583. smull r7, lr, r12, r3 @ r7..lr = (window_l[2] * (x[20] == x[33]))
  584. ldr r12, =WL6 @ r12 = window_l[6]
  585. movs r7, r7, lsr #28
  586. adc r7, r7, lr, lsl #4 @ r7 = bits[59..28] of windowed x33
  587. smull r3, lr, r12, r8 @ r3..lr = (window_l[6] * (x[24] == x[29]))
  588. movs r3, r3, lsr #28
  589. adc r3, r3, lr, lsl #4 @ r3 = bits[59..28] of windowed x29
  590. smull r8, lr, r11, r2 @ r8..lr = (window_l[1] * (x[19] == x[34]))
  591. ldr r12, =WL7 @ r12 = window_l[7]
  592. ldr r11, =WL8 @ r11 = window_l[8]
  593. movs r8, r8, lsr #28
  594. adc r8, r8, lr, lsl #4 @ r8 = bits[59..28] of windowed x34
  595. smull r2, lr, r12, r9 @ r2..lr = (window_l[7] * (x[25] == x[28]))
  596. ldr r12, =WL0 @ r12 = window_l[0]
  597. movs r2, r2, lsr #28
  598. adc r2, r2, lr, lsl #4 @ r2 = bits[59..28] of windowed x28
  599. smull r9, lr, r12, r0 @ r9..lr = (window_l[0] * (x[18] == x[35]))
  600. movs r9, r9, lsr #28
  601. adc r9, r9, lr, lsl #4 @ r9 = bits[59..28] of windowed x35
  602. smull r0, lr, r11, r10 @ r0..lr = (window_l[8] * (x[26] == x[27]))
  603. ldr r11, =WL16 @ r11 = window_l[16]
  604. ldr r12, =WL17 @ r12 = window_l[17]
  605. movs r0, r0, lsr #28
  606. adc r0, r0, lr, lsl #4 @ r0 = bits[59..28] of windowed x27
  607. stmia r1, { r0, r2 - r9 } @ store windowed x[27] .. x[35]
  608. ldmdb r1!, { r0, r2 - r9 } @ load 9 words downto (and including) x18
  609. smull r10, lr, r12, r0 @ r10..lr = (window_l[17] * x[18])
  610. movs r10, r10, lsr #28
  611. adc r0, r10, lr, lsl #4 @ r0 = bits[59..28] of windowed x18
  612. smull r10, lr, r11, r2 @ r10..lr = (window_l[16] * x[19])
  613. ldr r11, =WL14 @ r11 = window_l[14]
  614. ldr r12, =WL15 @ r12 = window_l[15]
  615. movs r10, r10, lsr #28
  616. adc r2, r10, lr, lsl #4 @ r2 = bits[59..28] of windowed x19
  617. smull r10, lr, r12, r3 @ r10..lr = (window_l[15] * x[20])
  618. movs r10, r10, lsr #28
  619. adc r3, r10, lr, lsl #4 @ r3 = bits[59..28] of windowed x20
  620. smull r10, lr, r11, r4 @ r10..lr = (window_l[14] * x[21])
  621. ldr r11, =WL12 @ r11 = window_l[12]
  622. ldr r12, =WL13 @ r12 = window_l[13]
  623. movs r10, r10, lsr #28
  624. adc r4, r10, lr, lsl #4 @ r4 = bits[59..28] of windowed x21
  625. smull r10, lr, r12, r5 @ r10..lr = (window_l[13] * x[22])
  626. movs r10, r10, lsr #28
  627. adc r5, r10, lr, lsl #4 @ r5 = bits[59..28] of windowed x22
  628. smull r10, lr, r11, r6 @ r10..lr = (window_l[12] * x[23])
  629. ldr r11, =WL10 @ r11 = window_l[10]
  630. ldr r12, =WL11 @ r12 = window_l[11]
  631. movs r10, r10, lsr #28
  632. adc r6, r10, lr, lsl #4 @ r6 = bits[59..28] of windowed x23
  633. smull r10, lr, r12, r7 @ r10..lr = (window_l[11] * x[24])
  634. movs r10, r10, lsr #28
  635. adc r7, r10, lr, lsl #4 @ r7 = bits[59..28] of windowed x24
  636. smull r10, lr, r11, r8 @ r10..lr = (window_l[10] * x[25])
  637. ldr r12, =WL9 @ r12 = window_l[9]
  638. movs r10, r10, lsr #28
  639. adc r8, r10, lr, lsl #4 @ r8 = bits[59..28] of windowed x25
  640. smull r10, lr, r12, r9 @ r10..lr = (window_l[9] * x[26])
  641. movs r10, r10, lsr #28
  642. adc r9, r10, lr, lsl #4 @ r9 = bits[59..28] of windowed x26
  643. stmia r1, { r0, r2 - r9 } @ store windowed x[18] .. x[26]
  644. @----
  645. @ NB there are 2 possible exits from this function - this is only one of them
  646. @----
  647. add sp, sp, #(21*4) @ return stack frame (20 words of ctxx temporaries plus the saved r2)
  648. ldmia sp!, { r4 - r11, pc } @ restore callee saved regs, and return
  649. @----
  650. stop_block_x0_to_x17:
  @ Stop block handling of x[0]..x[17]:
  @   x[12]..x[17] = -x[5]..-x[0], stored without windowing
  @   x[6]..x[8]   = x[6]..x[8] scaled by WL1, WL4, WL7
  @   x[9]..x[11]  = -x[8]..-x[6] scaled by WL10, WL13, WL16
  @   x[0]..x[5]   = 0
  @ Leaves r1 = &x[0] and continues with the normal x[18]..x[35] code.
  651. @ r0 = x0
  652. @ r1 = &x[9]
  653. @ r2 = x1
  654. @ r3 = x2
  655. @ r4 = x3
  656. @ r5 = x4
  657. @ r6 = x5
  658. @ r7 = x6
  659. @ r8 = x7
  660. @ r9 = x8
  661. @ r10 = -x0
  662. @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
  663. @ r12 = .
  664. @ lr = .
  665. rsb r0, r6, #0 @ r0 = -x5
  666. rsb r6, r2, #0 @ r6 = -x1
  667. rsb r2, r5, #0 @ r2 = -x4
  668. rsb r5, r3, #0 @ r5 = -x2
  669. rsb r3, r4, #0 @ r3 = -x3
  670. add r1, r1, #(3*4) @ r1 = &x[12]
  671. stmia r1, { r0, r2, r3, r5, r6, r10 } @ store unchanged x[12] .. x[17]
  672. ldr r0, =WL1 @ r0 = window_l[1] == window_s[0]
  673. rsb r10, r9, #0 @ r10 = -x8
  674. rsb r12, r8, #0 @ r12 = -x7
  675. rsb lr, r7, #0 @ lr = -x6
  676. @ r0 = WL1
  677. @ r1 = &x[12]
  678. @ r2 = .
  679. @ r3 = .
  680. @ r4 = .
  681. @ r5 = .
  682. @ r6 = .
  683. @ r7 = x6
  684. @ r8 = x7
  685. @ r9 = x8
  686. @ r10 = -x8
  687. @ r11 = window mode: (0 == normal), (1 == start block), (3 == stop block)
  688. @ r12 = -x7
  689. @ lr = -x6
  690. smull r5, r6, r0, r7 @ r5..r6 = (window_l[1] * x[6])
  691. ldr r2, =WL4 @ r2 = window_l[4] == window_s[1]
  692. movs r5, r5, lsr #28
  693. adc r7, r5, r6, lsl #4 @ r7 = bits[59..28] of windowed x6
  694. smull r5, r6, r2, r8 @ r5..r6 = (window_l[4] * x[7])
  695. ldr r3, =WL7 @ r3 = window_l[7] == window_s[2]
  696. movs r5, r5, lsr #28
  697. adc r8, r5, r6, lsl #4 @ r8 = bits[59..28] of windowed x7
  698. smull r5, r6, r3, r9 @ r5..r6 = (window_l[7] * x[8])
  699. ldr r4, =WL10 @ r4 = window_l[10] == window_s[3]
  700. movs r5, r5, lsr #28
  701. adc r9, r5, r6, lsl #4 @ r9 = bits[59..28] of windowed x8
  702. smull r5, r6, r4, r10 @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
  703. ldr r0, =WL13 @ r0 = window_l[13] == window_s[4]
  704. movs r5, r5, lsr #28
  705. adc r10, r5, r6, lsl #4 @ r10 = bits[59..28] of windowed x9
  706. smull r5, r6, r0, r12 @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
  707. ldr r2, =WL16 @ r2 = window_l[16] == window_s[5]
  708. movs r5, r5, lsr #28
  709. adc r12, r5, r6, lsl #4 @ r12 = bits[59..28] of windowed x10
  710. smull r5, r6, r2, lr @ r5..r6 = (window_l[16] * (x[11] == -x[6]))
  711. ldr r0, =0x00
  712. movs r5, r5, lsr #28
  713. adc lr, r5, r6, lsl #4 @ lr = bits[59..28] of windowed x11
  714. stmdb r1!, { r7 - r10, r12, lr } @ store windowed x[6] .. x[11]
  715. ldr r5, =0x00
  716. ldr r6, =0x00
  717. ldr r2, =0x00
  718. ldr r3, =0x00
  719. ldr r4, =0x00
  720. stmdb r1!, { r0, r2 - r6 } @ store windowed x[0] .. x[5] (all zero)
  721. b normal_block_x18_to_x35
  722. @----
  723. start_block_x18_to_x35:
  @ Start block handling of x[18]..x[35]; expects r1 = &x[0].
  @   x[24]..x[26] scaled in place by WL16, WL13, WL10
  @   x[27]..x[29] = x[26]..x[24] scaled by WL7, WL4, WL1
  @   x[30]..x[35] = 0
  @ x[18]..x[23] are not written by this path.
  @ Releases the stack frame and returns.
  724. ldr r4, =WL1 @ r4 = window_l[1] == window_s[0]
  725. add r1, r1, #(24*4) @ r1 = &x[24]
  726. ldmia r1, { r0, r2, r3 } @ load 3 words from x24, dont update pointer
  727. @ r0 = x24
  728. @ r1 = &x[24]
  729. @ r2 = x25
  730. @ r3 = x26
  731. @ r4 = WL1
  732. @ r5 = WL4 (r5..r9 are loaded one at a time by the ldr instructions below)
  733. @ r6 = WL7
  734. @ r7 = WL10
  735. @ r8 = WL13
  736. @ r9 = WL16
  737. @ r10 = .
  738. @ r11 = .
  739. @ r12 = .
  740. @ lr = .
  741. ldr r5, =WL4 @ r5 = window_l[4] == window_s[1]
  742. smull r10, r11, r4, r0 @ r10..r11 = (window_l[1] * (x[24] == x[29]))
  743. ldr r6, =WL7 @ r6 = window_l[7] == window_s[2]
  744. movs r10, r10, lsr #28
  745. adc lr, r10, r11, lsl #4 @ lr = bits[59..28] of windowed x29
  746. smull r10, r11, r5, r2 @ r10..r11 = (window_l[4] * (x[25] == x[28]))
  747. ldr r7, =WL10 @ r7 = window_l[10] == window_s[3]
  748. movs r10, r10, lsr #28
  749. adc r12, r10, r11, lsl #4 @ r12 = bits[59..28] of windowed x28
  750. smull r10, r11, r6, r3 @ r10..r11 = (window_l[7] * (x[26] == x[27]))
  751. ldr r8, =WL13 @ r8 = window_l[13] == window_s[4]
  752. movs r10, r10, lsr #28
  753. adc r4, r10, r11, lsl #4 @ r4 = bits[59..28] of windowed x27
  754. smull r10, r11, r7, r3 @ r10..r11 = (window_l[10] * x[26])
  755. ldr r9, =WL16 @ r9 = window_l[16] == window_s[5]
  756. movs r10, r10, lsr #28
  757. adc r3, r10, r11, lsl #4 @ r3 = bits[59..28] of windowed x26
  758. smull r10, r11, r8, r2 @ r10..r11 = (window_l[13] * x[25])
  759. ldr r5, =0x00
  760. movs r10, r10, lsr #28
  761. adc r2, r10, r11, lsl #4 @ r2 = bits[59..28] of windowed x25
  762. smull r10, r11, r9, r0 @ r10..r11 = (window_l[16] * x[24])
  763. ldr r6, =0x00
  764. movs r10, r10, lsr #28
  765. adc r0, r10, r11, lsl #4 @ r0 = bits[59..28] of windowed x24
  766. stmia r1!, { r0, r2, r3, r4, r12, lr } @ store windowed x[24] .. x[29]
  767. ldr r7, =0x00
  768. ldr r8, =0x00
  769. ldr r9, =0x00
  770. ldr r10, =0x00
  771. stmia r1!, { r5 - r10 } @ store windowed x[30] .. x[35] (all zero)
  772. @----
  773. @ NB there are 2 possible exits from this function - this is only one of them
  774. @----
  775. add sp, sp, #(21*4) @ return stack frame (20 words of ctxx temporaries plus the saved r2)
  776. ldmia sp!, { r4 - r11, pc } @ restore callee saved regs, and return
  777. @----
  778. @END
  779. @----