Counter Strike : Global Offensive Source Code
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1565 lines
36 KiB

  1. include ksamd64.inc
  2. EXTERNDEF s_sosemanukMulTables:FAR
  3. .CODE
  4. ALIGN 8
  5. Salsa20_OperateKeystream PROC FRAME          ; Win64: rcx=output, rdx=input (0 = keystream only), r8=block count, r9=round count; roles inferred from usage below -- NOTE(review): confirm against caller
  6. mov r10, [rsp + 5*8]                         ; r10 = 5th argument (first stack arg under Win64) = Salsa20 state pointer
  7. alloc_stack(10*16 + 32*16 + 8)               ; 10*16 for xmm6-15 saves, 32*16 workspace (two 16-lane banks), +8 to keep rsp 16-aligned
  8. save_xmm128 xmm6, 0200h                      ; xmm6-xmm15 are callee-saved on Win64; spill all ten
  9. save_xmm128 xmm7, 0210h
  10. save_xmm128 xmm8, 0220h
  11. save_xmm128 xmm9, 0230h
  12. save_xmm128 xmm10, 0240h
  13. save_xmm128 xmm11, 0250h
  14. save_xmm128 xmm12, 0260h
  15. save_xmm128 xmm13, 0270h
  16. save_xmm128 xmm14, 0280h
  17. save_xmm128 xmm15, 0290h
  18. .endprolog
; Fast path: with at least 4 blocks remaining, compute 4 Salsa20 blocks in parallel,
; one state word per 16-byte lane (4 dwords = the same word of 4 blocks).
  19. cmp r8, 4
  20. jl label5                                   ; fewer than 4 blocks -> single-block tail loop
; Broadcast each of the 16 state dwords into its own lane at [rsp + slot*16 + 256].
; Slots 5 and 8 (the block-counter words) are NOT broadcast here; they are filled
; per iteration at label1 with four consecutive counter values.
  21. movdqa xmm0, [r10 + 0*16]
  22. movdqa xmm1, [r10 + 1*16]
  23. movdqa xmm2, [r10 + 2*16]
  24. movdqa xmm3, [r10 + 3*16]
  25. pshufd xmm4, xmm0, 0*64+0*16+0*4+0          ; replicate dword 0 across the lane
  26. movdqa [rsp + (0*4+0)*16 + 256], xmm4
  27. pshufd xmm4, xmm0, 1*64+1*16+1*4+1
  28. movdqa [rsp + (0*4+1)*16 + 256], xmm4
  29. pshufd xmm4, xmm0, 2*64+2*16+2*4+2
  30. movdqa [rsp + (0*4+2)*16 + 256], xmm4
  31. pshufd xmm4, xmm0, 3*64+3*16+3*4+3
  32. movdqa [rsp + (0*4+3)*16 + 256], xmm4
  33. pshufd xmm4, xmm1, 0*64+0*16+0*4+0
  34. movdqa [rsp + (1*4+0)*16 + 256], xmm4
  35. pshufd xmm4, xmm1, 2*64+2*16+2*4+2          ; slot 5 (1*4+1) skipped: counter word, set at label1
  36. movdqa [rsp + (1*4+2)*16 + 256], xmm4
  37. pshufd xmm4, xmm1, 3*64+3*16+3*4+3
  38. movdqa [rsp + (1*4+3)*16 + 256], xmm4
  39. pshufd xmm4, xmm2, 1*64+1*16+1*4+1          ; slot 8 (2*4+0) skipped: counter word, set at label1
  40. movdqa [rsp + (2*4+1)*16 + 256], xmm4
  41. pshufd xmm4, xmm2, 2*64+2*16+2*4+2
  42. movdqa [rsp + (2*4+2)*16 + 256], xmm4
  43. pshufd xmm4, xmm2, 3*64+3*16+3*4+3
  44. movdqa [rsp + (2*4+3)*16 + 256], xmm4
  45. pshufd xmm4, xmm3, 0*64+0*16+0*4+0
  46. movdqa [rsp + (3*4+0)*16 + 256], xmm4
  47. pshufd xmm4, xmm3, 1*64+1*16+1*4+1
  48. movdqa [rsp + (3*4+1)*16 + 256], xmm4
  49. pshufd xmm4, xmm3, 2*64+2*16+2*4+2
  50. movdqa [rsp + (3*4+2)*16 + 256], xmm4
  51. pshufd xmm4, xmm3, 3*64+3*16+3*4+3
  52. movdqa [rsp + (3*4+3)*16 + 256], xmm4
  53. label1:
; Fill the four counter lanes with n, n+1, n+2, n+3 (state word 8 = low dword,
; state word 5 = high dword, carry propagated with adc), then store n+4 back.
  54. mov eax, dword ptr [r10 + 8*4]
  55. mov r11d, dword ptr [r10 + 5*4]
  56. mov dword ptr [rsp + 8*16 + 0*4 + 256], eax
  57. mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d
  58. add eax, 1
  59. adc r11d, 0
  60. mov dword ptr [rsp + 8*16 + 1*4 + 256], eax
  61. mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d
  62. add eax, 1
  63. adc r11d, 0
  64. mov dword ptr [rsp + 8*16 + 2*4 + 256], eax
  65. mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d
  66. add eax, 1
  67. adc r11d, 0
  68. mov dword ptr [rsp + 8*16 + 3*4 + 256], eax
  69. mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d
  70. add eax, 1
  71. adc r11d, 0
  72. mov dword ptr [r10 + 8*4], eax              ; advance the counter in the caller's state by 4
  73. mov dword ptr [r10 + 5*4], r11d
; First double-round of the 4-way core, reading lanes from the broadcast bank
; at +256 and writing results into the working bank at +0. Each rotate-left by
; k is done as (x << k) xor (x >> 32-k), four lanes at a time.
  74. movdqa xmm0, [rsp + 12*16 + 1*256]
  75. movdqa xmm4, [rsp + 13*16 + 1*256]
  76. movdqa xmm8, [rsp + 14*16 + 1*256]
  77. movdqa xmm12, [rsp + 15*16 + 1*256]
  78. movdqa xmm2, [rsp + 0*16 + 1*256]
  79. movdqa xmm6, [rsp + 1*16 + 1*256]
  80. movdqa xmm10, [rsp + 2*16 + 1*256]
  81. movdqa xmm14, [rsp + 3*16 + 1*256]
  82. paddd xmm0, xmm2
  83. paddd xmm4, xmm6
  84. paddd xmm8, xmm10
  85. paddd xmm12, xmm14
  86. movdqa xmm1, xmm0
  87. movdqa xmm5, xmm4
  88. movdqa xmm9, xmm8
  89. movdqa xmm13, xmm12
  90. pslld xmm0, 7                               ; rotl 7
  91. pslld xmm4, 7
  92. pslld xmm8, 7
  93. pslld xmm12, 7
  94. psrld xmm1, 32-7
  95. psrld xmm5, 32-7
  96. psrld xmm9, 32-7
  97. psrld xmm13, 32-7
  98. pxor xmm0, [rsp + 4*16 + 1*256]
  99. pxor xmm4, [rsp + 5*16 + 1*256]
  100. pxor xmm8, [rsp + 6*16 + 1*256]
  101. pxor xmm12, [rsp + 7*16 + 1*256]
  102. pxor xmm0, xmm1
  103. pxor xmm4, xmm5
  104. pxor xmm8, xmm9
  105. pxor xmm12, xmm13
  106. movdqa [rsp + 4*16], xmm0
  107. movdqa [rsp + 5*16], xmm4
  108. movdqa [rsp + 6*16], xmm8
  109. movdqa [rsp + 7*16], xmm12
  110. movdqa xmm1, xmm0
  111. movdqa xmm5, xmm4
  112. movdqa xmm9, xmm8
  113. movdqa xmm13, xmm12
  114. paddd xmm0, xmm2
  115. paddd xmm4, xmm6
  116. paddd xmm8, xmm10
  117. paddd xmm12, xmm14
  118. movdqa xmm3, xmm0
  119. movdqa xmm7, xmm4
  120. movdqa xmm11, xmm8
  121. movdqa xmm15, xmm12
  122. pslld xmm0, 9                              ; rotl 9
  123. pslld xmm4, 9
  124. pslld xmm8, 9
  125. pslld xmm12, 9
  126. psrld xmm3, 32-9
  127. psrld xmm7, 32-9
  128. psrld xmm11, 32-9
  129. psrld xmm15, 32-9
  130. pxor xmm0, [rsp + 8*16 + 1*256]
  131. pxor xmm4, [rsp + 9*16 + 1*256]
  132. pxor xmm8, [rsp + 10*16 + 1*256]
  133. pxor xmm12, [rsp + 11*16 + 1*256]
  134. pxor xmm0, xmm3
  135. pxor xmm4, xmm7
  136. pxor xmm8, xmm11
  137. pxor xmm12, xmm15
  138. movdqa [rsp + 8*16], xmm0
  139. movdqa [rsp + 9*16], xmm4
  140. movdqa [rsp + 10*16], xmm8
  141. movdqa [rsp + 11*16], xmm12
  142. movdqa xmm3, xmm0
  143. movdqa xmm7, xmm4
  144. movdqa xmm11, xmm8
  145. movdqa xmm15, xmm12
  146. paddd xmm0, xmm1
  147. paddd xmm4, xmm5
  148. paddd xmm8, xmm9
  149. paddd xmm12, xmm13
  150. movdqa xmm1, xmm0
  151. movdqa xmm5, xmm4
  152. movdqa xmm9, xmm8
  153. movdqa xmm13, xmm12
  154. pslld xmm0, 13                             ; rotl 13
  155. pslld xmm4, 13
  156. pslld xmm8, 13
  157. pslld xmm12, 13
  158. psrld xmm1, 32-13
  159. psrld xmm5, 32-13
  160. psrld xmm9, 32-13
  161. psrld xmm13, 32-13
  162. pxor xmm0, [rsp + 12*16 + 1*256]
  163. pxor xmm4, [rsp + 13*16 + 1*256]
  164. pxor xmm8, [rsp + 14*16 + 1*256]
  165. pxor xmm12, [rsp + 15*16 + 1*256]
  166. pxor xmm0, xmm1
  167. pxor xmm4, xmm5
  168. pxor xmm8, xmm9
  169. pxor xmm12, xmm13
  170. movdqa [rsp + 12*16], xmm0
  171. movdqa [rsp + 13*16], xmm4
  172. movdqa [rsp + 14*16], xmm8
  173. movdqa [rsp + 15*16], xmm12
  174. paddd xmm0, xmm3
  175. paddd xmm4, xmm7
  176. paddd xmm8, xmm11
  177. paddd xmm12, xmm15
  178. movdqa xmm3, xmm0
  179. movdqa xmm7, xmm4
  180. movdqa xmm11, xmm8
  181. movdqa xmm15, xmm12
  182. pslld xmm0, 18                             ; rotl 18
  183. pslld xmm4, 18
  184. pslld xmm8, 18
  185. pslld xmm12, 18
  186. psrld xmm3, 32-18
  187. psrld xmm7, 32-18
  188. psrld xmm11, 32-18
  189. psrld xmm15, 32-18
  190. pxor xmm0, xmm2
  191. pxor xmm4, xmm6
  192. pxor xmm8, xmm10
  193. pxor xmm12, xmm14
  194. pxor xmm0, xmm3
  195. pxor xmm4, xmm7
  196. pxor xmm8, xmm11
  197. pxor xmm12, xmm15
  198. movdqa [rsp + 0*16], xmm0
  199. movdqa [rsp + 1*16], xmm4
  200. movdqa [rsp + 2*16], xmm8
  201. movdqa [rsp + 3*16], xmm12
  202. mov rax, r9                                ; rax = remaining rounds; decremented by 2 per double-round
  203. jmp label2
; Local helper (reached by call, returns with ret): transpose the 4x4 dword
; block held in xmm4/5/6/7 into four rows, xor with input at [rdx] when rdx != 0
; (aligned vs unaligned paths), and store to [rcx]. The four parallel blocks are
; interleaved in memory with stride 4*16; each call advances rdx/rcx by 1*16.
  204. labelSSE2_Salsa_Output:
  205. movdqa xmm0, xmm4
  206. punpckldq xmm4, xmm5
  207. movdqa xmm1, xmm6
  208. punpckldq xmm6, xmm7
  209. movdqa xmm2, xmm4
  210. punpcklqdq xmm4, xmm6
  211. punpckhqdq xmm2, xmm6
  212. punpckhdq xmm0, xmm5
  213. punpckhdq xmm1, xmm7
  214. movdqa xmm6, xmm0
  215. punpcklqdq xmm0, xmm1
  216. punpckhqdq xmm6, xmm1
  217. test rdx, rdx
  218. jz labelSSE2_Salsa_Output_A3               ; no input buffer: write raw keystream
  219. test rdx, 15
  220. jnz labelSSE2_Salsa_Output_A7              ; unaligned input: use movdqu loads
  221. pxor xmm4, [rdx+0*16]
  222. pxor xmm2, [rdx+4*16]
  223. pxor xmm0, [rdx+8*16]
  224. pxor xmm6, [rdx+12*16]
  225. add rdx, 1*16
  226. jmp labelSSE2_Salsa_Output_A3
  227. labelSSE2_Salsa_Output_A7:
  228. movdqu xmm1, [rdx+0*16]
  229. pxor xmm4, xmm1
  230. movdqu xmm1, [rdx+4*16]
  231. pxor xmm2, xmm1
  232. movdqu xmm1, [rdx+8*16]
  233. pxor xmm0, xmm1
  234. movdqu xmm1, [rdx+12*16]
  235. pxor xmm6, xmm1
  236. add rdx, 1*16
  237. labelSSE2_Salsa_Output_A3:
  238. test rcx, 15
  239. jnz labelSSE2_Salsa_Output_A8              ; unaligned output: use movdqu stores
  240. movdqa [rcx+0*16], xmm4
  241. movdqa [rcx+4*16], xmm2
  242. movdqa [rcx+8*16], xmm0
  243. movdqa [rcx+12*16], xmm6
  244. jmp labelSSE2_Salsa_Output_A9
  245. labelSSE2_Salsa_Output_A8:
  246. movdqu [rcx+0*16], xmm4
  247. movdqu [rcx+4*16], xmm2
  248. movdqu [rcx+8*16], xmm0
  249. movdqu [rcx+12*16], xmm6
  250. labelSSE2_Salsa_Output_A9:
  251. add rcx, 1*16
  252. ret
; Even double-round: same quarter-round schedule as above, but reading from the
; working bank (+0*256) instead of the broadcast bank.
  253. label6:
  254. movdqa xmm0, [rsp + 12*16 + 0*256]
  255. movdqa xmm4, [rsp + 13*16 + 0*256]
  256. movdqa xmm8, [rsp + 14*16 + 0*256]
  257. movdqa xmm12, [rsp + 15*16 + 0*256]
  258. movdqa xmm2, [rsp + 0*16 + 0*256]
  259. movdqa xmm6, [rsp + 1*16 + 0*256]
  260. movdqa xmm10, [rsp + 2*16 + 0*256]
  261. movdqa xmm14, [rsp + 3*16 + 0*256]
  262. paddd xmm0, xmm2
  263. paddd xmm4, xmm6
  264. paddd xmm8, xmm10
  265. paddd xmm12, xmm14
  266. movdqa xmm1, xmm0
  267. movdqa xmm5, xmm4
  268. movdqa xmm9, xmm8
  269. movdqa xmm13, xmm12
  270. pslld xmm0, 7
  271. pslld xmm4, 7
  272. pslld xmm8, 7
  273. pslld xmm12, 7
  274. psrld xmm1, 32-7
  275. psrld xmm5, 32-7
  276. psrld xmm9, 32-7
  277. psrld xmm13, 32-7
  278. pxor xmm0, [rsp + 4*16 + 0*256]
  279. pxor xmm4, [rsp + 5*16 + 0*256]
  280. pxor xmm8, [rsp + 6*16 + 0*256]
  281. pxor xmm12, [rsp + 7*16 + 0*256]
  282. pxor xmm0, xmm1
  283. pxor xmm4, xmm5
  284. pxor xmm8, xmm9
  285. pxor xmm12, xmm13
  286. movdqa [rsp + 4*16], xmm0
  287. movdqa [rsp + 5*16], xmm4
  288. movdqa [rsp + 6*16], xmm8
  289. movdqa [rsp + 7*16], xmm12
  290. movdqa xmm1, xmm0
  291. movdqa xmm5, xmm4
  292. movdqa xmm9, xmm8
  293. movdqa xmm13, xmm12
  294. paddd xmm0, xmm2
  295. paddd xmm4, xmm6
  296. paddd xmm8, xmm10
  297. paddd xmm12, xmm14
  298. movdqa xmm3, xmm0
  299. movdqa xmm7, xmm4
  300. movdqa xmm11, xmm8
  301. movdqa xmm15, xmm12
  302. pslld xmm0, 9
  303. pslld xmm4, 9
  304. pslld xmm8, 9
  305. pslld xmm12, 9
  306. psrld xmm3, 32-9
  307. psrld xmm7, 32-9
  308. psrld xmm11, 32-9
  309. psrld xmm15, 32-9
  310. pxor xmm0, [rsp + 8*16 + 0*256]
  311. pxor xmm4, [rsp + 9*16 + 0*256]
  312. pxor xmm8, [rsp + 10*16 + 0*256]
  313. pxor xmm12, [rsp + 11*16 + 0*256]
  314. pxor xmm0, xmm3
  315. pxor xmm4, xmm7
  316. pxor xmm8, xmm11
  317. pxor xmm12, xmm15
  318. movdqa [rsp + 8*16], xmm0
  319. movdqa [rsp + 9*16], xmm4
  320. movdqa [rsp + 10*16], xmm8
  321. movdqa [rsp + 11*16], xmm12
  322. movdqa xmm3, xmm0
  323. movdqa xmm7, xmm4
  324. movdqa xmm11, xmm8
  325. movdqa xmm15, xmm12
  326. paddd xmm0, xmm1
  327. paddd xmm4, xmm5
  328. paddd xmm8, xmm9
  329. paddd xmm12, xmm13
  330. movdqa xmm1, xmm0
  331. movdqa xmm5, xmm4
  332. movdqa xmm9, xmm8
  333. movdqa xmm13, xmm12
  334. pslld xmm0, 13
  335. pslld xmm4, 13
  336. pslld xmm8, 13
  337. pslld xmm12, 13
  338. psrld xmm1, 32-13
  339. psrld xmm5, 32-13
  340. psrld xmm9, 32-13
  341. psrld xmm13, 32-13
  342. pxor xmm0, [rsp + 12*16 + 0*256]
  343. pxor xmm4, [rsp + 13*16 + 0*256]
  344. pxor xmm8, [rsp + 14*16 + 0*256]
  345. pxor xmm12, [rsp + 15*16 + 0*256]
  346. pxor xmm0, xmm1
  347. pxor xmm4, xmm5
  348. pxor xmm8, xmm9
  349. pxor xmm12, xmm13
  350. movdqa [rsp + 12*16], xmm0
  351. movdqa [rsp + 13*16], xmm4
  352. movdqa [rsp + 14*16], xmm8
  353. movdqa [rsp + 15*16], xmm12
  354. paddd xmm0, xmm3
  355. paddd xmm4, xmm7
  356. paddd xmm8, xmm11
  357. paddd xmm12, xmm15
  358. movdqa xmm3, xmm0
  359. movdqa xmm7, xmm4
  360. movdqa xmm11, xmm8
  361. movdqa xmm15, xmm12
  362. pslld xmm0, 18
  363. pslld xmm4, 18
  364. pslld xmm8, 18
  365. pslld xmm12, 18
  366. psrld xmm3, 32-18
  367. psrld xmm7, 32-18
  368. psrld xmm11, 32-18
  369. psrld xmm15, 32-18
  370. pxor xmm0, xmm2
  371. pxor xmm4, xmm6
  372. pxor xmm8, xmm10
  373. pxor xmm12, xmm14
  374. pxor xmm0, xmm3
  375. pxor xmm4, xmm7
  376. pxor xmm8, xmm11
  377. pxor xmm12, xmm15
  378. movdqa [rsp + 0*16], xmm0
  379. movdqa [rsp + 1*16], xmm4
  380. movdqa [rsp + 2*16], xmm8
  381. movdqa [rsp + 3*16], xmm12
; Odd double-round: same structure, but the row/column indexing is permuted
; (7,4,5,6 / 13,14,15,12 / ...), which realizes the column-vs-row alternation
; of Salsa20 without physically transposing the lanes.
  382. label2:
  383. movdqa xmm0, [rsp + 7*16 + 0*256]
  384. movdqa xmm4, [rsp + 4*16 + 0*256]
  385. movdqa xmm8, [rsp + 5*16 + 0*256]
  386. movdqa xmm12, [rsp + 6*16 + 0*256]
  387. movdqa xmm2, [rsp + 0*16 + 0*256]
  388. movdqa xmm6, [rsp + 1*16 + 0*256]
  389. movdqa xmm10, [rsp + 2*16 + 0*256]
  390. movdqa xmm14, [rsp + 3*16 + 0*256]
  391. paddd xmm0, xmm2
  392. paddd xmm4, xmm6
  393. paddd xmm8, xmm10
  394. paddd xmm12, xmm14
  395. movdqa xmm1, xmm0
  396. movdqa xmm5, xmm4
  397. movdqa xmm9, xmm8
  398. movdqa xmm13, xmm12
  399. pslld xmm0, 7
  400. pslld xmm4, 7
  401. pslld xmm8, 7
  402. pslld xmm12, 7
  403. psrld xmm1, 32-7
  404. psrld xmm5, 32-7
  405. psrld xmm9, 32-7
  406. psrld xmm13, 32-7
  407. pxor xmm0, [rsp + 13*16 + 0*256]
  408. pxor xmm4, [rsp + 14*16 + 0*256]
  409. pxor xmm8, [rsp + 15*16 + 0*256]
  410. pxor xmm12, [rsp + 12*16 + 0*256]
  411. pxor xmm0, xmm1
  412. pxor xmm4, xmm5
  413. pxor xmm8, xmm9
  414. pxor xmm12, xmm13
  415. movdqa [rsp + 13*16], xmm0
  416. movdqa [rsp + 14*16], xmm4
  417. movdqa [rsp + 15*16], xmm8
  418. movdqa [rsp + 12*16], xmm12
  419. movdqa xmm1, xmm0
  420. movdqa xmm5, xmm4
  421. movdqa xmm9, xmm8
  422. movdqa xmm13, xmm12
  423. paddd xmm0, xmm2
  424. paddd xmm4, xmm6
  425. paddd xmm8, xmm10
  426. paddd xmm12, xmm14
  427. movdqa xmm3, xmm0
  428. movdqa xmm7, xmm4
  429. movdqa xmm11, xmm8
  430. movdqa xmm15, xmm12
  431. pslld xmm0, 9
  432. pslld xmm4, 9
  433. pslld xmm8, 9
  434. pslld xmm12, 9
  435. psrld xmm3, 32-9
  436. psrld xmm7, 32-9
  437. psrld xmm11, 32-9
  438. psrld xmm15, 32-9
  439. pxor xmm0, [rsp + 10*16 + 0*256]
  440. pxor xmm4, [rsp + 11*16 + 0*256]
  441. pxor xmm8, [rsp + 8*16 + 0*256]
  442. pxor xmm12, [rsp + 9*16 + 0*256]
  443. pxor xmm0, xmm3
  444. pxor xmm4, xmm7
  445. pxor xmm8, xmm11
  446. pxor xmm12, xmm15
  447. movdqa [rsp + 10*16], xmm0
  448. movdqa [rsp + 11*16], xmm4
  449. movdqa [rsp + 8*16], xmm8
  450. movdqa [rsp + 9*16], xmm12
  451. movdqa xmm3, xmm0
  452. movdqa xmm7, xmm4
  453. movdqa xmm11, xmm8
  454. movdqa xmm15, xmm12
  455. paddd xmm0, xmm1
  456. paddd xmm4, xmm5
  457. paddd xmm8, xmm9
  458. paddd xmm12, xmm13
  459. movdqa xmm1, xmm0
  460. movdqa xmm5, xmm4
  461. movdqa xmm9, xmm8
  462. movdqa xmm13, xmm12
  463. pslld xmm0, 13
  464. pslld xmm4, 13
  465. pslld xmm8, 13
  466. pslld xmm12, 13
  467. psrld xmm1, 32-13
  468. psrld xmm5, 32-13
  469. psrld xmm9, 32-13
  470. psrld xmm13, 32-13
  471. pxor xmm0, [rsp + 7*16 + 0*256]
  472. pxor xmm4, [rsp + 4*16 + 0*256]
  473. pxor xmm8, [rsp + 5*16 + 0*256]
  474. pxor xmm12, [rsp + 6*16 + 0*256]
  475. pxor xmm0, xmm1
  476. pxor xmm4, xmm5
  477. pxor xmm8, xmm9
  478. pxor xmm12, xmm13
  479. movdqa [rsp + 7*16], xmm0
  480. movdqa [rsp + 4*16], xmm4
  481. movdqa [rsp + 5*16], xmm8
  482. movdqa [rsp + 6*16], xmm12
  483. paddd xmm0, xmm3
  484. paddd xmm4, xmm7
  485. paddd xmm8, xmm11
  486. paddd xmm12, xmm15
  487. movdqa xmm3, xmm0
  488. movdqa xmm7, xmm4
  489. movdqa xmm11, xmm8
  490. movdqa xmm15, xmm12
  491. pslld xmm0, 18
  492. pslld xmm4, 18
  493. pslld xmm8, 18
  494. pslld xmm12, 18
  495. psrld xmm3, 32-18
  496. psrld xmm7, 32-18
  497. psrld xmm11, 32-18
  498. psrld xmm15, 32-18
  499. pxor xmm0, xmm2
  500. pxor xmm4, xmm6
  501. pxor xmm8, xmm10
  502. pxor xmm12, xmm14
  503. pxor xmm0, xmm3
  504. pxor xmm4, xmm7
  505. pxor xmm8, xmm11
  506. pxor xmm12, xmm15
  507. movdqa [rsp + 0*16], xmm0
  508. movdqa [rsp + 1*16], xmm4
  509. movdqa [rsp + 2*16], xmm8
  510. movdqa [rsp + 3*16], xmm12
  511. sub eax, 2                                 ; two rounds consumed per pass
  512. jnz label6
; Feed-forward: add the saved input state (broadcast bank at +256) to the
; round output, then emit the four blocks column by column via the helper.
  513. movdqa xmm4, [rsp + 0*16 + 256]
  514. paddd xmm4, [rsp + 0*16]
  515. movdqa xmm5, [rsp + 13*16 + 256]
  516. paddd xmm5, [rsp + 13*16]
  517. movdqa xmm6, [rsp + 10*16 + 256]
  518. paddd xmm6, [rsp + 10*16]
  519. movdqa xmm7, [rsp + 7*16 + 256]
  520. paddd xmm7, [rsp + 7*16]
  521. call labelSSE2_Salsa_Output
  522. movdqa xmm4, [rsp + 4*16 + 256]
  523. paddd xmm4, [rsp + 4*16]
  524. movdqa xmm5, [rsp + 1*16 + 256]
  525. paddd xmm5, [rsp + 1*16]
  526. movdqa xmm6, [rsp + 14*16 + 256]
  527. paddd xmm6, [rsp + 14*16]
  528. movdqa xmm7, [rsp + 11*16 + 256]
  529. paddd xmm7, [rsp + 11*16]
  530. call labelSSE2_Salsa_Output
  531. movdqa xmm4, [rsp + 8*16 + 256]
  532. paddd xmm4, [rsp + 8*16]
  533. movdqa xmm5, [rsp + 5*16 + 256]
  534. paddd xmm5, [rsp + 5*16]
  535. movdqa xmm6, [rsp + 2*16 + 256]
  536. paddd xmm6, [rsp + 2*16]
  537. movdqa xmm7, [rsp + 15*16 + 256]
  538. paddd xmm7, [rsp + 15*16]
  539. call labelSSE2_Salsa_Output
  540. movdqa xmm4, [rsp + 12*16 + 256]
  541. paddd xmm4, [rsp + 12*16]
  542. movdqa xmm5, [rsp + 9*16 + 256]
  543. paddd xmm5, [rsp + 9*16]
  544. movdqa xmm6, [rsp + 6*16 + 256]
  545. paddd xmm6, [rsp + 6*16]
  546. movdqa xmm7, [rsp + 3*16 + 256]
  547. paddd xmm7, [rsp + 3*16]
  548. call labelSSE2_Salsa_Output
; The four helper calls advanced rdx/rcx by 4*16; add 12*16 more so each
; pointer skips the full 16*16 = 256 bytes (four 64-byte blocks).
  549. test rdx, rdx
  550. jz label9                                  ; rdx stays 0 when there is no input buffer
  551. add rdx, 12*16
  552. label9:
  553. add rcx, 12*16
  554. sub r8, 4
  555. cmp r8, 4
  556. jge label1                                 ; 4 or more blocks left: another 4-way pass
; Tail path: process remaining blocks one at a time with the state held
; in xmm0-xmm3 as four rows.
  557. label5:
  558. sub r8, 1
  559. jl label4                                  ; no blocks left: epilogue
  560. movdqa xmm0, [r10 + 0*16]
  561. movdqa xmm1, [r10 + 1*16]
  562. movdqa xmm2, [r10 + 2*16]
  563. movdqa xmm3, [r10 + 3*16]
  564. mov rax, r9                                ; rax = round count
; One double-round per iteration; pshufd between half-rounds rotates the rows
; so the same quarter-round code serves both the column and row rounds.
  565. label0:
  566. movdqa xmm4, xmm3
  567. paddd xmm4, xmm0
  568. movdqa xmm5, xmm4
  569. pslld xmm4, 7
  570. psrld xmm5, 32-7
  571. pxor xmm1, xmm4
  572. pxor xmm1, xmm5
  573. movdqa xmm4, xmm0
  574. paddd xmm4, xmm1
  575. movdqa xmm5, xmm4
  576. pslld xmm4, 9
  577. psrld xmm5, 32-9
  578. pxor xmm2, xmm4
  579. pxor xmm2, xmm5
  580. movdqa xmm4, xmm1
  581. paddd xmm4, xmm2
  582. movdqa xmm5, xmm4
  583. pslld xmm4, 13
  584. psrld xmm5, 32-13
  585. pxor xmm3, xmm4
  586. pxor xmm3, xmm5
  587. movdqa xmm4, xmm2
  588. paddd xmm4, xmm3
  589. movdqa xmm5, xmm4
  590. pslld xmm4, 18
  591. psrld xmm5, 32-18
  592. pxor xmm0, xmm4
  593. pxor xmm0, xmm5
  594. pshufd xmm1, xmm1, 2*64+1*16+0*4+3         ; rotate dwords within rows for the second half-round
  595. pshufd xmm2, xmm2, 1*64+0*16+3*4+2
  596. pshufd xmm3, xmm3, 0*64+3*16+2*4+1
  597. movdqa xmm4, xmm1
  598. paddd xmm4, xmm0
  599. movdqa xmm5, xmm4
  600. pslld xmm4, 7
  601. psrld xmm5, 32-7
  602. pxor xmm3, xmm4
  603. pxor xmm3, xmm5
  604. movdqa xmm4, xmm0
  605. paddd xmm4, xmm3
  606. movdqa xmm5, xmm4
  607. pslld xmm4, 9
  608. psrld xmm5, 32-9
  609. pxor xmm2, xmm4
  610. pxor xmm2, xmm5
  611. movdqa xmm4, xmm3
  612. paddd xmm4, xmm2
  613. movdqa xmm5, xmm4
  614. pslld xmm4, 13
  615. psrld xmm5, 32-13
  616. pxor xmm1, xmm4
  617. pxor xmm1, xmm5
  618. movdqa xmm4, xmm2
  619. paddd xmm4, xmm1
  620. movdqa xmm5, xmm4
  621. pslld xmm4, 18
  622. psrld xmm5, 32-18
  623. pxor xmm0, xmm4
  624. pxor xmm0, xmm5
  625. pshufd xmm1, xmm1, 0*64+3*16+2*4+1         ; undo the rotation for the next iteration
  626. pshufd xmm2, xmm2, 1*64+0*16+3*4+2
  627. pshufd xmm3, xmm3, 2*64+1*16+0*4+3
  628. sub eax, 2
  629. jnz label0
; Feed-forward and 64-bit counter increment (state word 8 = low dword,
; word 5 = high dword, carry via adc).
  630. paddd xmm0, [r10 + 0*16]
  631. paddd xmm1, [r10 + 1*16]
  632. paddd xmm2, [r10 + 2*16]
  633. paddd xmm3, [r10 + 3*16]
  634. add dword ptr [r10 + 8*4], 1
  635. adc dword ptr [r10 + 5*4], 0
; Build dword-select masks and recombine the rows of xmm0-xmm3 so the state
; words land in output order: xmm6/xmm7 select alternating dwords, shufpd then
; swaps 64-bit halves between the merged registers.
  636. pcmpeqb xmm6, xmm6                         ; all-ones
  637. psrlq xmm6, 32                             ; mask = low dword of each qword
  638. pshufd xmm7, xmm6, 0*64+1*16+2*4+3         ; complementary mask = high dwords
  639. movdqa xmm4, xmm0
  640. movdqa xmm5, xmm3
  641. pand xmm0, xmm7
  642. pand xmm4, xmm6
  643. pand xmm3, xmm6
  644. pand xmm5, xmm7
  645. por xmm4, xmm5
  646. movdqa xmm5, xmm1
  647. pand xmm1, xmm7
  648. pand xmm5, xmm6
  649. por xmm0, xmm5
  650. pand xmm6, xmm2
  651. pand xmm2, xmm7
  652. por xmm1, xmm6
  653. por xmm2, xmm3
  654. movdqa xmm5, xmm4
  655. movdqa xmm6, xmm0
  656. shufpd xmm4, xmm1, 2
  657. shufpd xmm0, xmm2, 2
  658. shufpd xmm1, xmm5, 2
  659. shufpd xmm2, xmm6, 2
; xor with input when present (aligned/unaligned paths), store the 64-byte
; block, then loop back for the next remaining block.
  660. test rdx, rdx
  661. jz labelSSE2_Salsa_Output_B3
  662. test rdx, 15
  663. jnz labelSSE2_Salsa_Output_B7
  664. pxor xmm4, [rdx+0*16]
  665. pxor xmm0, [rdx+1*16]
  666. pxor xmm1, [rdx+2*16]
  667. pxor xmm2, [rdx+3*16]
  668. add rdx, 4*16
  669. jmp labelSSE2_Salsa_Output_B3
  670. labelSSE2_Salsa_Output_B7:
  671. movdqu xmm3, [rdx+0*16]
  672. pxor xmm4, xmm3
  673. movdqu xmm3, [rdx+1*16]
  674. pxor xmm0, xmm3
  675. movdqu xmm3, [rdx+2*16]
  676. pxor xmm1, xmm3
  677. movdqu xmm3, [rdx+3*16]
  678. pxor xmm2, xmm3
  679. add rdx, 4*16
  680. labelSSE2_Salsa_Output_B3:
  681. test rcx, 15
  682. jnz labelSSE2_Salsa_Output_B8
  683. movdqa [rcx+0*16], xmm4
  684. movdqa [rcx+1*16], xmm0
  685. movdqa [rcx+2*16], xmm1
  686. movdqa [rcx+3*16], xmm2
  687. jmp labelSSE2_Salsa_Output_B9
  688. labelSSE2_Salsa_Output_B8:
  689. movdqu [rcx+0*16], xmm4
  690. movdqu [rcx+1*16], xmm0
  691. movdqu [rcx+2*16], xmm1
  692. movdqu [rcx+3*16], xmm2
  693. labelSSE2_Salsa_Output_B9:
  694. add rcx, 4*16
  695. jmp label5
; Epilogue: restore callee-saved xmm6-xmm15, release the frame, return.
  696. label4:
  697. movdqa xmm6, [rsp + 0200h]
  698. movdqa xmm7, [rsp + 0210h]
  699. movdqa xmm8, [rsp + 0220h]
  700. movdqa xmm9, [rsp + 0230h]
  701. movdqa xmm10, [rsp + 0240h]
  702. movdqa xmm11, [rsp + 0250h]
  703. movdqa xmm12, [rsp + 0260h]
  704. movdqa xmm13, [rsp + 0270h]
  705. movdqa xmm14, [rsp + 0280h]
  706. movdqa xmm15, [rsp + 0290h]
  707. add rsp, 10*16 + 32*16 + 8
  708. ret
  709. Salsa20_OperateKeystream ENDP
  710. ALIGN 8
  711. Sosemanuk_OperateKeystream PROC FRAME
  712. rex_push_reg rsi
  713. push_reg rdi
  714. alloc_stack(80*4*2+12*4+8*8 + 2*16+8)
  715. save_xmm128 xmm6, 02f0h
  716. save_xmm128 xmm7, 0300h
  717. .endprolog
  718. mov rdi, r8
  719. mov rax, r9
  720. mov QWORD PTR [rsp+1*8], rdi
  721. mov QWORD PTR [rsp+2*8], rdx
  722. mov QWORD PTR [rsp+6*8], rax
  723. lea rcx, [4*rcx+rcx]
  724. lea rsi, [4*rcx]
  725. mov QWORD PTR [rsp+3*8], rsi
  726. movdqa xmm0, [rax+0*16]
  727. movdqa [rsp + 8*8+0*16], xmm0
  728. movdqa xmm0, [rax+1*16]
  729. movdqa [rsp + 8*8+1*16], xmm0
  730. movq xmm0, QWORD PTR [rax+2*16]
  731. movq QWORD PTR [rsp + 8*8+2*16], xmm0
  732. psrlq xmm0, 32
  733. movd r10d, xmm0
  734. mov ecx, [rax+10*4]
  735. mov edx, [rax+11*4]
  736. pcmpeqb xmm7, xmm7
  737. label2:
  738. lea rdi, [rsp + 8*8 + 12*4]
  739. mov rax, 80
  740. cmp rsi, 80
  741. cmovg rsi, rax
  742. mov QWORD PTR [rsp+7*8], rsi
  743. lea rsi, [rdi+rsi]
  744. mov QWORD PTR [rsp+4*8], rsi
  745. lea rsi, s_sosemanukMulTables
  746. label0:
  747. mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4]
  748. mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax
  749. rol eax, 8
  750. lea r11d, [r10d + edx]
  751. xor r11d, ecx
  752. mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d
  753. mov r11d, 1
  754. and r11d, edx
  755. neg r11d
  756. and r11d, r10d
  757. xor r10d, eax
  758. movzx eax, al
  759. xor r10d, [rsi+rax*4]
  760. mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4]
  761. xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4]
  762. add ecx, r11d
  763. movzx r11d, al
  764. shr eax, 8
  765. xor r10d, [rsi+1024+r11*4]
  766. xor r10d, eax
  767. imul edx, 54655307h
  768. rol edx, 7
  769. mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d
  770. mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4]
  771. mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax
  772. rol eax, 8
  773. lea r11d, [r10d + ecx]
  774. xor r11d, edx
  775. mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d
  776. mov r11d, 1
  777. and r11d, ecx
  778. neg r11d
  779. and r11d, r10d
  780. xor r10d, eax
  781. movzx eax, al
  782. xor r10d, [rsi+rax*4]
  783. mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4]
  784. xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4]
  785. add edx, r11d
  786. movzx r11d, al
  787. shr eax, 8
  788. xor r10d, [rsi+1024+r11*4]
  789. xor r10d, eax
  790. imul ecx, 54655307h
  791. rol ecx, 7
  792. mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d
  793. mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4]
  794. mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax
  795. rol eax, 8
  796. lea r11d, [r10d + edx]
  797. xor r11d, ecx
  798. mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d
  799. mov r11d, 1
  800. and r11d, edx
  801. neg r11d
  802. and r11d, r10d
  803. xor r10d, eax
  804. movzx eax, al
  805. xor r10d, [rsi+rax*4]
  806. mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4]
  807. xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4]
  808. add ecx, r11d
  809. movzx r11d, al
  810. shr eax, 8
  811. xor r10d, [rsi+1024+r11*4]
  812. xor r10d, eax
  813. imul edx, 54655307h
  814. rol edx, 7
  815. mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d
  816. mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4]
  817. mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax
  818. rol eax, 8
  819. lea r11d, [r10d + ecx]
  820. xor r11d, edx
  821. mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d
  822. mov r11d, 1
  823. and r11d, ecx
  824. neg r11d
  825. and r11d, r10d
  826. xor r10d, eax
  827. movzx eax, al
  828. xor r10d, [rsi+rax*4]
  829. mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4]
  830. xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4]
  831. add edx, r11d
  832. movzx r11d, al
  833. shr eax, 8
  834. xor r10d, [rsi+1024+r11*4]
  835. xor r10d, eax
  836. imul ecx, 54655307h
  837. rol ecx, 7
  838. mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d
  839. mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4]
  840. mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax
  841. rol eax, 8
  842. lea r11d, [r10d + edx]
  843. xor r11d, ecx
  844. mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d
  845. mov r11d, 1
  846. and r11d, edx
  847. neg r11d
  848. and r11d, r10d
  849. xor r10d, eax
  850. movzx eax, al
  851. xor r10d, [rsi+rax*4]
  852. mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4]
  853. xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4]
  854. add ecx, r11d
  855. movzx r11d, al
  856. shr eax, 8
  857. xor r10d, [rsi+1024+r11*4]
  858. xor r10d, eax
  859. imul edx, 54655307h
  860. rol edx, 7
  861. mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d
  862. mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4]
  863. mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax
  864. rol eax, 8
  865. lea r11d, [r10d + ecx]
  866. xor r11d, edx
  867. mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d
  868. mov r11d, 1
  869. and r11d, ecx
  870. neg r11d
  871. and r11d, r10d
  872. xor r10d, eax
  873. movzx eax, al
  874. xor r10d, [rsi+rax*4]
  875. mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4]
  876. xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4]
  877. add edx, r11d
  878. movzx r11d, al
  879. shr eax, 8
  880. xor r10d, [rsi+1024+r11*4]
  881. xor r10d, eax
  882. imul ecx, 54655307h
  883. rol ecx, 7
  884. mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d
  885. mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4]
  886. mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax
  887. rol eax, 8
  888. lea r11d, [r10d + edx]
  889. xor r11d, ecx
  890. mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d
  891. mov r11d, 1
  892. and r11d, edx
  893. neg r11d
  894. and r11d, r10d
  895. xor r10d, eax
  896. movzx eax, al
  897. xor r10d, [rsi+rax*4]
  898. mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4]
  899. xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4]
  900. add ecx, r11d
  901. movzx r11d, al
  902. shr eax, 8
  903. xor r10d, [rsi+1024+r11*4]
  904. xor r10d, eax
  905. imul edx, 54655307h
  906. rol edx, 7
  907. mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d
  908. mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4]
  909. mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax
  910. rol eax, 8
  911. lea r11d, [r10d + ecx]
  912. xor r11d, edx
  913. mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d
  914. mov r11d, 1
  915. and r11d, ecx
  916. neg r11d
  917. and r11d, r10d
  918. xor r10d, eax
  919. movzx eax, al
  920. xor r10d, [rsi+rax*4]
  921. mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4]
  922. xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4]
  923. add edx, r11d
  924. movzx r11d, al
  925. shr eax, 8
  926. xor r10d, [rsi+1024+r11*4]
  927. xor r10d, eax
  928. imul ecx, 54655307h
  929. rol ecx, 7
  930. mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d
  931. mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4]
  932. mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax
  933. rol eax, 8
  934. lea r11d, [r10d + edx]
  935. xor r11d, ecx
  936. mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d
  937. mov r11d, 1
  938. and r11d, edx
  939. neg r11d
  940. and r11d, r10d
  941. xor r10d, eax
  942. movzx eax, al
  943. xor r10d, [rsi+rax*4]
  944. mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4]
  945. xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4]
  946. add ecx, r11d
  947. movzx r11d, al
  948. shr eax, 8
  949. xor r10d, [rsi+1024+r11*4]
  950. xor r10d, eax
  951. imul edx, 54655307h
  952. rol edx, 7
  953. mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d
  954. mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4]
  955. mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax
  956. rol eax, 8
  957. lea r11d, [r10d + ecx]
  958. xor r11d, edx
  959. mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d
  960. mov r11d, 1
  961. and r11d, ecx
  962. neg r11d
  963. and r11d, r10d
  964. xor r10d, eax
  965. movzx eax, al
  966. xor r10d, [rsi+rax*4]
  967. mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4]
  968. xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4]
  969. add edx, r11d
  970. movzx r11d, al
  971. shr eax, 8
  972. xor r10d, [rsi+1024+r11*4]
  973. xor r10d, eax
  974. imul ecx, 54655307h
  975. rol ecx, 7
  976. mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d
  977. mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4]
  978. mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax
  979. rol eax, 8
  980. lea r11d, [r10d + edx]
  981. xor r11d, ecx
  982. mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d
  983. mov r11d, 1
  984. and r11d, edx
  985. neg r11d
  986. and r11d, r10d
  987. xor r10d, eax
  988. movzx eax, al
  989. xor r10d, [rsi+rax*4]
  990. mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4]
  991. xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4]
  992. add ecx, r11d
  993. movzx r11d, al
  994. shr eax, 8
  995. xor r10d, [rsi+1024+r11*4]
  996. xor r10d, eax
  997. imul edx, 54655307h
  998. rol edx, 7
  999. mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d
  1000. mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4]
  1001. mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax
  1002. rol eax, 8
  1003. lea r11d, [r10d + ecx]
  1004. xor r11d, edx
  1005. mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4], r11d
  1006. mov r11d, 1
  1007. and r11d, ecx
  1008. neg r11d
  1009. and r11d, r10d
  1010. xor r10d, eax
  1011. movzx eax, al
  1012. xor r10d, [rsi+rax*4]
  1013. mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4]
  1014. xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4]
  1015. add edx, r11d
  1016. movzx r11d, al
  1017. shr eax, 8
  1018. xor r10d, [rsi+1024+r11*4]
  1019. xor r10d, eax
  1020. imul ecx, 54655307h
  1021. rol ecx, 7
  1022. mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d
  1023. mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4]
  1024. mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax
  1025. rol eax, 8
  1026. lea r11d, [r10d + edx]
  1027. xor r11d, ecx
  1028. mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d
  1029. mov r11d, 1
  1030. and r11d, edx
  1031. neg r11d
  1032. and r11d, r10d
  1033. xor r10d, eax
  1034. movzx eax, al
  1035. xor r10d, [rsi+rax*4]
  1036. mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4]
  1037. xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4]
  1038. add ecx, r11d
  1039. movzx r11d, al
  1040. shr eax, 8
  1041. xor r10d, [rsi+1024+r11*4]
  1042. xor r10d, eax
  1043. imul edx, 54655307h
  1044. rol edx, 7
  1045. mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d
  1046. mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4]
  1047. mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax
  1048. rol eax, 8
  1049. lea r11d, [r10d + ecx]
  1050. xor r11d, edx
  1051. mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d
  1052. mov r11d, 1
  1053. and r11d, ecx
  1054. neg r11d
  1055. and r11d, r10d
  1056. xor r10d, eax
  1057. movzx eax, al
  1058. xor r10d, [rsi+rax*4]
  1059. mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4]
  1060. xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4]
  1061. add edx, r11d
  1062. movzx r11d, al
  1063. shr eax, 8
  1064. xor r10d, [rsi+1024+r11*4]
  1065. xor r10d, eax
  1066. imul ecx, 54655307h
  1067. rol ecx, 7
  1068. mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d
  1069. mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4]
  1070. mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax
  1071. rol eax, 8
  1072. lea r11d, [r10d + edx]
  1073. xor r11d, ecx
  1074. mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d
  1075. mov r11d, 1
  1076. and r11d, edx
  1077. neg r11d
  1078. and r11d, r10d
  1079. xor r10d, eax
  1080. movzx eax, al
  1081. xor r10d, [rsi+rax*4]
  1082. mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4]
  1083. xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4]
  1084. add ecx, r11d
  1085. movzx r11d, al
  1086. shr eax, 8
  1087. xor r10d, [rsi+1024+r11*4]
  1088. xor r10d, eax
  1089. imul edx, 54655307h
  1090. rol edx, 7
  1091. mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d
  1092. mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4]
  1093. mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax
  1094. rol eax, 8
  1095. lea r11d, [r10d + ecx]
  1096. xor r11d, edx
  1097. mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d
  1098. mov r11d, 1
  1099. and r11d, ecx
  1100. neg r11d
  1101. and r11d, r10d
  1102. xor r10d, eax
  1103. movzx eax, al
  1104. xor r10d, [rsi+rax*4]
  1105. mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4]
  1106. xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4]
  1107. add edx, r11d
  1108. movzx r11d, al
  1109. shr eax, 8
  1110. xor r10d, [rsi+1024+r11*4]
  1111. xor r10d, eax
  1112. imul ecx, 54655307h
  1113. rol ecx, 7
  1114. mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d
  1115. mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4]
  1116. mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax
  1117. rol eax, 8
  1118. lea r11d, [r10d + edx]
  1119. xor r11d, ecx
  1120. mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d
  1121. mov r11d, 1
  1122. and r11d, edx
  1123. neg r11d
  1124. and r11d, r10d
  1125. xor r10d, eax
  1126. movzx eax, al
  1127. xor r10d, [rsi+rax*4]
  1128. mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4]
  1129. xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4]
  1130. add ecx, r11d
  1131. movzx r11d, al
  1132. shr eax, 8
  1133. xor r10d, [rsi+1024+r11*4]
  1134. xor r10d, eax
  1135. imul edx, 54655307h
  1136. rol edx, 7
  1137. mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d
  1138. mov eax, [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4]
  1139. mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax
  1140. rol eax, 8
  1141. lea r11d, [r10d + ecx]
  1142. xor r11d, edx
  1143. mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d
  1144. mov r11d, 1
  1145. and r11d, ecx
  1146. neg r11d
  1147. and r11d, r10d
  1148. xor r10d, eax
  1149. movzx eax, al
  1150. xor r10d, [rsi+rax*4]
  1151. mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4]
  1152. xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4]
  1153. add edx, r11d
  1154. movzx r11d, al
  1155. shr eax, 8
  1156. xor r10d, [rsi+1024+r11*4]
  1157. xor r10d, eax
  1158. imul ecx, 54655307h
  1159. rol ecx, 7
  1160. mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d
  1161. mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4]
  1162. mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax
  1163. rol eax, 8
  1164. lea r11d, [r10d + edx]
  1165. xor r11d, ecx
  1166. mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d
  1167. mov r11d, 1
  1168. and r11d, edx
  1169. neg r11d
  1170. and r11d, r10d
  1171. xor r10d, eax
  1172. movzx eax, al
  1173. xor r10d, [rsi+rax*4]
  1174. mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4]
  1175. xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4]
  1176. add ecx, r11d
  1177. movzx r11d, al
  1178. shr eax, 8
  1179. xor r10d, [rsi+1024+r11*4]
  1180. xor r10d, eax
  1181. imul edx, 54655307h
  1182. rol edx, 7
  1183. mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d
  1184. mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4]
  1185. mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax
  1186. rol eax, 8
  1187. lea r11d, [r10d + ecx]
  1188. xor r11d, edx
  1189. mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d
  1190. mov r11d, 1
  1191. and r11d, ecx
  1192. neg r11d
  1193. and r11d, r10d
  1194. xor r10d, eax
  1195. movzx eax, al
  1196. xor r10d, [rsi+rax*4]
  1197. mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4]
  1198. xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4]
  1199. add edx, r11d
  1200. movzx r11d, al
  1201. shr eax, 8
  1202. xor r10d, [rsi+1024+r11*4]
  1203. xor r10d, eax
  1204. imul ecx, 54655307h
  1205. rol ecx, 7
  1206. mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d
  1207. add rdi, 5*4
  1208. cmp rdi, QWORD PTR [rsp+4*8]
  1209. jne label0
  1210. mov rax, QWORD PTR [rsp+2*8]
  1211. mov r11, QWORD PTR [rsp+1*8]
  1212. lea rdi, [rsp + 8*8 + 12*4]
  1213. mov rsi, QWORD PTR [rsp+7*8]
  1214. label1:
  1215. movdqa xmm0, [rdi+0*20*4]
  1216. movdqa xmm2, [rdi+2*20*4]
  1217. movdqa xmm3, [rdi+3*20*4]
  1218. movdqa xmm1, [rdi+1*20*4]
  1219. movdqa xmm4, xmm0
  1220. pand xmm0, xmm2
  1221. pxor xmm0, xmm3
  1222. pxor xmm2, xmm1
  1223. pxor xmm2, xmm0
  1224. por xmm3, xmm4
  1225. pxor xmm3, xmm1
  1226. pxor xmm4, xmm2
  1227. movdqa xmm1, xmm3
  1228. por xmm3, xmm4
  1229. pxor xmm3, xmm0
  1230. pand xmm0, xmm1
  1231. pxor xmm4, xmm0
  1232. pxor xmm1, xmm3
  1233. pxor xmm1, xmm4
  1234. pxor xmm4, xmm7
  1235. pxor xmm2, [rdi+80*4]
  1236. pxor xmm3, [rdi+80*5]
  1237. pxor xmm1, [rdi+80*6]
  1238. pxor xmm4, [rdi+80*7]
  1239. cmp rsi, 16
  1240. jl label4
  1241. movdqa xmm6, xmm2
  1242. punpckldq xmm2, xmm3
  1243. movdqa xmm5, xmm1
  1244. punpckldq xmm1, xmm4
  1245. movdqa xmm0, xmm2
  1246. punpcklqdq xmm2, xmm1
  1247. punpckhqdq xmm0, xmm1
  1248. punpckhdq xmm6, xmm3
  1249. punpckhdq xmm5, xmm4
  1250. movdqa xmm3, xmm6
  1251. punpcklqdq xmm6, xmm5
  1252. punpckhqdq xmm3, xmm5
  1253. test rax, rax
  1254. jz labelSSE2_Sosemanuk_Output3
  1255. test rax, 15
  1256. jnz labelSSE2_Sosemanuk_Output7
  1257. pxor xmm2, [rax+0*16]
  1258. pxor xmm0, [rax+1*16]
  1259. pxor xmm6, [rax+2*16]
  1260. pxor xmm3, [rax+3*16]
  1261. add rax, 4*16
  1262. jmp labelSSE2_Sosemanuk_Output3
  1263. labelSSE2_Sosemanuk_Output7:
  1264. movdqu xmm1, [rax+0*16]
  1265. pxor xmm2, xmm1
  1266. movdqu xmm1, [rax+1*16]
  1267. pxor xmm0, xmm1
  1268. movdqu xmm1, [rax+2*16]
  1269. pxor xmm6, xmm1
  1270. movdqu xmm1, [rax+3*16]
  1271. pxor xmm3, xmm1
  1272. add rax, 4*16
  1273. labelSSE2_Sosemanuk_Output3:
  1274. test r11, 15
  1275. jnz labelSSE2_Sosemanuk_Output8
  1276. movdqa [r11+0*16], xmm2
  1277. movdqa [r11+1*16], xmm0
  1278. movdqa [r11+2*16], xmm6
  1279. movdqa [r11+3*16], xmm3
  1280. jmp labelSSE2_Sosemanuk_Output9
  1281. labelSSE2_Sosemanuk_Output8:
  1282. movdqu [r11+0*16], xmm2
  1283. movdqu [r11+1*16], xmm0
  1284. movdqu [r11+2*16], xmm6
  1285. movdqu [r11+3*16], xmm3
  1286. labelSSE2_Sosemanuk_Output9:
  1287. add r11, 4*16
  1288. add rdi, 4*4
  1289. sub rsi, 16
  1290. jnz label1
  1291. mov rsi, QWORD PTR [rsp+3*8]
  1292. sub rsi, 80
  1293. jz label6
  1294. mov QWORD PTR [rsp+3*8], rsi
  1295. mov QWORD PTR [rsp+2*8], rax
  1296. mov QWORD PTR [rsp+1*8], r11
  1297. jmp label2
  1298. label4:
  1299. test rax, rax
  1300. jz label5
  1301. movd xmm0, dword ptr [rax+0*4]
  1302. pxor xmm2, xmm0
  1303. movd xmm0, dword ptr [rax+1*4]
  1304. pxor xmm3, xmm0
  1305. movd xmm0, dword ptr [rax+2*4]
  1306. pxor xmm1, xmm0
  1307. movd xmm0, dword ptr [rax+3*4]
  1308. pxor xmm4, xmm0
  1309. add rax, 16
  1310. label5:
  1311. movd dword ptr [r11+0*4], xmm2
  1312. movd dword ptr [r11+1*4], xmm3
  1313. movd dword ptr [r11+2*4], xmm1
  1314. movd dword ptr [r11+3*4], xmm4
  1315. sub rsi, 4
  1316. jz label6
  1317. add r11, 16
  1318. psrldq xmm2, 4
  1319. psrldq xmm3, 4
  1320. psrldq xmm1, 4
  1321. psrldq xmm4, 4
  1322. jmp label4
  1323. label6:
  1324. mov r10, QWORD PTR [rsp+6*8]
  1325. movdqa xmm0, [rsp + 8*8+0*16]
  1326. movdqa [r10+0*16], xmm0
  1327. movdqa xmm0, [rsp + 8*8+1*16]
  1328. movdqa [r10+1*16], xmm0
  1329. movq xmm0, QWORD PTR [rsp + 8*8+2*16]
  1330. movq QWORD PTR [r10+2*16], xmm0
  1331. mov [r10+10*4], ecx
  1332. mov [r10+11*4], edx
  1333. movdqa xmm6, [rsp + 02f0h]
  1334. movdqa xmm7, [rsp + 0300h]
  1335. add rsp, 80*4*2+12*4+8*8 + 2*16+8
  1336. pop rdi
  1337. pop rsi
  1338. ret
  1339. Sosemanuk_OperateKeystream ENDP
  1340. Panama_SSE2_Pull PROC FRAME
; -----------------------------------------------------------------------------
; Panama stream-cipher "Pull" (keystream) routine, SSE2, Microsoft x64 ABI.
; Each main-loop iteration produces 32 bytes of keystream and advances the
; cipher state by one round.
;   rcx = iteration count (scaled to bytes below; zero => state untouched)
;   rdx = cipher state block:
;           [rdx+0*16 .. rdx+4*16]  17 dwords of gamma/pi/theta state
;                                   (a16 kept in eax inside the loop)
;           [rdx+4*17]              dword circular-buffer cursor (read only here)
;           [rdx+20*4]              32-stage circular buffer, 32 bytes per stage
;   r8  = output pointer for keystream (may be 0: run state forward only)
;   r9  = optional input pointer XORed into the keystream (may be 0)
; NOTE(review): operand roles inferred from the offsets this code reads/writes;
; confirm against the C++ caller. The cursor at [rdx+4*17] is read but never
; stored back here — presumably the caller maintains it; verify.
; Clobbers: rax, rcx, r9, r10, r11, xmm0-xmm7, flags. Saves/restores the
; Win64 callee-saved regs it touches (rdi via push, xmm6/xmm7 via spill slots).
; -----------------------------------------------------------------------------
  1341. rex_push_reg rdi                 ; save callee-saved rdi (unwind-tracked)
  1342. alloc_stack(2*16)                ; 32 bytes of spill space for xmm6/xmm7
  1343. save_xmm128 xmm6, 0h
  1344. save_xmm128 xmm7, 10h
  1345. .endprolog
  1346. shl rcx, 5                       ; rcx = count * 32 bytes of keystream
  1347. jz label5                        ; count == 0: nothing to do
  1348. mov r10d, [rdx+4*17]             ; r10 = current circular-buffer cursor
  1349. add rcx, r10                     ; rcx = cursor value at which to stop
  1350. mov rdi, rcx                     ; rdi = loop-end sentinel
; Load state words a0..a15 into xmm0..xmm3 (4 dwords each), a16 into eax.
  1351. movdqa xmm0, xmmword ptr [rdx+0*16]
  1352. movdqa xmm1, xmmword ptr [rdx+1*16]
  1353. movdqa xmm2, xmmword ptr [rdx+2*16]
  1354. movdqa xmm3, xmmword ptr [rdx+3*16]
  1355. mov eax, dword ptr [rdx+4*16]
; ---- main loop: one Panama round + 32 keystream bytes per pass ----
  1356. label4:
; Build shifted lane windows: movss replaces only the low dword, then
; pshufd 39h rotates dwords (d0,d1,d2,d3) -> (d1,d2,d3,d0).
  1357. movdqa xmm6, xmm2
  1358. movss xmm6, xmm3                 ; xmm6 = {a12,a9,a10,a11}
  1359. pshufd xmm5, xmm6, 0*64+3*16+2*4+1   ; xmm5 = {a9,a10,a11,a12}
  1360. movd xmm6, eax                   ; xmm6.low = a16
  1361. movdqa xmm7, xmm3
  1362. movss xmm7, xmm6                 ; xmm7 = {a16,a13,a14,a15}
  1363. pshufd xmm6, xmm7, 0*64+3*16+2*4+1   ; xmm6 = {a13,a14,a15,a16}
; Scalar gamma step for the a16 lane: eax ^= (a12 | ~a8).
  1364. movd ecx, xmm2                   ; ecx = a8
  1365. not ecx
  1366. movd r11d, xmm3                  ; r11d = a12
  1367. or ecx, r11d
  1368. xor eax, ecx
; Vector gamma: xmm7 = xmm3 ^ (xmm2 | ~xmm1); the pcmpeqb/pxor pair is a
; bitwise NOT (pcmpeqb reg,reg produces all-ones).
  1369. pcmpeqb xmm7, xmm7
  1370. pxor xmm7, xmm1
  1371. por xmm7, xmm2
  1372. pxor xmm7, xmm3
; Pi step for this group of four lanes: extract each dword of xmm7
; (pshuflw 4Eh brings dword1 to the bottom, punpckhqdq brings the high
; qword down), rotate it by the compile-time constant (j*(j+1)/2) MOD 32,
; and store it to the permuted state slot ((j*13+16) MOD 17), j = 5*i MOD 17.
  1373. movd ecx, xmm7
  1374. rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32))
  1375. mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1376. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1377. movd ecx, xmm7
  1378. rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32))
  1379. mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1380. punpckhqdq xmm7, xmm7
  1381. movd ecx, xmm7
  1382. rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32))
  1383. mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1384. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1385. movd ecx, xmm7
  1386. rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32))
  1387. mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx
; Second lane group: xmm7 = xmm2 ^ (xmm1 | ~xmm0), then same extract/rot/store.
  1388. pcmpeqb xmm7, xmm7
  1389. pxor xmm7, xmm0
  1390. por xmm7, xmm1
  1391. pxor xmm7, xmm2
  1392. movd ecx, xmm7
  1393. rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32))
  1394. mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1395. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1396. movd ecx, xmm7
  1397. rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32))
  1398. mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1399. punpckhqdq xmm7, xmm7
  1400. movd ecx, xmm7
  1401. rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32))
  1402. mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1403. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1404. movd ecx, xmm7
  1405. rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32))
  1406. mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx
; Third lane group: xmm7 = xmm1 ^ (xmm0 | ~xmm6), xmm6 = shifted window {a13..a16}.
  1407. pcmpeqb xmm7, xmm7
  1408. pxor xmm7, xmm6
  1409. por xmm7, xmm0
  1410. pxor xmm7, xmm1
  1411. movd ecx, xmm7
  1412. rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32))
  1413. mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1414. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1415. movd ecx, xmm7
  1416. rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32))
  1417. mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1418. punpckhqdq xmm7, xmm7
  1419. movd ecx, xmm7
  1420. rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32))
  1421. mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1422. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1423. movd ecx, xmm7
  1424. rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32))
  1425. mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx
; Fourth lane group: xmm7 = xmm0 ^ (xmm6 | ~xmm5), xmm5 = shifted window {a9..a12}.
  1426. pcmpeqb xmm7, xmm7
  1427. pxor xmm7, xmm5
  1428. por xmm7, xmm6
  1429. pxor xmm7, xmm0
  1430. movd ecx, xmm7
  1431. rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32))
  1432. mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1433. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1434. movd ecx, xmm7
  1435. rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32))
  1436. mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1437. punpckhqdq xmm7, xmm7
  1438. movd ecx, xmm7
  1439. rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32))
  1440. mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx
  1441. pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
  1442. movd ecx, xmm7
  1443. rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32))
  1444. mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx
; Interleave the pre-round state words into output order (partial transpose).
  1445. movdqa xmm4, xmm3
  1446. punpcklqdq xmm3, xmm2
  1447. punpckhdq xmm4, xmm2
  1448. movdqa xmm2, xmm1
  1449. punpcklqdq xmm1, xmm0
  1450. punpckhdq xmm2, xmm0
  1451. test r8, r8
  1452. jz label0                        ; no output buffer: skip keystream emit
  1453. movdqa xmm6, xmm4
  1454. punpcklqdq xmm4, xmm2            ; xmm4/xmm6 = the two 16-byte output blocks
  1455. punpckhqdq xmm6, xmm2
; Optionally XOR 32 bytes of input (r9) into the keystream; aligned vs
; unaligned paths chosen by the low 4 bits of r9. r9 == 0 means "no input"
; (note test r9,15 is 0 for a null pointer, so the null check follows it).
  1456. test r9, 15
  1457. jnz label2
  1458. test r9, r9
  1459. jz label1
  1460. pxor xmm4, [r9]                  ; aligned input XOR
  1461. pxor xmm6, [r9+16]
  1462. add r9, 32
  1463. jmp label1
  1464. label2:
  1465. movdqu xmm0, [r9]                ; unaligned input XOR
  1466. movdqu xmm2, [r9+16]
  1467. pxor xmm4, xmm0
  1468. pxor xmm6, xmm2
  1469. add r9, 32
  1470. label1:
; Store 32 keystream bytes to r8, aligned or unaligned as appropriate.
  1471. test r8, 15
  1472. jnz label3
  1473. movdqa xmmword ptr [r8], xmm4
  1474. movdqa xmmword ptr [r8+16], xmm6
  1475. add r8, 32
  1476. jmp label0
  1477. label3:
  1478. movdqu xmmword ptr [r8], xmm4
  1479. movdqu xmmword ptr [r8+16], xmm6
  1480. add r8, 32
  1481. label0:
; Circular-buffer (lambda) update. Stages are 32 bytes; "and 31*32" reduces
; a byte offset modulo the 32-stage buffer starting at [rdx+20*4].
; rcx -> stage (cursor+1) mod 32, r11 -> stage (cursor+8) mod 32.
  1482. lea rcx, [r10 + 32]
  1483. and rcx, 31*32
  1484. lea r11, [r10 + (32-24)*32]
  1485. and r11, 31*32
  1486. movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8]
  1487. pxor xmm3, xmm0                  ; fold buffer tap into state half
  1488. pshufd xmm0, xmm0, 2*64+3*16+0*4+1   ; B1h: swap dwords within each qword
  1489. movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3
  1490. pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8]
  1491. movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0
  1492. movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8]
  1493. pxor xmm1, xmm4
  1494. movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1
  1495. pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8]
  1496. movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4
; Reload the state just written (permuted) by the pi stores above.
  1497. movdqa xmm3, xmmword ptr [rdx+3*16]
  1498. movdqa xmm2, xmmword ptr [rdx+2*16]
  1499. movdqa xmm1, xmmword ptr [rdx+1*16]
  1500. movdqa xmm0, xmmword ptr [rdx+0*16]
; Theta: build the four shifted one-lane windows again (movss + pshufd 39h) ...
  1501. movd xmm6, eax
  1502. movdqa xmm7, xmm3
  1503. movss xmm7, xmm6
  1504. movdqa xmm6, xmm2
  1505. movss xmm6, xmm3
  1506. movdqa xmm5, xmm1
  1507. movss xmm5, xmm2
  1508. movdqa xmm4, xmm0
  1509. movss xmm4, xmm1
  1510. pshufd xmm7, xmm7, 0*64+3*16+2*4+1
  1511. pshufd xmm6, xmm6, 0*64+3*16+2*4+1
  1512. pshufd xmm5, xmm5, 0*64+3*16+2*4+1
  1513. pshufd xmm4, xmm4, 0*64+3*16+2*4+1
; ... inject the round constant into the a16 lane and XOR-diffuse across lanes:
; each state word becomes a XOR of itself with two shifted neighbours.
  1514. xor eax, 1                       ; constant injection (bit 0)
  1515. movd ecx, xmm0
  1516. xor eax, ecx
  1517. movd ecx, xmm3
  1518. xor eax, ecx
  1519. pxor xmm3, xmm2
  1520. pxor xmm2, xmm1
  1521. pxor xmm1, xmm0
  1522. pxor xmm0, xmm7
  1523. pxor xmm3, xmm7
  1524. pxor xmm2, xmm6
  1525. pxor xmm1, xmm5
  1526. pxor xmm0, xmm4
; Sigma: fold two more buffer taps into the state.
; rcx -> stage (cursor+28) mod 32, r11 -> stage (cursor+16) mod 32.
  1527. lea rcx, [r10 + (32-4)*32]
  1528. and rcx, 31*32
  1529. lea r11, [r10 + 16*32]
  1530. and r11, 31*32
  1531. movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16]
  1532. movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16]
  1533. movdqa xmm6, xmm4
  1534. punpcklqdq xmm4, xmm5            ; interleave the two taps' qwords
  1535. punpckhqdq xmm6, xmm5
  1536. pxor xmm3, xmm4
  1537. pxor xmm2, xmm6
  1538. movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16]
  1539. movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16]
  1540. movdqa xmm6, xmm4
  1541. punpcklqdq xmm4, xmm5
  1542. punpckhqdq xmm6, xmm5
  1543. pxor xmm1, xmm4
  1544. pxor xmm0, xmm6
  1545. add r10, 32                      ; advance cursor one stage
  1546. cmp r10, rdi
  1547. jne label4                       ; loop until requested count produced
; Write the final state back (a16 from eax, a0..a15 from xmm0..xmm3).
  1548. mov [rdx+4*16], eax
  1549. movdqa xmmword ptr [rdx+3*16], xmm3
  1550. movdqa xmmword ptr [rdx+2*16], xmm2
  1551. movdqa xmmword ptr [rdx+1*16], xmm1
  1552. movdqa xmmword ptr [rdx+0*16], xmm0
  1553. label5:
; Epilogue: restore callee-saved xmm6/xmm7 and rdi, release spill space.
  1554. movdqa xmm6, [rsp + 0h]
  1555. movdqa xmm7, [rsp + 10h]
  1556. add rsp, 2*16
  1557. pop rdi
  1558. ret
  1559. Panama_SSE2_Pull ENDP
  1560. _TEXT ENDS
  1561. END