Team Fortress 2 Source Code as on 22/4/2020
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

910 lines
24 KiB

  1. // vmac.cpp - written and placed in the public domain by Wei Dai
  2. // based on Ted Krovetz's public domain vmac.c and draft-krovetz-vmac-01.txt
  3. #include "pch.h"
  4. #include "config.h"
  5. #include "vmac.h"
  6. #include "cpu.h"
  7. #include "argnames.h"
  8. #include "secblock.h"
  9. #if CRYPTOPP_MSC_VERSION
  10. # pragma warning(disable: 4731)
  11. #endif
  12. NAMESPACE_BEGIN(CryptoPP)
  13. #if defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
  14. #include <intrin.h>
  15. #endif
  16. #define VMAC_BOOL_WORD128 (defined(CRYPTOPP_WORD128_AVAILABLE) && !defined(CRYPTOPP_X64_ASM_AVAILABLE))
#ifdef __BORLANDC__
#define const // Turbo C++ 2006 workaround
#endif
// Field constants and bit masks used throughout the VHASH/VMAC arithmetic.
static const word64 p64 = W64LIT(0xfffffffffffffeff); /* 2^64 - 257 prime */
static const word64 m62 = W64LIT(0x3fffffffffffffff); /* 62-bit mask */
static const word64 m63 = W64LIT(0x7fffffffffffffff); /* 63-bit mask */
static const word64 m64 = W64LIT(0xffffffffffffffff); /* 64-bit mask */
static const word64 mpoly = W64LIT(0x1fffffff1fffffff); /* Poly key mask */
#ifdef __BORLANDC__
#undef const
#endif
#if VMAC_BOOL_WORD128
#ifdef __powerpc__
// workaround GCC Bug 31690: ICE with const __uint128_t and C++ front-end
#define m126 ((word128(m62)<<64)|m64)
#else
static const word128 m126 = (word128(m62)<<64)|m64; /* 126-bit mask */
#endif
#endif
// Derives all VMAC subkeys (NH key, polynomial keys, L3 inner-product keys)
// from the user-supplied block-cipher key, then primes the pad cache by
// resynchronizing with the IV supplied in params.
// Throws InvalidArgument for an unsupported DigestSize or L1KeyLength.
// NOTE(review): the in[0]/in[15] indexing assumes a 16-byte cipher block
// (e.g. AES) -- confirm against the ciphers this class is instantiated with.
void VMAC_Base::UncheckedSetKey(const byte *userKey, unsigned int keylength, const NameValuePairs &params)
{
	// VMAC is defined only for 64-bit and 128-bit tags.
	int digestLength = params.GetIntValueWithDefault(Name::DigestSize(), DefaultDigestSize());
	if (digestLength != 8 && digestLength != 16)
		throw InvalidArgument("VMAC: DigestSize must be 8 or 16");
	m_is128 = digestLength == 16;

	// L1 key length is the NH block size in bytes; must be a positive
	// multiple of 128 (default 128).
	m_L1KeyLength = params.GetIntValueWithDefault(Name::L1KeyLength(), 128);
	if (m_L1KeyLength <= 0 || m_L1KeyLength % 128 != 0)
		throw InvalidArgument("VMAC: L1KeyLength must be a positive multiple of 128");

	AllocateBlocks();

	BlockCipher &cipher = AccessCipher();
	cipher.SetKey(userKey, keylength, params);
	const unsigned int blockSize = cipher.BlockSize();
	const unsigned int blockSizeInWords = blockSize / sizeof(word64);
	SecBlock<word64> out(blockSizeInWords);
	SecByteBlock in;
	in.CleanNew(blockSize);
	size_t i;

	/* Fill nh key */
	// Subkeys come from encrypting counter blocks whose first byte is a
	// domain-separation tag (0x80 / 0xC0 / 0xE0) and whose last byte counts up.
	in[0] = 0x80;
	cipher.AdvancedProcessBlocks(in, NULL, (byte *)m_nhKey(), m_nhKeySize()*sizeof(word64), cipher.BT_InBlockIsCounter);
	ConditionalByteReverse<word64>(BIG_ENDIAN_ORDER, m_nhKey(), m_nhKey(), m_nhKeySize()*sizeof(word64));

	/* Fill poly key */
	in[0] = 0xC0;
	in[15] = 0;
	for (i = 0; i <= (size_t)m_is128; i++)	// one poly key per 64-bit tag half
	{
		cipher.ProcessBlock(in, out.BytePtr());
		// mpoly (0x1fffffff1fffffff) clears the top three bits of each
		// 32-bit limb of the poly key.
		m_polyState()[i*4+2] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()) & mpoly;
		m_polyState()[i*4+3] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8) & mpoly;
		in[15]++;
	}

	/* Fill ip key */
	in[0] = 0xE0;
	in[15] = 0;
	word64 *l3Key = m_l3Key();
	for (i = 0; i <= (size_t)m_is128; i++)
		// Rejection-sample: regenerate until both words are < p64 = 2^64-257.
		do
		{
			cipher.ProcessBlock(in, out.BytePtr());
			l3Key[i*2+0] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr());
			l3Key[i*2+1] = GetWord<word64>(true, BIG_ENDIAN_ORDER, out.BytePtr()+8);
			in[15]++;
		} while ((l3Key[i*2+0] >= p64) || (l3Key[i*2+1] >= p64));

	m_padCached = false;
	size_t nonceLength;
	const byte *nonce = GetIVAndThrowIfInvalid(params, nonceLength);
	Resynchronize(nonce, (int)nonceLength);
}
  85. void VMAC_Base::GetNextIV(RandomNumberGenerator &rng, byte *IV)
  86. {
  87. SimpleKeyingInterface::GetNextIV(rng, IV);
  88. IV[0] &= 0x7f;
  89. }
// Installs a new nonce and computes the corresponding one-time pad block
// E_K(zero-padded nonce).  For 64-bit tags the low bit of the last nonce
// byte is masked off before encryption (it later selects which pad half is
// added in TruncatedFinal), so two nonces differing only in that bit share
// one cipher call; m_padCached tracks whether the stored pad is reusable.
void VMAC_Base::Resynchronize(const byte *nonce, int len)
{
	size_t length = ThrowIfInvalidIVLength(len);
	size_t s = IVSize();
	byte *storedNonce = m_nonce();
	if (m_is128)
	{
		// 128-bit tag: pad is simply the encryption of the padded nonce.
		memset(storedNonce, 0, s-length);
		memcpy(storedNonce+s-length, nonce, length);
		AccessCipher().ProcessBlock(storedNonce, m_pad());
	}
	else
	{
		// 64-bit tag: the cached pad is valid if the new nonce matches the
		// stored one everywhere except possibly the last bit, and the
		// stored nonce's zero-padding prefix is intact.
		if (m_padCached && (storedNonce[s-1] | 1) == (nonce[length-1] | 1))
		{
			m_padCached = VerifyBufsEqual(storedNonce+s-length, nonce, length-1);
			for (size_t i=0; m_padCached && i<s-length; i++)
				m_padCached = (storedNonce[i] == 0);
		}
		if (!m_padCached)
		{
			// Recompute the pad from the nonce with its last bit cleared.
			memset(storedNonce, 0, s-length);
			memcpy(storedNonce+s-length, nonce, length-1);
			storedNonce[s-1] = nonce[length-1] & 0xfe;
			AccessCipher().ProcessBlock(storedNonce, m_pad());
			m_padCached = true;
		}
		// Keep the true last byte (with its selector bit) for TruncatedFinal.
		storedNonce[s-1] = nonce[length-1];
	}
	m_isFirstBlock = true;
	Restart();
}
// Required by the iterated-hash interface but never used by VMAC: input is
// hashed through VHASH_Update instead.  Reaching this is a programming error.
void VMAC_Base::HashEndianCorrectedBlock(const word64 *data)
{
	CRYPTOPP_UNUSED(data);
	assert(false);
	throw NotImplemented("VMAC: HashEndianCorrectedBlock is not implemented");
}
// Preferred input alignment: 16 bytes when the SSE2/MASM assembly path will
// run, otherwise whatever the underlying block cipher prefers.
unsigned int VMAC_Base::OptimalDataAlignment() const
{
	return
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)) && !defined(CRYPTOPP_DISABLE_VMAC_ASM)
		HasSSE2() ? 16 :
#endif
		GetCipher().OptimalDataAlignment();
}
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || (CRYPTOPP_BOOL_X32 && !defined(CRYPTOPP_DISABLE_VMAC_ASM))))
#if CRYPTOPP_MSC_VERSION
# pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code
#endif
// MMX/SSE2 assembly implementation of one VHASH tag part.  tagPart selects
// the first (0) or second (1) 64-bit half of a 128-bit tag by offsetting the
// NH key pointer by 16 bytes and the poly state pointer by 32 bytes (see the
// setup code / GCC constraint list).  Register roles after setup:
//   ebx = word64s per L1 block (L1KeyLength/8), ecx = word64s remaining,
//   esi = data, edi = NH key, eax = poly state, dl = m_isFirstBlock flag.
// NH products are accumulated in mm5/mm6/mm7 plus stack temporaries; the
// poly step works on the 32-bit limbs aliased as a0..a3 / k0..k3 below.
void
#ifdef __GNUC__
__attribute__ ((noinline)) // Intel Compiler 9.1 workaround
#endif
VMAC_Base::VHASH_Update_SSE2(const word64 *data, size_t blocksRemainingInWord64, int tagPart)
{
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	word32 L1KeyLength = m_L1KeyLength;
	// All parameters are consumed only inside the inline assembly.
	CRYPTOPP_UNUSED(data); CRYPTOPP_UNUSED(tagPart); CRYPTOPP_UNUSED(L1KeyLength);
	CRYPTOPP_UNUSED(blocksRemainingInWord64);
#ifdef __GNUC__
	word32 temp;
	__asm__ __volatile__
	(
	// Save ebx into temp (restored at the end) and load L1KeyLength.
	AS2( mov %%ebx, %0)
	AS2( mov %1, %%ebx)
	INTEL_NOPREFIX
#else
#if _MSC_VER < 1300 || defined(__INTEL_COMPILER)
	char isFirstBlock = m_isFirstBlock;
	AS2( mov ebx, [L1KeyLength])
	AS2( mov dl, [isFirstBlock])
#else
	AS2( mov ecx, this)
	AS2( mov ebx, [ecx+m_L1KeyLength])
	AS2( mov dl, [ecx+m_isFirstBlock])
#endif
	// edi = nhK + tagPart*16 bytes; eax = polyS + tagPart*32 bytes.
	AS2( mov eax, tagPart)
	AS2( shl eax, 4)
	AS2( mov edi, nhK)
	AS2( add edi, eax)
	AS2( add eax, eax)
	AS2( add eax, polyS)
	AS2( mov esi, data)
	AS2( mov ecx, blocksRemainingInWord64)
#endif
	// ebx: L1KeyLength bytes -> word64 count.
	AS2( shr ebx, 3)
#if CRYPTOPP_BOOL_X32
	AS_PUSH_IF86( bp)
	AS2( sub esp, 24)
#else
	AS_PUSH_IF86( bp)
	AS2( sub esp, 12)
#endif
	// Outer loop (label 4): one iteration per L1 block (or final partial block).
	ASL(4)
	AS2( mov ebp, ebx)
	AS2( cmp ecx, ebx)
	AS2( cmovl ebp, ecx)	// ebp = min(remaining, words per block)
	AS2( sub ecx, ebp)
	AS2( lea ebp, [edi+8*ebp]) // end of nhK
	// First NH pair is peeled off to prime the accumulators.
	AS2( movq mm6, [esi])
	AS2( paddq mm6, [edi])
	AS2( movq mm5, [esi+8])
	AS2( paddq mm5, [edi+8])
	AS2( add esi, 16)
	AS2( add edi, 16)
	AS2( movq mm4, mm6)
	ASS( pshufw mm2, mm6, 1, 0, 3, 2)
	AS2( pmuludq mm6, mm5)
	ASS( pshufw mm3, mm5, 1, 0, 3, 2)
	AS2( pmuludq mm5, mm2)
	AS2( pmuludq mm2, mm3)
	AS2( pmuludq mm3, mm4)
	AS2( pxor mm7, mm7)
	AS2( movd [esp], mm6)
	AS2( psrlq mm6, 32)
#if CRYPTOPP_BOOL_X32
	AS2( movd [esp+8], mm5)
#else
	AS2( movd [esp+4], mm5)
#endif
	AS2( psrlq mm5, 32)
	AS2( cmp edi, ebp)
	ASJ( je, 1, f)
	// Inner NH loop (label 0): multiply-accumulate (data+key) word pairs.
	ASL(0)
	AS2( movq mm0, [esi])
	AS2( paddq mm0, [edi])
	AS2( movq mm1, [esi+8])
	AS2( paddq mm1, [edi+8])
	AS2( add esi, 16)
	AS2( add edi, 16)
	AS2( movq mm4, mm0)
	AS2( paddq mm5, mm2)
	ASS( pshufw mm2, mm0, 1, 0, 3, 2)
	AS2( pmuludq mm0, mm1)
#if CRYPTOPP_BOOL_X32
	AS2( movd [esp+16], mm3)
#else
	AS2( movd [esp+8], mm3)
#endif
	AS2( psrlq mm3, 32)
	AS2( paddq mm5, mm3)
	ASS( pshufw mm3, mm1, 1, 0, 3, 2)
	AS2( pmuludq mm1, mm2)
	AS2( pmuludq mm2, mm3)
	AS2( pmuludq mm3, mm4)
	AS2( movd mm4, [esp])
	AS2( paddq mm7, mm4)
#if CRYPTOPP_BOOL_X32
	AS2( movd mm4, [esp+8])
	AS2( paddq mm6, mm4)
	AS2( movd mm4, [esp+16])
#else
	AS2( movd mm4, [esp+4])
	AS2( paddq mm6, mm4)
	AS2( movd mm4, [esp+8])
#endif
	AS2( paddq mm6, mm4)
	AS2( movd [esp], mm0)
	AS2( psrlq mm0, 32)
	AS2( paddq mm6, mm0)
#if CRYPTOPP_BOOL_X32
	AS2( movd [esp+8], mm1)
#else
	AS2( movd [esp+4], mm1)
#endif
	AS2( psrlq mm1, 32)
	AS2( paddq mm5, mm1)
	AS2( cmp edi, ebp)
	ASJ( jne, 0, b)
	// Label 1: block done -- fold the deferred partial products and carries.
	ASL(1)
	AS2( paddq mm5, mm2)
#if CRYPTOPP_BOOL_X32
	AS2( movd [esp+16], mm3)
#else
	AS2( movd [esp+8], mm3)
#endif
	AS2( psrlq mm3, 32)
	AS2( paddq mm5, mm3)
	AS2( movd mm4, [esp])
	AS2( paddq mm7, mm4)
#if CRYPTOPP_BOOL_X32
	AS2( movd mm4, [esp+8])
	AS2( paddq mm6, mm4)
	AS2( movd mm4, [esp+16])
#else
	AS2( movd mm4, [esp+4])
	AS2( paddq mm6, mm4)
	AS2( movd mm4, [esp+8])
#endif
	AS2( paddq mm6, mm4)
	AS2( lea ebp, [8*ebx])
	AS2( sub edi, ebp) // reset edi to start of nhK
	AS2( movd [esp], mm7)
	AS2( psrlq mm7, 32)
	AS2( paddq mm6, mm7)
#if CRYPTOPP_BOOL_X32
	AS2( movd [esp+8], mm6)
#else
	AS2( movd [esp+4], mm6)
#endif
	AS2( psrlq mm6, 32)
	AS2( paddq mm5, mm6)
	// Clear the top two bits of the high accumulator word (NH mod 2^126).
	AS2( psllq mm5, 2)
	AS2( psrlq mm5, 2)
// 32-bit limb aliases for the poly accumulator (a0..a3, aHi implicit in a2/a3)
// and poly key (k0..k3) within the poly state block at eax.
#define a0 [eax+2*4]
#define a1 [eax+3*4]
#define a2 [eax+0*4]
#define a3 [eax+1*4]
#define k0 [eax+2*8+2*4]
#define k1 [eax+2*8+3*4]
#define k2 [eax+2*8+0*4]
#define k3 [eax+2*8+1*4]
	// First block of the message (dl != 0): accumulator = NH + key, no multiply.
	AS2( test dl, dl)
	ASJ( jz, 2, f)
	AS2( movd mm1, k0)
	AS2( movd mm0, [esp])
	AS2( paddq mm0, mm1)
	AS2( movd a0, mm0)
	AS2( psrlq mm0, 32)
	AS2( movd mm1, k1)
#if CRYPTOPP_BOOL_X32
	AS2( movd mm2, [esp+8])
#else
	AS2( movd mm2, [esp+4])
#endif
	AS2( paddq mm1, mm2)
	AS2( paddq mm0, mm1)
	AS2( movd a1, mm0)
	AS2( psrlq mm0, 32)
	AS2( paddq mm5, k2)
	AS2( paddq mm0, mm5)
	AS2( movq a2, mm0)
	AS2( xor edx, edx)	// clear first-block flag for subsequent iterations
	ASJ( jmp, 3, f)
	// Label 2: poly step -- multiply the accumulator by the key limbs
	// (schoolbook 32x32 products, doubled cross terms for the mod-2^127-1
	// reduction) and add the NH result.
	ASL(2)
	AS2( movd mm0, a3)
	AS2( movq mm4, mm0)
	AS2( pmuludq mm0, k3) // a3*k3
	AS2( movd mm1, a0)
	AS2( pmuludq mm1, k2) // a0*k2
	AS2( movd mm2, a1)
	AS2( movd mm6, k1)
	AS2( pmuludq mm2, mm6) // a1*k1
	AS2( movd mm3, a2)
	AS2( psllq mm0, 1)
	AS2( paddq mm0, mm5)
	AS2( movq mm5, mm3)
	AS2( movd mm7, k0)
	AS2( pmuludq mm3, mm7) // a2*k0
	AS2( pmuludq mm4, mm7) // a3*k0
	AS2( pmuludq mm5, mm6) // a2*k1
	AS2( paddq mm0, mm1)
	AS2( movd mm1, a1)
	AS2( paddq mm4, mm5)
	AS2( movq mm5, mm1)
	AS2( pmuludq mm1, k2) // a1*k2
	AS2( paddq mm0, mm2)
	AS2( movd mm2, a0)
	AS2( paddq mm0, mm3)
	AS2( movq mm3, mm2)
	AS2( pmuludq mm2, k3) // a0*k3
	AS2( pmuludq mm3, mm7) // a0*k0
#if CRYPTOPP_BOOL_X32
	AS2( movd [esp+16], mm0)
#else
	AS2( movd [esp+8], mm0)
#endif
	AS2( psrlq mm0, 32)
	AS2( pmuludq mm7, mm5) // a1*k0
	AS2( pmuludq mm5, k3) // a1*k3
	AS2( paddq mm0, mm1)
	AS2( movd mm1, a2)
	AS2( pmuludq mm1, k2) // a2*k2
	AS2( paddq mm0, mm2)
	AS2( paddq mm0, mm4)
	AS2( movq mm4, mm0)
	AS2( movd mm2, a3)
	AS2( pmuludq mm2, mm6) // a3*k1
	AS2( pmuludq mm6, a0) // a0*k1
	AS2( psrlq mm0, 31)
	AS2( paddq mm0, mm3)
	AS2( movd mm3, [esp])
	AS2( paddq mm0, mm3)
	AS2( movd mm3, a2)
	AS2( pmuludq mm3, k3) // a2*k3
	AS2( paddq mm5, mm1)
	AS2( movd mm1, a3)
	AS2( pmuludq mm1, k2) // a3*k2
	AS2( paddq mm5, mm2)
#if CRYPTOPP_BOOL_X32
	AS2( movd mm2, [esp+8])
#else
	AS2( movd mm2, [esp+4])
#endif
	AS2( psllq mm5, 1)
	AS2( paddq mm0, mm5)
	AS2( psllq mm4, 33)
	AS2( movd a0, mm0)
	AS2( psrlq mm0, 32)
	AS2( paddq mm6, mm7)
#if CRYPTOPP_BOOL_X32
	AS2( movd mm7, [esp+16])
#else
	AS2( movd mm7, [esp+8])
#endif
	AS2( paddq mm0, mm6)
	AS2( paddq mm0, mm2)
	AS2( paddq mm3, mm1)
	AS2( psllq mm3, 1)
	AS2( paddq mm0, mm3)
	AS2( psrlq mm4, 1)
	AS2( movd a1, mm0)
	AS2( psrlq mm0, 32)
	AS2( por mm4, mm7)
	AS2( paddq mm0, mm4)
	AS2( movq a2, mm0)
#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
	// Label 3: continue with the next L1 block while input remains.
	ASL(3)
	AS2( test ecx, ecx)
	ASJ( jnz, 4, b)
#if CRYPTOPP_BOOL_X32
	AS2( add esp, 24)
#else
	AS2( add esp, 12)
#endif
	AS_POP_IF86( bp)
	AS1( emms)	// leave MMX state clean for subsequent FP code
#ifdef __GNUC__
	ATT_PREFIX
	// Restore the saved ebx.
	AS2( mov %0, %%ebx)
	: "=m" (temp)
	: "m" (L1KeyLength), "c" (blocksRemainingInWord64), "S" (data), "D" (nhK+tagPart*2), "d" (m_isFirstBlock), "a" (polyS+tagPart*4)
	: "memory", "cc"
	);
#endif
}
#endif
// Arithmetic primitives for the portable VHASH implementation, selected by
// platform capability:
//   DeclareNH(a)        -- declare the NH accumulator(s) named `a`
//   MUL64(rh,rl,i1,i2)  -- 64x64 -> 128-bit multiply into (rh,rl)
//   AccumulateNH(a,b,c) -- a += b*c (128-bit accumulate)
//   ADD128(rh,rl,ih,il) -- 128-bit add with carry propagation
#if VMAC_BOOL_WORD128
// Native 128-bit integers available: accumulate NH directly in a word128.
#define DeclareNH(a) word128 a=0
#define MUL64(rh,rl,i1,i2) {word128 p = word128(i1)*(i2); rh = word64(p>>64); rl = word64(p);}
#define AccumulateNH(a, b, c) a += word128(b)*(c)
#define Multiply128(r, i1, i2) r = word128(word64(i1)) * word64(i2)
#else
// 32x32 -> 64-bit multiply; __emulu keeps MSVC from emitting a full 64x64 mul.
#if _MSC_VER >= 1400 && !defined(__INTEL_COMPILER)
#define MUL32(a, b) __emulu(word32(a), word32(b))
#else
#define MUL32(a, b) ((word64)((word32)(a)) * (word32)(b))
#endif
#if defined(CRYPTOPP_X64_ASM_AVAILABLE)
// x86-64 GCC-style inline asm: accumulator is a 128-bit value in two word64s.
#define DeclareNH(a) word64 a##0=0, a##1=0
#define MUL64(rh,rl,i1,i2) asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "g"(i2) : "cc");
#define AccumulateNH(a, b, c) asm ("mulq %3; addq %%rax, %0; adcq %%rdx, %1" : "+r"(a##0), "+r"(a##1) : "a"(b), "g"(c) : "%rdx", "cc");
#define ADD128(rh,rl,ih,il) asm ("addq %3, %1; adcq %2, %0" : "+r"(rh),"+r"(rl) : "r"(ih),"r"(il) : "cc");
#elif defined(_MSC_VER) && !CRYPTOPP_BOOL_SLOW_WORD64
// 64-bit MSVC: use the _umul128 intrinsic.
#define DeclareNH(a) word64 a##0=0, a##1=0
#define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
#define AccumulateNH(a, b, c) {\
word64 ph, pl;\
pl = _umul128(b,c,&ph);\
a##0 += pl;\
a##1 += ph + (a##0 < pl);}
#else
// Pure 32-bit fallback: the accumulator is spread over three word64 limbs
// (a##0 = bits 0..31, a##1 = bits 32..63, a##2 = high) so per-product
// carries can be deferred.
#define VMAC_BOOL_32BIT 1
#define DeclareNH(a) word64 a##0=0, a##1=0, a##2=0
#define MUL64(rh,rl,i1,i2) \
{ word64 _i1 = (i1), _i2 = (i2); \
word64 m1= MUL32(_i1,_i2>>32); \
word64 m2= MUL32(_i1>>32,_i2); \
rh = MUL32(_i1>>32,_i2>>32); \
rl = MUL32(_i1,_i2); \
ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
}
#define AccumulateNH(a, b, c) {\
word64 p = MUL32(b, c);\
a##1 += word32((p)>>32);\
a##0 += word32(p);\
p = MUL32((b)>>32, c);\
a##2 += word32((p)>>32);\
a##1 += word32(p);\
p = MUL32((b)>>32, (c)>>32);\
a##2 += p;\
p = MUL32(b, (c)>>32);\
a##1 += word32(p);\
a##2 += word32(p>>32);}
#endif
#endif
#ifndef VMAC_BOOL_32BIT
#define VMAC_BOOL_32BIT 0
#endif
#ifndef ADD128
// Portable 128-bit add: (rh,rl) += (ih,il), carrying out of rl.
#define ADD128(rh,rl,ih,il) \
{ word64 _il = (il); \
(rl) += (_il); \
(rh) += (ih) + ((rl) < (_il)); \
}
#endif
#if !(defined(_MSC_VER) && _MSC_VER < 1300)
template <bool T_128BitTag>
#endif
// Portable VHASH update: each L1KeyLength-byte chunk of input is compressed
// with the NH hash (nhA, plus nhB against a key offset by two words when
// T_128BitTag), and every NH result is folded into a 127-bit polynomial hash.
// Three arithmetic flavors are compiled depending on the macros above:
// native word128 (a1/a2), 64-bit pairs (ah*/al*), or pure 32-bit limbs
// operating on polyS in place (VMAC_BOOL_32BIT).
void VMAC_Base::VHASH_Update_Template(const word64 *data, size_t blocksRemainingInWord64)
{
// One NH step over two input words: accumulate (d0+k)*(d1+k').
#define INNER_LOOP_ITERATION(j) {\
word64 d0 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+0]);\
word64 d1 = ConditionalByteReverse(LITTLE_ENDIAN_ORDER, data[i+2*j+1]);\
AccumulateNH(nhA, d0+nhK[i+2*j+0], d1+nhK[i+2*j+1]);\
if (T_128BitTag)\
AccumulateNH(nhB, d0+nhK[i+2*j+2], d1+nhK[i+2*j+3]);\
}
#if (defined(_MSC_VER) && _MSC_VER < 1300)
	// VC6 cannot handle the bool template parameter; use a runtime flag.
	bool T_128BitTag = m_is128;
#endif
	size_t L1KeyLengthInWord64 = m_L1KeyLength / 8;
	size_t innerLoopEnd = L1KeyLengthInWord64;
	const word64 *nhK = m_nhKey();
	word64 *polyS = m_polyState();
	bool isFirstBlock = true;
	size_t i;
#if !VMAC_BOOL_32BIT
#if VMAC_BOOL_WORD128
	word128 a1=0, a2=0;
#else
	word64 ah1=0, al1=0, ah2=0, al2=0;
#endif
	// Poly keys were stored at polyS[2..3] (and [6..7]) by UncheckedSetKey.
	word64 kh1, kl1, kh2, kl2;
	kh1=(polyS+0*4+2)[0]; kl1=(polyS+0*4+2)[1];
	if (T_128BitTag)
	{
		kh2=(polyS+1*4+2)[0]; kl2=(polyS+1*4+2)[1];
	}
#endif
	do
	{
		DeclareNH(nhA);
		DeclareNH(nhB);
		i = 0;
		if (blocksRemainingInWord64 < L1KeyLengthInWord64)
		{
			// Final partial L1 block: peel off the words beyond a multiple
			// of 8 first so the unrolled loop below stays 8-wide.
			if (blocksRemainingInWord64 % 8)
			{
				innerLoopEnd = blocksRemainingInWord64 % 8;
				for (; i<innerLoopEnd; i+=2)
					INNER_LOOP_ITERATION(0);
			}
			innerLoopEnd = blocksRemainingInWord64;
		}
		for (; i<innerLoopEnd; i+=8)
		{
			INNER_LOOP_ITERATION(0);
			INNER_LOOP_ITERATION(1);
			INNER_LOOP_ITERATION(2);
			INNER_LOOP_ITERATION(3);
		}
		blocksRemainingInWord64 -= innerLoopEnd;
		data += innerLoopEnd;
#if VMAC_BOOL_32BIT
		// Normalize the three-limb NH accumulators into 32/32/62-bit pieces.
		word32 nh0[2], nh1[2];
		word64 nh2[2];
		nh0[0] = word32(nhA0);
		nhA1 += (nhA0 >> 32);
		nh1[0] = word32(nhA1);
		nh2[0] = (nhA2 + (nhA1 >> 32)) & m62;
		if (T_128BitTag)
		{
			nh0[1] = word32(nhB0);
			nhB1 += (nhB0 >> 32);
			nh1[1] = word32(nhB1);
			nh2[1] = (nhB2 + (nhB1 >> 32)) & m62;
		}
// 32-bit views of the poly accumulator (a0..a3, aHi) and key (k0..k3, kHi)
// limbs inside polyS, adjusted for native byte order.
#define a0 (((word32 *)(polyS+i*4))[2+NativeByteOrder::ToEnum()])
#define a1 (*(((word32 *)(polyS+i*4))+3-NativeByteOrder::ToEnum())) // workaround for GCC 3.2
#define a2 (((word32 *)(polyS+i*4))[0+NativeByteOrder::ToEnum()])
#define a3 (*(((word32 *)(polyS+i*4))+1-NativeByteOrder::ToEnum()))
#define aHi ((polyS+i*4)[0])
#define k0 (((word32 *)(polyS+i*4+2))[2+NativeByteOrder::ToEnum()])
#define k1 (*(((word32 *)(polyS+i*4+2))+3-NativeByteOrder::ToEnum()))
#define k2 (((word32 *)(polyS+i*4+2))[0+NativeByteOrder::ToEnum()])
#define k3 (*(((word32 *)(polyS+i*4+2))+1-NativeByteOrder::ToEnum()))
#define kHi ((polyS+i*4+2)[0])
		if (isFirstBlock)
		{
			isFirstBlock = false;
			if (m_isFirstBlock)
			{
				m_isFirstBlock = false;
				// First block of the message: accumulator = NH + key.
				for (i=0; i<=(size_t)T_128BitTag; i++)
				{
					word64 t = (word64)nh0[i] + k0;
					a0 = (word32)t;
					t = (t >> 32) + nh1[i] + k1;
					a1 = (word32)t;
					aHi = (t >> 32) + nh2[i] + kHi;
				}
				continue;
			}
		}
		// Poly step in 32-bit limbs: a = a*k + NH, reduced mod 2^127-1
		// (the doubled k terms implement the wrap of bit 127).
		for (i=0; i<=(size_t)T_128BitTag; i++)
		{
			word64 p, t;
			word32 t2;
			p = MUL32(a3, 2*k3);
			p += nh2[i];
			p += MUL32(a0, k2);
			p += MUL32(a1, k1);
			p += MUL32(a2, k0);
			t2 = (word32)p;
			p >>= 32;
			p += MUL32(a0, k3);
			p += MUL32(a1, k2);
			p += MUL32(a2, k1);
			p += MUL32(a3, k0);
			t = (word64(word32(p) & 0x7fffffff) << 32) | t2;
			p >>= 31;
			p += nh0[i];
			p += MUL32(a0, k0);
			p += MUL32(a1, 2*k3);
			p += MUL32(a2, 2*k2);
			p += MUL32(a3, 2*k1);
			t2 = (word32)p;
			p >>= 32;
			p += nh1[i];
			p += MUL32(a0, k1);
			p += MUL32(a1, k0);
			p += MUL32(a2, 2*k3);
			p += MUL32(a3, 2*k2);
			a0 = t2;
			a1 = (word32)p;
			aHi = (p >> 32) + t;
		}
#undef a0
#undef a1
#undef a2
#undef a3
#undef aHi
#undef k0
#undef k1
#undef k2
#undef k3
#undef kHi
#else // #if VMAC_BOOL_32BIT
		if (isFirstBlock)
		{
			isFirstBlock = false;
			if (m_isFirstBlock)
			{
				m_isFirstBlock = false;
				// First block of the message: accumulator = (NH mod 2^126) + key.
#if VMAC_BOOL_WORD128
#define first_poly_step(a, kh, kl, m) a = (m & m126) + ((word128(kh) << 64) | kl)
				first_poly_step(a1, kh1, kl1, nhA);
				if (T_128BitTag)
					first_poly_step(a2, kh2, kl2, nhB);
#else
#define first_poly_step(ah, al, kh, kl, mh, ml) {\
mh &= m62;\
ADD128(mh, ml, kh, kl); \
ah = mh; al = ml;}
				first_poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
				if (T_128BitTag)
					first_poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
#endif
				continue;
			}
			else
			{
				// Resuming a previously started message: reload the
				// accumulators saved in polyS by the last call.
#if VMAC_BOOL_WORD128
				a1 = (word128((polyS+0*4)[0]) << 64) | (polyS+0*4)[1];
#else
				ah1=(polyS+0*4)[0]; al1=(polyS+0*4)[1];
#endif
				if (T_128BitTag)
				{
#if VMAC_BOOL_WORD128
					a2 = (word128((polyS+1*4)[0]) << 64) | (polyS+1*4)[1];
#else
					ah2=(polyS+1*4)[0]; al2=(polyS+1*4)[1];
#endif
				}
			}
		}
		// Poly step: a = a*k + (NH mod 2^126), arithmetic mod 2^127-1.
#if VMAC_BOOL_WORD128
#define poly_step(a, kh, kl, m) \
{ word128 t1, t2, t3, t4;\
Multiply128(t2, a>>64, kl);\
Multiply128(t3, a, kh);\
Multiply128(t1, a, kl);\
Multiply128(t4, a>>64, 2*kh);\
t2 += t3;\
t4 += t1;\
t2 += t4>>64;\
a = (word128(word64(t2)&m63) << 64) | word64(t4);\
t2 *= 2;\
a += m & m126;\
a += t2>>64;}
		poly_step(a1, kh1, kl1, nhA);
		if (T_128BitTag)
			poly_step(a2, kh2, kl2, nhB);
#else
#define poly_step(ah, al, kh, kl, mh, ml) \
{ word64 t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
/* compute ab*cd, put bd into result registers */ \
MUL64(t2h,t2l,ah,kl); \
MUL64(t3h,t3l,al,kh); \
MUL64(t1h,t1l,ah,2*kh); \
MUL64(ah,al,al,kl); \
/* add together ad + bc */ \
ADD128(t2h,t2l,t3h,t3l); \
/* add 2 * ac to result */ \
ADD128(ah,al,t1h,t1l); \
/* now (ah,al), (t2l,2*t2h) need summing */ \
/* first add the high registers, carrying into t2h */ \
ADD128(t2h,ah,z,t2l); \
/* double t2h and add top bit of ah */ \
t2h += t2h + (ah >> 63); \
ah &= m63; \
/* now add the low registers */ \
mh &= m62; \
ADD128(ah,al,mh,ml); \
ADD128(ah,al,z,t2h); \
}
		poly_step(ah1, al1, kh1, kl1, nhA1, nhA0);
		if (T_128BitTag)
			poly_step(ah2, al2, kh2, kl2, nhB1, nhB0);
#endif
#endif // #if VMAC_BOOL_32BIT
	} while (blocksRemainingInWord64);
	// Persist the poly accumulators back into the object's state.
#if VMAC_BOOL_WORD128
	(polyS+0*4)[0]=word64(a1>>64); (polyS+0*4)[1]=word64(a1);
	if (T_128BitTag)
	{
		(polyS+1*4)[0]=word64(a2>>64); (polyS+1*4)[1]=word64(a2);
	}
#elif !VMAC_BOOL_32BIT
	(polyS+0*4)[0]=ah1; (polyS+0*4)[1]=al1;
	if (T_128BitTag)
	{
		(polyS+1*4)[0]=ah2; (polyS+1*4)[1]=al2;
	}
#endif
}
// Dispatches a VHASH update to the SSE2 assembly routine when available,
// otherwise to the portable template implementation.
inline void VMAC_Base::VHASH_Update(const word64 *data, size_t blocksRemainingInWord64)
{
#if (CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE && (CRYPTOPP_BOOL_X86 || (CRYPTOPP_BOOL_X32 && !defined(CRYPTOPP_DISABLE_VMAC_ASM))))
	if (HasSSE2())
	{
		// The assembly routine processes one 64-bit tag part per call.
		VHASH_Update_SSE2(data, blocksRemainingInWord64, 0);
		if (m_is128)
			VHASH_Update_SSE2(data, blocksRemainingInWord64, 1);
		m_isFirstBlock = false;
	}
	else
#endif
	{
#if defined(_MSC_VER) && _MSC_VER < 1300
		// VC6: no bool template parameter, tag width is a runtime flag.
		VHASH_Update_Template(data, blocksRemainingInWord64);
#else
		if (m_is128)
			VHASH_Update_Template<true>(data, blocksRemainingInWord64);
		else
			VHASH_Update_Template<false>(data, blocksRemainingInWord64);
#endif
	}
}
  761. size_t VMAC_Base::HashMultipleBlocks(const word64 *data, size_t length)
  762. {
  763. size_t remaining = ModPowerOf2(length, m_L1KeyLength);
  764. VHASH_Update(data, (length-remaining)/8);
  765. return remaining;
  766. }
// L3 (inner-product) hash from the VMAC spec: fully reduces the 128-bit
// second-level hash state (input[0], input[1]) plus the block bit length
// modulo 2^127-1, splits the result modulo 2^64-2^32, offsets each part by
// the rejection-sampled l3Key words, and multiplies them modulo
// p64 = 2^64-257.  Returns the 64-bit result before the pad is added.
static word64 L3Hash(const word64 *input, const word64 *l3Key, size_t len)
{
	word64 rh, rl, t, z=0;
	word64 p1 = input[0], p2 = input[1];
	word64 k1 = l3Key[0], k2 = l3Key[1];

	/* fully reduce (p1,p2)+(len,0) mod p127 */
	t = p1 >> 63;
	p1 &= m63;
	ADD128(p1, p2, len, t);
	/* At this point, (p1,p2) is at most 2^127+(len<<64) */
	t = (p1 > m63) + ((p1 == m63) & (p2 == m64));
	ADD128(p1, p2, z, t);
	p1 &= m63;

	/* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
	t = p1 + (p2 >> 32);
	t += (t >> 32);
	t += (word32)t > 0xfffffffeU;
	p1 += (t >> 32);
	p2 += (p1 << 32);

	/* compute (p1+k1)%p64 and (p2+k2)%p64 */
	// The "& 257" terms conditionally subtract p64 (i.e. add 2^64-p64=257)
	// on wraparound, keeping each sum reduced mod p64.
	p1 += k1;
	p1 += (0 - (p1 < k1)) & 257;
	p2 += k2;
	p2 += (0 - (p2 < k2)) & 257;

	/* compute (p1+k1)*(p2+k2)%p64 */
	MUL64(rh, rl, p1, p2);
	t = rh >> 56;
	ADD128(t, rl, z, rh);
	rh <<= 8;
	ADD128(t, rl, z, rh);
	t += t << 8;
	rl += t;
	rl += (0 - (rl < t)) & 257;
	rl += (0 - (rl > p64-1)) & 257;
	return rl;
}
// Finalizes the MAC: hashes any buffered partial L1 block, runs L3Hash on
// each tag half, adds the nonce-derived pad, and writes `size` bytes of tag
// into mac (full 8/16-byte tags use PutWord; shorter tags copy a prefix of
// the big-endian-ordered value).
void VMAC_Base::TruncatedFinal(byte *mac, size_t size)
{
	// Bytes of the current (incomplete) L1 block still buffered in m_data.
	size_t len = ModPowerOf2(GetBitCountLo()/8, m_L1KeyLength);
	if (len)
	{
		// Zero-pad the remainder up to a multiple of 16 bytes and hash it.
		memset(m_data()+len, 0, (0-len)%16);
		VHASH_Update(DataBuf(), ((len+15)/16)*2);
		len *= 8; // convert to bits
	}
	else if (m_isFirstBlock)
	{
		// special case for empty string: the poly state is just the poly
		// key (stored at indices 2..3 / 6..7 by UncheckedSetKey).
		m_polyState()[0] = m_polyState()[2];
		m_polyState()[1] = m_polyState()[3];
		if (m_is128)
		{
			m_polyState()[4] = m_polyState()[6];
			m_polyState()[5] = m_polyState()[7];
		}
	}
	if (m_is128)
	{
		word64 t[2];
		t[0] = L3Hash(m_polyState(), m_l3Key(), len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad());
		t[1] = L3Hash(m_polyState()+4, m_l3Key()+2, len) + GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad()+8);
		if (size == 16)
		{
			PutWord(false, BIG_ENDIAN_ORDER, mac, t[0]);
			PutWord(false, BIG_ENDIAN_ORDER, mac+8, t[1]);
		}
		else
		{
			// Truncated tag: reorder to big-endian bytes, then copy a prefix.
			t[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[0]);
			t[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, t[1]);
			memcpy(mac, t, size);
		}
	}
	else
	{
		word64 t = L3Hash(m_polyState(), m_l3Key(), len);
		// The low bit of the stored nonce selects which pad half to add.
		t += GetWord<word64>(true, BIG_ENDIAN_ORDER, m_pad() + (m_nonce()[IVSize()-1]&1) * 8);
		if (size == 8)
			PutWord(false, BIG_ENDIAN_ORDER, mac, t);
		else
		{
			t = ConditionalByteReverse(BIG_ENDIAN_ORDER, t);
			memcpy(mac, &t, size);
		}
	}
}
  853. NAMESPACE_END