Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

193 lines
6.9 KiB

  1. // CaseMap.cpp -- Unicode case mapping routines for locale 0x0409
  2. #include "stdafx.h"
  3. /*
  4. The data constants and code below implement case mapping for the US English
  5. locale. The primary goal here was to make the size of code plus data as small
  6. as possible while keeping reasonable run-time speed.
  7. The easiest implementation would have been to simply define a 65,536 entry
  8. table of USHORT values. Then the case mapping would be just an array indexing
  9. operation. However the data size for upper and lower case mapping would be
  10. 256 K bytes. That is clearly unacceptable -- especially since there are only 736
  11. letter characters in the Unicode set.
  12. The next approach is to make a short list of the letter positions together
  13. with the corresponding case-mapped positions. That gives us a data size of
  14. 5,888 bytes (8 x 736). Then we also need code to binary-search the list of
  15. letter positions to see if a particular code point has a case mapping and,
  16. if so, to locate the corresponding mapped value.
  17. This is good, but we can do better by noticing that the case mapping is not
  18. random. Quite often a contiguous block of Unicode code points maps to positions
  19. with the same relative offset. In the Ascii section, for example the 26 lower
  20. case letters all map down by 0x20 positions, while the 26 upper case letters
  21. map up by 0x20 positions. In other areas of Unicode we find that quite often
  22. every other position uses the same relative offset.
  23. That observation together with some simple methods to pack information
  24. efficiently leads to the current implementation which uses 840 bytes of data
  25. and a look-up algorithm which is just a little bit more complicated than a
  26. binary search.
  27. We could probably make the data smaller still by using a more complicated
  28. data structure and a more complicated algorithm, but it isn't clear that the
  29. additional effort would be worthwhile. That is, the additional code space may
  30. well be larger than the data-space savings.
  31. */
  32. #include "CaseTab.h"
  33. // The code below does case mapping using a binary search to find the appropriate
  34. // code map block and then applying that block. For the case insensitive string
  35. // comparison, we keep the most recently used block around so that we can avoid
  36. // the binary search in many cases.
  37. static CodeMapBlock Find_0x0409_Map_Block(WCHAR wc,
  38. const CodeMapBlock *pCMB,
  39. UINT cBlocks
  40. )
  41. {
  42. UINT iBlockLow = 0;
  43. if (wc < pCMB[iBlockLow].iwcFirst)
  44. return UCMB_NULL.cmb; // Map block with zero entries based at zero offset.
  45. UINT iBlockHigh = cBlocks;
  46. for (;;)
  47. {
  48. UINT iBlockMid = (iBlockLow + iBlockHigh) >> 1;
  49. CodeMapBlock mblk = pCMB[iBlockMid];
  50. if (iBlockMid == iBlockLow)
  51. return mblk; // iBlockHigh must have been iBlockLow + 1.
  52. if (wc >= mblk.iwcFirst)
  53. iBlockLow = iBlockMid;
  54. else iBlockHigh = iBlockMid;
  55. }
  56. }
  57. static WCHAR Map_from_0x0409_Block(WCHAR wc,
  58. CodeMapBlock mblk,
  59. const short *paiDeltaValues
  60. )
  61. {
  62. UINT iBaseNew = mblk.iwcFirst;
  63. if (wc >= iBaseNew + mblk.cwcSpan)
  64. return wc;
  65. if (mblk.fGapIs2 && UINT(wc & 1) != (iBaseNew & 1))
  66. return wc;
  67. return wc + paiDeltaValues[mblk.iDelta];
  68. }
  69. static WCHAR Map_to_0x0409_Case(WCHAR wc,
  70. const short *paiDeltaValues,
  71. const CodeMapBlock *pCMB,
  72. UINT cBlocks
  73. )
  74. {
  75. return Map_from_0x0409_Block
  76. (wc,
  77. Find_0x0409_Map_Block(wc, pCMB, cBlocks),
  78. paiDeltaValues
  79. );
  80. }
  81. static WCHAR Map_to_0x0409_Lower_with_History(WCHAR wc,
  82. CodeMapBlock &mblkLower,
  83. CodeMapBlock &mblkUpper
  84. )
  85. {
  86. // This routine does a lower case mapping optimized for text which is mostly
  87. // letters. It also looks for characters which commonly occur in file and
  88. // stream paths.
  89. //
  90. // The main trick here is to keep track of the last letter mapping we used
  91. // because it is probably still valid. If it isn't we adjust the mappings
  92. // to match the kind of letter character we're processing.
  93. if ( wc < L'A' // Below the first letter?
  94. || wc == L'\\' // Path separator?
  95. )
  96. return wc;
  97. if (wc >= mblkLower.iwcFirst && wc < mblkLower.iwcFirst + mblkLower.cwcSpan)
  98. return Map_from_0x0409_Block(wc, mblkLower, aiDeltaValues_Lower);
  99. if (wc >= mblkUpper.iwcFirst && wc < mblkUpper.iwcFirst + mblkUpper.cwcSpan)
  100. {
  101. if (wc != Map_from_0x0409_Block(wc, mblkUpper, aiDeltaValues_Upper))
  102. return wc; // WC was a lower case letter already!
  103. }
  104. CodeMapBlock mblkLC = Find_0x0409_Map_Block
  105. (wc,
  106. &(UCMB_Lower->cmb),
  107. sizeof(UCMB_Lower) / sizeof(UCodeMapBlock)
  108. );
  109. CodeMapBlock mblkUC = Find_0x0409_Map_Block
  110. (wc,
  111. &(UCMB_Upper->cmb),
  112. sizeof(UCMB_Upper) / sizeof(UCodeMapBlock)
  113. );
  114. WCHAR wcLC = Map_from_0x0409_Block(wc, mblkLC, aiDeltaValues_Lower);
  115. WCHAR wcUC = Map_from_0x0409_Block(wc, mblkUC, aiDeltaValues_Upper);
  116. if (wcLC != wc || wcUC != wc) // Was wc a letter?
  117. {
  118. mblkLower = mblkLC;
  119. mblkUpper = mblkUC;
  120. }
  121. return wcLC;
  122. }
  123. INT wcsicmp_0x0409(const WCHAR * pwcLeft, const WCHAR *pwcRight)
  124. {
  125. CodeMapBlock mblkUC = UCMB_NULL.cmb;
  126. CodeMapBlock mblkLC = UCMB_NULL.cmb;
  127. const WCHAR *pwcLeftBase = pwcLeft;
  128. const WCHAR *pwcRightBase = pwcRight;
  129. // The code below returns zero when the two strings differ only by case.
  130. // Otherwise the value it returns will order strings by their Unicode character
  131. // values. This is important for later path manager implementations which use
  132. // Trie structures.
  133. for (;;)
  134. {
  135. WCHAR wcLeft = Map_to_0x0409_Lower_with_History(*pwcLeft ++, mblkLC, mblkUC);
  136. WCHAR wcRight = Map_to_0x0409_Lower_with_History(*pwcRight++, mblkLC, mblkUC);
  137. INT diff= wcLeft - wcRight;
  138. if (diff || !wcLeft)
  139. return diff;
  140. }
  141. }
  142. WCHAR WC_To_0x0409_Upper(WCHAR wc)
  143. {
  144. return Map_to_0x0409_Case(wc, (const short*) &aiDeltaValues_Upper, &(UCMB_Upper->cmb),
  145. sizeof(UCMB_Upper) / sizeof(UCodeMapBlock)
  146. );
  147. }
  148. WCHAR WC_To_0x0409_Lower(WCHAR wc)
  149. {
  150. return Map_to_0x0409_Case(wc, (const short *) &aiDeltaValues_Lower, &(UCMB_Lower->cmb),
  151. sizeof(UCMB_Lower) / sizeof(UCodeMapBlock)
  152. );
  153. }