Source code of Windows XP (NT5)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

149 lines
4.0 KiB

  1. // =================================================================================
  2. // Internet Character Set Detection: For Japanese
  3. // =================================================================================
  4. #include "private.h"
  5. #include "detcbase.h"
  6. #include "detcjpn.h"
  7. #include "fechrcnv.h"
  8. #include "codepage.h"
  9. CIncdJapanese::CIncdJapanese(DWORD nCp)
  10. {
  11. m_nScoreJis = 0;
  12. m_nScoreEuc = 0;
  13. m_nScoreSJis = 0;
  14. m_nISOMode = NONE ;
  15. m_nJISMode = REGULAR;
  16. m_nEucMode = REGULAR;
  17. m_fDoubleByteSJis = FALSE;
  18. // If Jpn autoselect, we'll bias to Shift-Jis like we did before
  19. m_nPreferredCp = (nCp == CP_JP_AUTO)? CP_JPN_SJ : nCp;
  20. }
  21. BOOL CIncdJapanese::CheckISOChar(UCHAR tc)
  22. {
  23. switch (m_nISOMode) {
  24. case NONE:
  25. if ( tc == ESC )
  26. m_nISOMode = ISO_ESC ;
  27. break;
  28. case ISO_ESC:
  29. if ( tc == ISO2022_IN_CHAR ) // '$'
  30. m_nISOMode = ISO_ESC_IN ;
  31. else if ( tc == ISO2022_OUT_CHAR )
  32. m_nISOMode = ISO_ESC_OUT ; // '('
  33. else
  34. m_nISOMode = NONE ;
  35. break;
  36. case ISO_ESC_IN: // esc '$'
  37. m_nISOMode = NONE ;
  38. if ( tc == ISO2022_IN_JP_CHAR1 || // 'B'
  39. tc == ISO2022_IN_JP_CHAR2 ) // '@'
  40. {
  41. m_nJISMode = DOUBLEBYTE ;
  42. return TRUE ;
  43. }
  44. break;
  45. case ISO_ESC_OUT: // esc '('
  46. m_nISOMode = NONE ;
  47. if ( tc == ISO2022_OUT_JP_CHAR1 || // 'B'
  48. tc == ISO2022_OUT_JP_CHAR2 ) // 'J'
  49. {
  50. m_nJISMode = REGULAR ;
  51. return TRUE ;
  52. }
  53. else if ( tc == ISO2022_OUT_JP_CHAR3 ) // 'I'
  54. {
  55. m_nJISMode = KATAKANA;
  56. return TRUE ;
  57. }
  58. break;
  59. }
  60. return FALSE;
  61. }
  62. BOOL CIncdJapanese::DetectChar(UCHAR tc)
  63. {
  64. // JIS
  65. if ( CheckISOChar(tc) )
  66. return FALSE; // JIS mode change, don't need to check other type
  67. switch (m_nJISMode) {
  68. case REGULAR:
  69. if (tc < 0x80)
  70. m_nScoreJis += SCORE_MAJOR;
  71. break;
  72. case DOUBLEBYTE:
  73. case KATAKANA:
  74. m_nScoreJis += SCORE_MAJOR;
  75. return FALSE; // In JIS mode for sure, don't need to check other type
  76. }
  77. // EUC-J
  78. switch (m_nEucMode) {
  79. case REGULAR:
  80. if (tc >= 0xa1 && tc <= 0xfe) // Double Byte
  81. m_nEucMode = DOUBLEBYTE;
  82. else if (tc == 0x8e) // Single Byte Katakana
  83. m_nEucMode = KATAKANA;
  84. else if (tc < 0x80)
  85. m_nScoreEuc += SCORE_MAJOR;
  86. break;
  87. case DOUBLEBYTE:
  88. if (tc >= 0xa1 && tc <= 0xfe)
  89. m_nScoreEuc += SCORE_MAJOR * 2;
  90. m_nEucMode = REGULAR;
  91. break;
  92. case KATAKANA:
  93. if (tc >= 0xa1 && tc <= 0xdf) // Katakana range
  94. m_nScoreEuc += SCORE_MAJOR * 2;
  95. m_nEucMode = REGULAR;
  96. break;
  97. }
  98. // Shift-JIS
  99. if (!m_fDoubleByteSJis) {
  100. if ((tc >= 0x81 && tc <= 0x9f) || (tc >= 0xe0 && tc <= 0xfc)) // Double Byte
  101. m_fDoubleByteSJis = TRUE;
  102. else if (tc <= 0x7e || (tc >= 0xa1 && tc <= 0xdf))
  103. m_nScoreSJis += SCORE_MAJOR;
  104. } else {
  105. if (tc >= 0x40 && tc <= 0xfc && tc != 0x7f) // Trail Byte range
  106. m_nScoreSJis += SCORE_MAJOR * 2;
  107. m_fDoubleByteSJis = FALSE;
  108. }
  109. return FALSE;
  110. }
  111. int CIncdJapanese::GetDetectedCodeSet()
  112. {
  113. int nMaxScore = m_nScoreSJis;
  114. int nCodeSet = CP_JPN_SJ;
  115. if (m_nScoreEuc > nMaxScore) {
  116. nMaxScore = m_nScoreEuc;
  117. nCodeSet = CP_EUC_JP ; // EUC
  118. } else if (m_nScoreEuc == nMaxScore) {
  119. if (m_nScoreEuc > MIN_JPN_DETECTLEN * SCORE_MAJOR)
  120. // If the given string is not long enough, we should rather choose SJIS
  121. // This helps fix the bug when we are just given Window Title
  122. // at Shell HyperText view.
  123. nCodeSet = CP_EUC_JP ; // EUC
  124. else
  125. // If we can't distinguish between EUC and Shift-Jis, we use the preferred one
  126. nCodeSet = m_nPreferredCp;
  127. }
  128. // JIS
  129. if (m_nScoreJis > nMaxScore)
  130. nCodeSet = CP_ISO_2022_JP ;
  131. // Even score means all 7bits chars
  132. // in this case, it maybe just pure ANSI data, we return it is ambiguous.
  133. else if (m_nScoreJis == nMaxScore)
  134. nCodeSet = 0 ;
  135. return nCodeSet;
  136. }