nsUnicharUtils.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. #include "nsUnicharUtils.h"
  6. #include "nsXPCOMStrings.h"
  7. #include "nsUTF8Utils.h"
  8. #include "nsUnicodeProperties.h"
  9. #include "mozilla/Likely.h"
  10. #include "mozilla/HashFunctions.h"
  // ASCII case-folding table, indexed by a 7-bit character code. Every entry
  // maps to itself except the upper-case letters 'A'-'Z' (0x41-0x5a), whose
  // entries hold the matching lower-case letters 'a'-'z' (0x61-0x7a).
  static const uint8_t gASCIIToLower [128] = {
    // 0x00 - 0x0f
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    // 0x10 - 0x1f
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    // 0x20 - 0x2f
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
    // 0x30 - 0x3f
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
    // 0x40 - 0x4f: '@' followed by 'A'-'O' folded to lower case
    0x40, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50 - 0x5f: 'P'-'Z' folded to lower case, then punctuation
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    'x', 'y', 'z', 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
    // 0x60 - 0x6f
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
    // 0x70 - 0x7f
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
  };
  // Character-class predicates for the ASCII range. The range checks expand
  // their argument more than once, so avoid arguments with side effects.
  //   IS_ASCII        - value is below 0x80
  //   IS_ASCII_UPPER  - value is in 'A'..'Z'
  //   IS_ASCII_LOWER  - value is in 'a'..'z'
  //   IS_ASCII_ALPHA  - value is an ASCII letter of either case
  //   IS_ASCII_SPACE  - value is exactly the space character ' '
  #define IS_ASCII(u) (0x80 > (u))
  #define IS_ASCII_UPPER(u) (((u) >= 'A') && ((u) <= 'Z'))
  #define IS_ASCII_LOWER(u) (((u) >= 'a') && ((u) <= 'z'))
  #define IS_ASCII_ALPHA(u) (IS_ASCII_UPPER(u) || IS_ASCII_LOWER(u))
  #define IS_ASCII_SPACE(u) ((u) == ' ')
  28. // We want ToLowerCase(uint32_t) and ToLowerCaseASCII(uint32_t) to be fast
  29. // when they're called from within the case-insensitive comparators, so we
  30. // define inlined versions.
  31. static MOZ_ALWAYS_INLINE uint32_t
  32. ToLowerCase_inline(uint32_t aChar)
  33. {
  34. if (IS_ASCII(aChar)) {
  35. return gASCIIToLower[aChar];
  36. }
  37. return mozilla::unicode::GetLowercase(aChar);
  38. }
  39. static MOZ_ALWAYS_INLINE uint32_t
  40. ToLowerCaseASCII_inline(const uint32_t aChar)
  41. {
  42. if (IS_ASCII(aChar)) {
  43. return gASCIIToLower[aChar];
  44. }
  45. return aChar;
  46. }
  47. void
  48. ToLowerCase(nsAString& aString)
  49. {
  50. char16_t *buf = aString.BeginWriting();
  51. ToLowerCase(buf, buf, aString.Length());
  52. }
  53. void
  54. ToLowerCase(const nsAString& aSource,
  55. nsAString& aDest)
  56. {
  57. const char16_t *in;
  58. char16_t *out;
  59. uint32_t len = NS_StringGetData(aSource, &in);
  60. NS_StringGetMutableData(aDest, len, &out);
  61. NS_ASSERTION(out, "Uh...");
  62. ToLowerCase(in, out, len);
  63. }
  64. uint32_t
  65. ToLowerCaseASCII(const uint32_t aChar)
  66. {
  67. return ToLowerCaseASCII_inline(aChar);
  68. }
  69. void
  70. ToUpperCase(nsAString& aString)
  71. {
  72. char16_t *buf = aString.BeginWriting();
  73. ToUpperCase(buf, buf, aString.Length());
  74. }
  75. void
  76. ToUpperCase(const nsAString& aSource,
  77. nsAString& aDest)
  78. {
  79. const char16_t *in;
  80. char16_t *out;
  81. uint32_t len = NS_StringGetData(aSource, &in);
  82. NS_StringGetMutableData(aDest, len, &out);
  83. NS_ASSERTION(out, "Uh...");
  84. ToUpperCase(in, out, len);
  85. }
  86. #ifdef MOZILLA_INTERNAL_API
  87. int32_t
  88. nsCaseInsensitiveStringComparator::operator()(const char16_t* lhs,
  89. const char16_t* rhs,
  90. uint32_t lLength,
  91. uint32_t rLength) const
  92. {
  93. return (lLength == rLength) ? CaseInsensitiveCompare(lhs, rhs, lLength) :
  94. (lLength > rLength) ? 1 : -1;
  95. }
  96. int32_t
  97. nsCaseInsensitiveUTF8StringComparator::operator()(const char* lhs,
  98. const char* rhs,
  99. uint32_t lLength,
  100. uint32_t rLength) const
  101. {
  102. return CaseInsensitiveCompare(lhs, rhs, lLength, rLength);
  103. }
  104. int32_t
  105. nsASCIICaseInsensitiveStringComparator::operator()(const char16_t* lhs,
  106. const char16_t* rhs,
  107. uint32_t lLength,
  108. uint32_t rLength) const
  109. {
  110. if (lLength != rLength) {
  111. if (lLength > rLength)
  112. return 1;
  113. return -1;
  114. }
  115. while (rLength) {
  116. // we don't care about surrogates here, because we're only
  117. // lowercasing the ASCII range
  118. char16_t l = *lhs++;
  119. char16_t r = *rhs++;
  120. if (l != r) {
  121. l = ToLowerCaseASCII_inline(l);
  122. r = ToLowerCaseASCII_inline(r);
  123. if (l > r)
  124. return 1;
  125. else if (r > l)
  126. return -1;
  127. }
  128. rLength--;
  129. }
  130. return 0;
  131. }
  132. #endif // MOZILLA_INTERNAL_API
  133. uint32_t
  134. ToLowerCase(uint32_t aChar)
  135. {
  136. return ToLowerCase_inline(aChar);
  137. }
  138. void
  139. ToLowerCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
  140. {
  141. for (uint32_t i = 0; i < aLen; i++) {
  142. uint32_t ch = aIn[i];
  143. if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
  144. NS_IS_LOW_SURROGATE(aIn[i + 1])) {
  145. ch = mozilla::unicode::GetLowercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
  146. NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
  147. aOut[i++] = H_SURROGATE(ch);
  148. aOut[i] = L_SURROGATE(ch);
  149. continue;
  150. }
  151. aOut[i] = ToLowerCase(ch);
  152. }
  153. }
  154. uint32_t
  155. ToUpperCase(uint32_t aChar)
  156. {
  157. if (IS_ASCII(aChar)) {
  158. if (IS_ASCII_LOWER(aChar)) {
  159. return aChar - 0x20;
  160. }
  161. return aChar;
  162. }
  163. return mozilla::unicode::GetUppercase(aChar);
  164. }
  165. void
  166. ToUpperCase(const char16_t *aIn, char16_t *aOut, uint32_t aLen)
  167. {
  168. for (uint32_t i = 0; i < aLen; i++) {
  169. uint32_t ch = aIn[i];
  170. if (NS_IS_HIGH_SURROGATE(ch) && i < aLen - 1 &&
  171. NS_IS_LOW_SURROGATE(aIn[i + 1])) {
  172. ch = mozilla::unicode::GetUppercase(SURROGATE_TO_UCS4(ch, aIn[i + 1]));
  173. NS_ASSERTION(!IS_IN_BMP(ch), "case mapping crossed BMP/SMP boundary!");
  174. aOut[i++] = H_SURROGATE(ch);
  175. aOut[i] = L_SURROGATE(ch);
  176. continue;
  177. }
  178. aOut[i] = ToUpperCase(ch);
  179. }
  180. }
  181. uint32_t
  182. ToTitleCase(uint32_t aChar)
  183. {
  184. if (IS_ASCII(aChar)) {
  185. return ToUpperCase(aChar);
  186. }
  187. return mozilla::unicode::GetTitlecaseForLower(aChar);
  188. }
  189. int32_t
  190. CaseInsensitiveCompare(const char16_t *a,
  191. const char16_t *b,
  192. uint32_t len)
  193. {
  194. NS_ASSERTION(a && b, "Do not pass in invalid pointers!");
  195. if (len) {
  196. do {
  197. uint32_t c1 = *a++;
  198. uint32_t c2 = *b++;
  199. // Unfortunately, we need to check for surrogates BEFORE we check
  200. // for equality, because we could have identical high surrogates
  201. // but non-identical characters, so we can't just skip them
  202. // If c1 isn't a surrogate, we don't bother to check c2;
  203. // in the case where it _is_ a surrogate, we're definitely going to get
  204. // a mismatch, and don't need to interpret and lowercase it
  205. if (NS_IS_HIGH_SURROGATE(c1) && len > 1 && NS_IS_LOW_SURROGATE(*a)) {
  206. c1 = SURROGATE_TO_UCS4(c1, *a++);
  207. if (NS_IS_HIGH_SURROGATE(c2) && NS_IS_LOW_SURROGATE(*b)) {
  208. c2 = SURROGATE_TO_UCS4(c2, *b++);
  209. }
  210. // If c2 wasn't a surrogate, decrementing len means we'd stop
  211. // short of the end of string b, but that doesn't actually matter
  212. // because we're going to find a mismatch and return early
  213. --len;
  214. }
  215. if (c1 != c2) {
  216. c1 = ToLowerCase_inline(c1);
  217. c2 = ToLowerCase_inline(c2);
  218. if (c1 != c2) {
  219. if (c1 < c2) {
  220. return -1;
  221. }
  222. return 1;
  223. }
  224. }
  225. } while (--len != 0);
  226. }
  227. return 0;
  228. }
  229. // Calculates the codepoint of the UTF8 sequence starting at aStr. Sets aNext
  230. // to the byte following the end of the sequence.
  231. //
  232. // If the sequence is invalid, or if computing the codepoint would take us off
  233. // the end of the string (as marked by aEnd), returns -1 and does not set
  234. // aNext. Note that this function doesn't check that aStr < aEnd -- it assumes
  235. // you've done that already.
  236. static MOZ_ALWAYS_INLINE uint32_t
  237. GetLowerUTF8Codepoint(const char* aStr, const char* aEnd, const char **aNext)
  238. {
  239. // Convert to unsigned char so that stuffing chars into PRUint32s doesn't
  240. // sign extend.
  241. const unsigned char *str = (unsigned char*)aStr;
  242. if (UTF8traits::isASCII(str[0])) {
  243. // It's ASCII; just convert to lower-case and return it.
  244. *aNext = aStr + 1;
  245. return gASCIIToLower[*str];
  246. }
  247. if (UTF8traits::is2byte(str[0]) && MOZ_LIKELY(aStr + 1 < aEnd)) {
  248. // It's a two-byte sequence, so it looks like
  249. // 110XXXXX 10XXXXXX.
  250. // This is definitely in the BMP, so we can store straightaway into a
  251. // uint16_t.
  252. uint16_t c;
  253. c = (str[0] & 0x1F) << 6;
  254. c += (str[1] & 0x3F);
  255. // we don't go through ToLowerCase here, because we know this isn't
  256. // an ASCII character so the ASCII fast-path there is useless
  257. c = mozilla::unicode::GetLowercase(c);
  258. *aNext = aStr + 2;
  259. return c;
  260. }
  261. if (UTF8traits::is3byte(str[0]) && MOZ_LIKELY(aStr + 2 < aEnd)) {
  262. // It's a three-byte sequence, so it looks like
  263. // 1110XXXX 10XXXXXX 10XXXXXX.
  264. // This will just barely fit into 16-bits, so store into a uint16_t.
  265. uint16_t c;
  266. c = (str[0] & 0x0F) << 12;
  267. c += (str[1] & 0x3F) << 6;
  268. c += (str[2] & 0x3F);
  269. c = mozilla::unicode::GetLowercase(c);
  270. *aNext = aStr + 3;
  271. return c;
  272. }
  273. if (UTF8traits::is4byte(str[0]) && MOZ_LIKELY(aStr + 3 < aEnd)) {
  274. // It's a four-byte sequence, so it looks like
  275. // 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX.
  276. uint32_t c;
  277. c = (str[0] & 0x07) << 18;
  278. c += (str[1] & 0x3F) << 12;
  279. c += (str[2] & 0x3F) << 6;
  280. c += (str[3] & 0x3F);
  281. c = mozilla::unicode::GetLowercase(c);
  282. *aNext = aStr + 4;
  283. return c;
  284. }
  285. // Hm, we don't understand this sequence.
  286. return -1;
  287. }
  288. int32_t CaseInsensitiveCompare(const char *aLeft,
  289. const char *aRight,
  290. uint32_t aLeftBytes,
  291. uint32_t aRightBytes)
  292. {
  293. const char *leftEnd = aLeft + aLeftBytes;
  294. const char *rightEnd = aRight + aRightBytes;
  295. while (aLeft < leftEnd && aRight < rightEnd) {
  296. uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, leftEnd, &aLeft);
  297. if (MOZ_UNLIKELY(leftChar == uint32_t(-1)))
  298. return -1;
  299. uint32_t rightChar = GetLowerUTF8Codepoint(aRight, rightEnd, &aRight);
  300. if (MOZ_UNLIKELY(rightChar == uint32_t(-1)))
  301. return -1;
  302. // Now leftChar and rightChar are lower-case, so we can compare them.
  303. if (leftChar != rightChar) {
  304. if (leftChar > rightChar)
  305. return 1;
  306. return -1;
  307. }
  308. }
  309. // Make sure that if one string is longer than the other we return the
  310. // correct result.
  311. if (aLeft < leftEnd)
  312. return 1;
  313. if (aRight < rightEnd)
  314. return -1;
  315. return 0;
  316. }
  317. bool
  318. CaseInsensitiveUTF8CharsEqual(const char* aLeft, const char* aRight,
  319. const char* aLeftEnd, const char* aRightEnd,
  320. const char** aLeftNext, const char** aRightNext,
  321. bool* aErr)
  322. {
  323. NS_ASSERTION(aLeftNext, "Out pointer shouldn't be null.");
  324. NS_ASSERTION(aRightNext, "Out pointer shouldn't be null.");
  325. NS_ASSERTION(aErr, "Out pointer shouldn't be null.");
  326. NS_ASSERTION(aLeft < aLeftEnd, "aLeft must be less than aLeftEnd.");
  327. NS_ASSERTION(aRight < aRightEnd, "aRight must be less than aRightEnd.");
  328. uint32_t leftChar = GetLowerUTF8Codepoint(aLeft, aLeftEnd, aLeftNext);
  329. if (MOZ_UNLIKELY(leftChar == uint32_t(-1))) {
  330. *aErr = true;
  331. return false;
  332. }
  333. uint32_t rightChar = GetLowerUTF8Codepoint(aRight, aRightEnd, aRightNext);
  334. if (MOZ_UNLIKELY(rightChar == uint32_t(-1))) {
  335. *aErr = true;
  336. return false;
  337. }
  338. // Can't have an error past this point.
  339. *aErr = false;
  340. return leftChar == rightChar;
  341. }
  342. namespace mozilla {
  343. uint32_t
  344. HashUTF8AsUTF16(const char* aUTF8, uint32_t aLength, bool* aErr)
  345. {
  346. uint32_t hash = 0;
  347. const char* s = aUTF8;
  348. const char* end = aUTF8 + aLength;
  349. *aErr = false;
  350. while (s < end)
  351. {
  352. uint32_t ucs4 = UTF8CharEnumerator::NextChar(&s, end, aErr);
  353. if (*aErr) {
  354. return 0;
  355. }
  356. if (ucs4 < PLANE1_BASE) {
  357. hash = AddToHash(hash, ucs4);
  358. }
  359. else {
  360. hash = AddToHash(hash, H_SURROGATE(ucs4), L_SURROGATE(ucs4));
  361. }
  362. }
  363. return hash;
  364. }
  365. bool
  366. IsSegmentBreakSkipChar(uint32_t u)
  367. {
  368. return unicode::IsEastAsianWidthFWH(u) &&
  369. unicode::GetScriptCode(u) != unicode::Script::HANGUL;
  370. }
  371. } // namespace mozilla