nsUnicodeToUTF8.cpp 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  2. /* This Source Code Form is subject to the terms of the Mozilla Public
  3. * License, v. 2.0. If a copy of the MPL was not distributed with this
  4. * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  5. //----------------------------------------------------------------------
  6. // Global functions and data [declaration]
  7. #include "nsUnicodeToUTF8.h"
  8. #include "mozilla/CheckedInt.h"
  9. NS_IMPL_ISUPPORTS(nsUnicodeToUTF8, nsIUnicodeEncoder)
  10. //----------------------------------------------------------------------
  11. // nsUnicodeToUTF8 class [implementation]
  12. NS_IMETHODIMP nsUnicodeToUTF8::GetMaxLength(const char16_t* aSrc,
  13. int32_t aSrcLength,
  14. int32_t* aDestLength)
  15. {
  16. MOZ_ASSERT(aDestLength);
  17. // aSrc is interpreted as UTF16, 3 is normally enough.
  18. // But when previous buffer only contains part of the surrogate pair, we
  19. // need to complete it here. If the first word in following buffer is not
  20. // in valid surrogate range, we need to convert the remaining of last buffer
  21. // to 3 bytes.
  22. mozilla::CheckedInt32 length = aSrcLength;
  23. length *= 3;
  24. length += 3;
  25. if (!length.isValid()) {
  26. return NS_ERROR_OUT_OF_MEMORY;
  27. }
  28. *aDestLength = length.value();
  29. return NS_OK;
  30. }
  31. NS_IMETHODIMP nsUnicodeToUTF8::Convert(const char16_t* aSrc,
  32. int32_t* aSrcLength,
  33. char* aDest,
  34. int32_t* aDestLength)
  35. {
  36. const char16_t* src = aSrc;
  37. const char16_t* srcEnd = aSrc + *aSrcLength;
  38. char* dest = aDest;
  39. int32_t destLen = *aDestLength;
  40. uint32_t n;
  41. //complete remaining of last conversion
  42. if (mHighSurrogate) {
  43. if (src < srcEnd) {
  44. *aDestLength = 0;
  45. return NS_OK_UENC_MOREINPUT;
  46. }
  47. if (*aDestLength < 4) {
  48. *aSrcLength = 0;
  49. *aDestLength = 0;
  50. return NS_OK_UENC_MOREOUTPUT;
  51. }
  52. if (*src < (char16_t)0xdc00 || *src > (char16_t)0xdfff) { //not a pair
  53. *dest++ = (char)0xef; //replacement character
  54. *dest++ = (char)0xbf;
  55. *dest++ = (char)0xbd;
  56. destLen -= 3;
  57. } else {
  58. n = ((mHighSurrogate - (char16_t)0xd800) << 10) +
  59. (*src - (char16_t)0xdc00) + 0x10000;
  60. *dest++ = (char)0xf0 | (n >> 18);
  61. *dest++ = (char)0x80 | ((n >> 12) & 0x3f);
  62. *dest++ = (char)0x80 | ((n >> 6) & 0x3f);
  63. *dest++ = (char)0x80 | (n & 0x3f);
  64. ++src;
  65. destLen -= 4;
  66. }
  67. mHighSurrogate = 0;
  68. }
  69. while (src < srcEnd) {
  70. if ( *src <= 0x007f) {
  71. if (destLen < 1)
  72. goto error_more_output;
  73. *dest++ = (char)*src;
  74. --destLen;
  75. } else if (*src <= 0x07ff) {
  76. if (destLen < 2)
  77. goto error_more_output;
  78. *dest++ = (char)0xc0 | (*src >> 6);
  79. *dest++ = (char)0x80 | (*src & 0x003f);
  80. destLen -= 2;
  81. } else if (*src >= (char16_t)0xd800 && *src <= (char16_t)0xdfff) {
  82. if (*src >= (char16_t)0xdc00) { //not a pair
  83. if (destLen < 3)
  84. goto error_more_output;
  85. *dest++ = (char)0xef; //replacement character
  86. *dest++ = (char)0xbf;
  87. *dest++ = (char)0xbd;
  88. destLen -= 3;
  89. ++src;
  90. continue;
  91. }
  92. if ((src+1) >= srcEnd) {
  93. //we need another surrogate to complete this unicode char
  94. mHighSurrogate = *src;
  95. *aDestLength = dest - aDest;
  96. return NS_OK_UENC_MOREINPUT;
  97. }
  98. //handle surrogate
  99. if (destLen < 4)
  100. goto error_more_output;
  101. if (*(src+1) < (char16_t)0xdc00 || *(src+1) > 0xdfff) { //not a pair
  102. *dest++ = (char)0xef; //replacement character
  103. *dest++ = (char)0xbf;
  104. *dest++ = (char)0xbd;
  105. destLen -= 3;
  106. } else {
  107. n = ((*src - (char16_t)0xd800) << 10) + (*(src+1) - (char16_t)0xdc00) + (uint32_t)0x10000;
  108. *dest++ = (char)0xf0 | (n >> 18);
  109. *dest++ = (char)0x80 | ((n >> 12) & 0x3f);
  110. *dest++ = (char)0x80 | ((n >> 6) & 0x3f);
  111. *dest++ = (char)0x80 | (n & 0x3f);
  112. destLen -= 4;
  113. ++src;
  114. }
  115. } else {
  116. if (destLen < 3)
  117. goto error_more_output;
  118. //treat rest of the character as BMP
  119. *dest++ = (char)0xe0 | (*src >> 12);
  120. *dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
  121. *dest++ = (char)0x80 | (*src & 0x003f);
  122. destLen -= 3;
  123. }
  124. ++src;
  125. }
  126. *aDestLength = dest - aDest;
  127. return NS_OK;
  128. error_more_output:
  129. *aSrcLength = src - aSrc;
  130. *aDestLength = dest - aDest;
  131. return NS_OK_UENC_MOREOUTPUT;
  132. }
  133. NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, int32_t * aDestLength)
  134. {
  135. char * dest = aDest;
  136. if (mHighSurrogate) {
  137. if (*aDestLength < 3) {
  138. *aDestLength = 0;
  139. return NS_OK_UENC_MOREOUTPUT;
  140. }
  141. *dest++ = (char)0xef; //replacement character
  142. *dest++ = (char)0xbf;
  143. *dest++ = (char)0xbd;
  144. mHighSurrogate = 0;
  145. *aDestLength = 3;
  146. return NS_OK;
  147. }
  148. *aDestLength = 0;
  149. return NS_OK;
  150. }