ustr_imp.h 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
*   Copyright (C) 1999-2015, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   file name:  ustr_imp.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2001jan30
*   created by: Markus W. Scherer
*/
  16. #ifndef __USTR_IMP_H__
  17. #define __USTR_IMP_H__
  18. #include "unicode/utypes.h"
  19. #include "unicode/utf8.h"
  20. /**
  21. * Internal option for unorm_cmpEquivFold() for strncmp style.
  22. * If set, checks for both string length and terminating NUL.
  23. */
  24. #define _STRNCMP_STYLE 0x1000
  25. /**
  26. * Compare two strings in code point order or code unit order.
  27. * Works in strcmp style (both lengths -1),
  28. * strncmp style (lengths equal and >=0, flag true),
  29. * and memcmp/UnicodeString style (at least one length >=0).
  30. */
  31. U_CFUNC int32_t U_EXPORT2
  32. uprv_strCompare(const UChar *s1, int32_t length1,
  33. const UChar *s2, int32_t length2,
  34. UBool strncmpStyle, UBool codePointOrder);
  35. U_CAPI int32_t U_EXPORT2
  36. ustr_hashUCharsN(const UChar *str, int32_t length);
  37. U_CAPI int32_t U_EXPORT2
  38. ustr_hashCharsN(const char *str, int32_t length);
  39. U_CAPI int32_t U_EXPORT2
  40. ustr_hashICharsN(const char *str, int32_t length);
  41. /**
  42. * Convert an ASCII-range lowercase character to uppercase.
  43. *
  44. * @param c A UChar.
  45. * @return If UChar is a lowercase ASCII character, returns the uppercase version.
  46. * Otherwise, returns the input character.
  47. */
  48. U_CAPI UChar U_EXPORT2
  49. u_asciiToUpper(UChar c);
  50. // TODO: Add u_asciiToLower if/when there is a need for it.
  51. /**
  52. * NUL-terminate a UChar * string if possible.
  53. * If length < destCapacity then NUL-terminate.
  54. * If length == destCapacity then do not terminate but set U_STRING_NOT_TERMINATED_WARNING.
  55. * If length > destCapacity then do not terminate but set U_BUFFER_OVERFLOW_ERROR.
  56. *
  57. * @param dest Destination buffer, can be NULL if destCapacity==0.
  58. * @param destCapacity Number of UChars available at dest.
  59. * @param length Number of UChars that were (to be) written to dest.
  60. * @param pErrorCode ICU error code.
  61. * @return length
  62. */
  63. U_CAPI int32_t U_EXPORT2
  64. u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
  65. /**
  66. * NUL-terminate a char * string if possible.
  67. * Same as u_terminateUChars() but for a different string type.
  68. */
  69. U_CAPI int32_t U_EXPORT2
  70. u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
  71. /**
  72. * NUL-terminate a UChar32 * string if possible.
  73. * Same as u_terminateUChars() but for a different string type.
  74. */
  75. U_CAPI int32_t U_EXPORT2
  76. u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
  77. /**
  78. * NUL-terminate a wchar_t * string if possible.
  79. * Same as u_terminateUChars() but for a different string type.
  80. */
  81. U_CAPI int32_t U_EXPORT2
  82. u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode);
  83. /**
  84. * Counts the bytes of any whole valid sequence for a UTF-8 lead byte.
  85. * Returns 1 for ASCII 0..0x7f.
  86. * Returns 0 for 0x80..0xc1 as well as for 0xf5..0xff.
  87. * leadByte might be evaluated multiple times.
  88. *
  89. * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
  90. * @return 0..4
  91. */
  92. #define U8_COUNT_BYTES(leadByte) \
  93. (U8_IS_SINGLE(leadByte) ? 1 : U8_COUNT_BYTES_NON_ASCII(leadByte))
  94. /**
  95. * Counts the bytes of any whole valid sequence for a UTF-8 lead byte.
  96. * Returns 0 for 0x00..0xc1 as well as for 0xf5..0xff.
  97. * leadByte might be evaluated multiple times.
  98. *
  99. * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
  100. * @return 0 or 2..4
  101. */
  102. #define U8_COUNT_BYTES_NON_ASCII(leadByte) \
  103. (U8_IS_LEAD(leadByte) ? ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+2 : 0)
  104. #ifdef __cplusplus
  105. U_NAMESPACE_BEGIN
  106. class UTF8 {
  107. public:
  108. UTF8() = delete; // all static
  109. /**
  110. * Is t a valid UTF-8 trail byte?
  111. *
  112. * @param prev Must be the preceding lead byte if i==1 and length>=3;
  113. * otherwise ignored.
  114. * @param t The i-th byte following the lead byte.
  115. * @param i The index (1..3) of byte t in the byte sequence. 0<i<length
  116. * @param length The length (2..4) of the byte sequence according to the lead byte.
  117. * @return true if t is a valid trail byte in this context.
  118. */
  119. static inline UBool isValidTrail(int32_t prev, uint8_t t, int32_t i, int32_t length) {
  120. // The first trail byte after a 3- or 4-byte lead byte
  121. // needs to be validated together with its lead byte.
  122. if (length <= 2 || i > 1) {
  123. return U8_IS_TRAIL(t);
  124. } else if (length == 3) {
  125. return U8_IS_VALID_LEAD3_AND_T1(prev, t);
  126. } else { // length == 4
  127. return U8_IS_VALID_LEAD4_AND_T1(prev, t);
  128. }
  129. }
  130. };
  131. U_NAMESPACE_END
  132. #endif // __cplusplus
  133. #endif