utf.h 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
  1. #ifndef __sti__utf_h__
  2. #define __sti__utf_h__
  3. /////////////////////////////////////////////////////////////////////////
  4. //
  5. // NOTE:
  6. //
  7. // This is a basic, general utf8/32 string handling library.
  8. // It is written for speed and efficiency before "correctness".
  9. // It will pass through any characters it does not know about.
  10. // It is aimed at *reasonable* support for *most* modern languages.
  11. // It IS NOT a full-featured, complete, correct unicode library.
  12. // It DOES NOT handle every intricacy of every language, potentially
  13. // including your favorite language.
  14. // It makes no attempt, and never will, to support special features of
  15. // ancient, obscure, or unusual languages.
  16. // Character properties above FFFF are completely unsupported.
  17. //
  18. /////////////////////////////////////////////////////////////////////////
  19. // https://www.unicode.org/reports/tr44/#UnicodeData.txt
  #include <stdint.h>
  #include <stdlib.h>
  #include <string.h>
  21. // utf8 functions
  22. /*
  23. Naming Convention
  24. Mostly derived from string.h
  25. str [n|k] r? case? {operation} [8|32] p? _inplace?
  26. str: consistent prefix
  27. n: limited by bytes
  28. k: limited by codepoints (characters)
  29. r: reverse; starts at the end of the string
  30. case: case-insensitive
  31. operation: see below
  32. 8: operates on utf8
  33. 32: operates on utf32
  34. p: (utf8 only) accepts a pointer to a potentially multibyte encoded sequence instead of a 32-bit codepoint
  35. _inplace: modifies the source buffer
  36. Operations:
  37. cat: append onto existing string; strcat
  38. chr: search for a character; strchr
  39. cmp: compare; strcmp
  40. colwsp: collapse whitespace. All sequences of whitespace are converted into a single copy of the provided character
  41. cpy: copy; strcpy
  42. cspn: return length of inverse prefix substring (postfix for 'r' version)
  43. dup: duplicate into newly allocated memory; strdup
  44. dupa: duplicate onto the stack using alloca
  45. len: calculate length; strlen
  46. pbrk: search for the first of any of a set of characters
  47. rev: reverse the string
  48. spn: return length of prefix substring (postfix for 'r' version)
  49. skip: search for the first character not in a set of characters (strspn, but returns a pointer)
  50. str: search for a substring; strstr
  51. tolower: convert the string to lowercase
  52. toupper: convert the string to uppercase
  53. ltrim: remove a prefix of characters in a set from the beginning of the string
  54. rtrim: remove a suffix of characters in a set from the end of the string
  trim: remove a sequence of characters in a set from both the beginning and the end of the string
  56. Arguments:
  57. char* c8; a pointer to a single utf8 character, up to 4 bytes
  58. uint32_t c32; a unicode codepoint (utf32)
  59. size_t blen; operation limit, in bytes
  60. size_t clen; operation limit, in codepoints
  61. "n" versions should not leave behind mutilated multi-byte characters if they hit the limit in the middle of one.
  62. */
  63. // spn functions return the number of characters spanned
  64. // TODO: char* strnchr(const char* s, int c, size_t n); // returns a pointer to the found char, or NULL
  65. // TODO: size_t strncspn(const char* s, const char* reject, size_t n);
  66. // TODO: char* strnpbrk(const char* s, const char* accept, size_t n);
  67. // TODO: size_t strnspn(const char* s, const char* accept, size_t n);
  68. // TODO: char* strrev(char* s); // returns s
  69. // TODO: char* strnrev(const char* s, size_t n); // returns s
  70. // TODO: char* strrpbrk(const char* s, const char* accept); // returns a pointer to the first match character
  71. // TODO: size_t strrcspn(const char* s, const char* reject);
  72. // TODO: size_t strrspn(const char* s, const char* accept);
  73. // TODO: char* strnrpbrk(const char* s, const char* accept, size_t n); // returns a pointer to the first match character
  74. // TODO: size_t strnrcspn(const char* s, const char* reject, size_t n);
  75. // TODO: size_t strnrspn(const char* s, const char* accept, size_t n);
  76. // TODO: char* strltrim(char* s, const char* charset); // moves chars to left, returns s
  77. // TODO: char* strnltrim(char* s, const char* charset, size_t n); // moves chars to left, returns s
  78. // TODO: char* strrtrim(char* s, const char* charset); // moves the null byte to the left, returns s
  79. // TODO: char* strtrim(char* s, const char* charset); // both above, returns s
  80. // TODO: char* strcolwsp(char* s); // does not trim, returns s
  81. // TODO: char* strncolwsp(char* s, size_t n); // does not trim, returns s
  82. // TODO: char* strcolwsptrim(char* s); // also trims, returns s
  83. // TODO: char* strcapwords(char* s); // capitalize the first letter following whitespace, and the beginning of the string, returns s
  84. // TODO: char* strcapsentences(char* s); // capitalize the first letter following terminal punctuation, and the beginning of the string, returns s
  85. // TODO: char* strncapwords(char* s, size_t n) -- capitalize the first letter following whitespace, and the beginning of the string, returns s
  86. // TODO: char* strncapsentences(char* s, size_t n) -- capitalize the first letter following terminal punctuation, and the beginning of the string, returns s
  87. // replace functions should be optimized for very long src strings and short needle and replacement strings
  88. // strreplace(char* src, char* dst, char* needle, char* replacement) // src != dst, replaces all occurrences of needle with replacement
  89. // strnreplace(char* src, char* dst, char* needle, char* replacement, size_t dst_alloc_len) // src != dst
  90. // strreplace_inplace(char* src, char* needle, char* replacement) // modifies src in-place. (must be done in reverse if strlen(replacement) > strlen(needle), counting needles on the forward pass)
  91. // strnreplace_inplace(char* src, char* needle, char* replacement, size_t src_alloc_len) // modifies src in-place. (must be done in reverse if strlen(replacement) > strlen(needle), counting needles on the forward pass)
  92. // format numbers, bytes, money
  93. // limited ('n') versions stop at the null byte or the limit, whichever is first
  94. // edge cases:
  95. // null pointers result in segfault
  96. // empty strings
  97. // 1-char strings
  98. // strings limited at 0
  99. // strings limited at 1
  100. // limited strings that hit the null byte
  101. // returns the number of characters in a utf8 string
  102. size_t charlen8(const char* u8);
  103. // returns a new buffer, caller must free, or NULL when the string is malformed
  104. uint32_t* utf8_to_utf32(uint8_t* u8, size_t* outLen);
  105. // It is the caller's responsibility to provide at least 4 bytes of output memory
  106. // returns the number of bytes used.
  107. int utf32_to_utf8(uint32_t u32, uint8_t* u8_out);
  108. // returns the number of bytes needed to encode this codepoint in utf8
  109. int utf8_bytes_needed(uint32_t u32);
  // byte length of a single utf8 character, with subsequent byte format verification and null checks
  111. int utf8_char_size(const char* u8);
  112. // returns 1 if there are multi-byte sequences, 0 otherwise
  113. int utf8_has_multibyte(const uint8_t* u8);
  114. // size_t strcspn8(const char* a, const char* b);
  115. // char* strpbrk8(const char* a, const char* b);
  116. // char* strtok8_r(char* s, const char* delim, char** saveptr);
  117. // char* strnstr8(const char* a, const char* b, size_t len) { return strnstr(a, b, len); }
  118. // char* strcasestr8(const char* a, const char* b);
  119. // some normal string functions are utf8 safe. *8 versions are provided here so you don't have
  120. // to remember which ones are which
  // Length of a utf8 string in bytes, not counting the terminating null byte.
  // Forwards to strlen(), so the result is a byte count rather than a
  // character count.
  static inline size_t strlen8(const char* s) {
      const size_t byte_len = strlen(s);
      return byte_len;
  }
  // Appends the whole of src to the end of dst via strcat().
  // Returns strcat()'s result, which is dst.
  static inline char* strcat8(char* dst, const char* src) {
      char* const joined = strcat(dst, src);
      return joined;
  }
  // Appends at most len bytes of src to the null-terminated string dst and
  // null-terminates the result. Returns dst.
  //
  // A bare strncat() will happily stop in the middle of a multi-byte utf8
  // sequence. Here, when the byte limit lands inside a character, that whole
  // character is dropped so dst never ends in a mutilated fragment. Input that
  // ends before the limit is passed through untouched, well-formed or not.
  // As with strncat(), dst must have room for the appended bytes plus the
  // terminator, and src is never read past the first len bytes.
  static inline char* strncat8(char* dst, const char* src, size_t len) {
      // number of bytes strncat() would take: up to the null byte or the limit
      size_t n = 0;
      while (n < len && src[n] != '\0') {
          n++;
      }

      if (n == len && n > 0) {
          // The limit was hit. Walk back over at most 3 trailing continuation
          // bytes (10xxxxxx) to find the lead byte of the last character.
          size_t lead = n - 1;
          while (lead > 0 && n - lead < 4 && ((unsigned char)src[lead] & 0xC0) == 0x80) {
              lead--;
          }

          // Sequence length announced by the lead byte; anything that is not
          // a valid lead byte counts as a single pass-through byte.
          const unsigned char b = (unsigned char)src[lead];
          const size_t need = (b & 0xE0) == 0xC0 ? 2
                            : (b & 0xF0) == 0xE0 ? 3
                            : (b & 0xF8) == 0xF0 ? 4
                            : 1;

          if (need > n - lead) {
              n = lead; // last character is incomplete: cut just before it
          }
      }

      return strncat(dst, src, n);
  }
  124. char* strkcat8(char* dst, const char* src, size_t clen);
  125. // returns NULL on not found or if codepoint is invalid
  126. char* strchr8(const char* s, uint32_t c32);
  127. // c8 is a pointer to a single utf8 character, up to 4 bytes
  128. // returns NULL on not found or if codepoint is invalid
  129. char* strchr8p(const char* s, const char* c8);
  130. char* strrchr8(const char* s, uint32_t c32);
  131. char* strrchr8p(const char* s, const char* c8);
  132. char* strnchr8(const char* s, uint32_t c32, size_t blen);
  // Copies src, including its terminating null byte, into dst via strcpy().
  // Returns strcpy()'s result, which is dst.
  static inline char* strcpy8(char* dst, const char* src) {
      char* const copied = strcpy(dst, src);
      return copied;
  }
  // Copies at most blen bytes of src into dst and zero-fills the remainder of
  // the blen-byte window, like strncpy(). Returns dst.
  //
  // A bare strncpy() will happily stop in the middle of a multi-byte utf8
  // sequence. Here, when the byte limit lands inside a character, that whole
  // character is left out and its bytes are covered by the zero padding, so
  // dst never holds a mutilated fragment. Input that ends before the limit is
  // passed through untouched, well-formed or not.
  //
  // As with strncpy(), dst is NOT null-terminated when src fills all blen
  // bytes with complete characters.
  static inline char* strncpy8(char* dst, const char* src, size_t blen) {
      // number of bytes strncpy() would copy: up to the null byte or the limit
      size_t n = 0;
      while (n < blen && src[n] != '\0') {
          n++;
      }

      if (n == blen && n > 0) {
          // The limit was hit. Walk back over at most 3 trailing continuation
          // bytes (10xxxxxx) to find the lead byte of the last character.
          size_t lead = n - 1;
          while (lead > 0 && n - lead < 4 && ((unsigned char)src[lead] & 0xC0) == 0x80) {
              lead--;
          }

          // Sequence length announced by the lead byte; anything that is not
          // a valid lead byte counts as a single pass-through byte.
          const unsigned char b = (unsigned char)src[lead];
          const size_t need = (b & 0xE0) == 0xC0 ? 2
                            : (b & 0xF0) == 0xE0 ? 3
                            : (b & 0xF8) == 0xF0 ? 4
                            : 1;

          if (need > n - lead) {
              n = lead; // last character is incomplete: cut just before it
          }
      }

      memcpy(dst, src, n);
      memset(dst + n, 0, blen - n); // same zero padding strncpy() performs
      return dst;
  }
  135. char* strkcpy8(char* dst, const char* src, size_t clen);
  // Compares two strings with strcmp() and returns its result unchanged:
  // zero when equal, otherwise negative or positive by byte order.
  static inline int strcmp8(const char* a, const char* b) {
      const int order = strcmp(a, b);
      return order;
  }
  // Compares at most len bytes of two strings with strncmp() and returns its
  // result unchanged. The limit is in bytes, not characters.
  static inline int strncmp8(const char* a, const char* b, size_t len) {
      const int order = strncmp(a, b, len);
      return order;
  }
  // Finds the first occurrence of b inside a using strstr().
  // Returns strstr()'s result: a pointer into a, or NULL when b is absent.
  static inline char* strstr8(const char* a, const char* b) {
      char* const hit = strstr(a, b);
      return hit;
  }
  // Duplicates s into newly allocated memory using strdup() and returns
  // strdup()'s result unchanged. The copy comes from strdup(), so it is
  // released with free().
  static inline char* strdup8(const char* const s) {
      char* const copy = strdup(s);
      return copy;
  }
  // Duplicates at most len bytes of s into newly allocated, null-terminated
  // memory. Returns strndup()'s result; release it with free().
  //
  // A bare strndup() will happily stop in the middle of a multi-byte utf8
  // sequence. Here, when the byte limit lands inside a character, that whole
  // character is left out so the copy never ends in a mutilated fragment.
  // Input that ends before the limit is passed through untouched, well-formed
  // or not. s is never read past the first len bytes.
  static inline char* strndup8(const char* const s, size_t len) {
      // number of bytes strndup() would copy: up to the null byte or the limit
      size_t n = 0;
      while (n < len && s[n] != '\0') {
          n++;
      }

      if (n == len && n > 0) {
          // The limit was hit. Walk back over at most 3 trailing continuation
          // bytes (10xxxxxx) to find the lead byte of the last character.
          size_t lead = n - 1;
          while (lead > 0 && n - lead < 4 && ((unsigned char)s[lead] & 0xC0) == 0x80) {
              lead--;
          }

          // Sequence length announced by the lead byte; anything that is not
          // a valid lead byte counts as a single pass-through byte.
          const unsigned char b = (unsigned char)s[lead];
          const size_t need = (b & 0xE0) == 0xC0 ? 2
                            : (b & 0xF0) == 0xE0 ? 3
                            : (b & 0xF8) == 0xF0 ? 4
                            : 1;

          if (need > n - lead) {
              n = lead; // last character is incomplete: cut just before it
          }
      }

      return strndup(s, n);
  }
  141. // inline static char* strtok_r8(const char* const s, size_t len) { return strndup(s, len); }
  142. // strtok intentionally not implemented
  // strtok is intentionally not implemented because non-reentrant functions
  // are bad. Calling this stub terminates the program on purpose so the
  // mistake is caught immediately; use strtok_r instead.
  //
  // abort() is used rather than a store through a null pointer: the latter is
  // undefined behavior, and an optimizer is allowed to delete it, which would
  // turn this stub into a silent `return NULL`.
  static inline char* strtok8(char* s, const char* delim) {
      (void)s;
      (void)delim;
      abort();
      return NULL; // not reached; abort() does not return
  }
  149. // why even bother with utf16? all downsides, no upsides.
  150. // utf32 functions
  151. // in bytes, not including (4-byte) null terminator
  152. size_t strlen32(const uint32_t* const s);
  153. // in characters
  154. size_t charlen32(const uint32_t* const s);
  155. uint32_t* strcat32(uint32_t* dst, const uint32_t* src);
  156. uint32_t* strncat32(uint32_t* dst, const uint32_t* src, size_t len);
  157. uint32_t* strcpy32(uint32_t* dst, const uint32_t* src);
  158. uint32_t* strncpy32(uint32_t* dst, const uint32_t* src, size_t len);
  159. uint32_t* strchr32(const uint32_t* s, uint32_t c);
  160. uint32_t* strrchr32(const uint32_t* s, uint32_t c);
  161. uint32_t* strchrnul32(uint32_t* s, uint32_t c);
  162. int strcmp32(const uint32_t* a, const uint32_t* b);
  163. int strncmp32(const uint32_t* a, const uint32_t* b, size_t len);
  164. size_t strspn32(const uint32_t* s, const uint32_t* accept);
  165. size_t strcspn32(const uint32_t* s, const uint32_t* reject);
  166. // uint32_t* strpbrk32(const uint32_t* a, const uint32_t* b);
  167. // uint32_t* strstr32(const uint32_t* a, const uint32_t* b);
  168. // uint32_t* strnstr32(const uint32_t* a, const uint32_t* b);
  169. uint32_t* strdup32(const uint32_t* const s);
  170. // uint32_t* strndup32(const uint32_t* const s);
  171. // uint32_t* strtok_r32(uint32_t* s, const uint32_t* delim, uint32_t** saveptr);
  172. // strtok intentionally not implemented
  // strtok is intentionally not implemented because non-reentrant functions
  // are bad. Calling this stub terminates the program on purpose so the
  // mistake is caught immediately; use a strtok_r-style function instead.
  //
  // abort() is used rather than a store through a null pointer: the latter is
  // undefined behavior, and an optimizer is allowed to delete it, which would
  // turn this stub into a silent `return NULL`.
  static inline uint32_t* strtok32(uint32_t* s, const uint32_t* delim) {
      (void)s;
      (void)delim;
      abort();
      return NULL; // not reached; abort() does not return
  }
  179. // http://www.unicode.org/reports/tr44/#UnicodeData.txt
  180. // TODO:
  181. // toupper/tolower for whole strings
  182. // Capitalize words
  183. // Character class info
  184. // Strip emoji
  185. // Convert all emoji to random list of supplied emoji
  186. // printf
  187. // trim
  188. #endif // __sti__utf_h__