utf.c 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. #include <stddef.h>
  2. #include <stdint.h>
  3. #include <string.h>
  4. #include "utf.h"
  // Widens x to 32 bits before shifting it left by s bits.
  #define to32s(x, s) (((uint32_t)(x)) << (s))

  // Decodes a NUL-terminated UTF-8 string into a newly malloc'd, 0-terminated
  // array of UTF-32 code points.
  //   u8     - NUL-terminated UTF-8 input
  //   outLen - optional; on success receives the number of code points written,
  //            not counting the terminating 0
  // Returns the new array (the caller frees it), or NULL when the allocation fails
  // or the input is malformed. Malformed input is also reported on stderr.
  // Only the byte structure is checked (lead byte class and continuation bytes);
  // overlong forms and surrogate values are not rejected.
  uint32_t* utf8_to_utf32(uint8_t* u8, size_t* outLen) {
      size_t u8len = strlen((char*)u8);

      // one slot per input byte is an upper bound on the code point count, +1 for the terminator
      uint32_t* u32 = malloc((u8len + 1) * sizeof(*u32));
      if(!u32) return NULL;

      const uint8_t* s = u8;
      size_t i = 0;

      while(*s) {
          uint32_t cp;
          int extra; // number of continuation bytes that must follow the lead byte

          if((s[0] & 0x80) == 0x00) { // single byte
              cp = s[0];
              extra = 0;
          }
          else if((s[0] & 0xe0) == 0xc0) { // two bytes, 5 payload bits in the lead
              cp = s[0] & 0x1f;
              extra = 1;
          }
          else if((s[0] & 0xf0) == 0xe0) { // three bytes, 4 payload bits in the lead
              cp = s[0] & 0x0f;
              extra = 2;
          }
          else if((s[0] & 0xf8) == 0xf0) { // four bytes, 3 payload bits in the lead
              cp = s[0] & 0x07;
              extra = 3;
          }
          else {
              // stray continuation byte or a lead byte >= 0xf8; nothing can be consumed safely
              goto MALFORMED;
          }

          for(int k = 1; k <= extra; k++) {
              // a NUL byte fails this test too, so the scan never runs past the terminator
              if((s[k] & 0xc0) != 0x80) goto MALFORMED;
              cp = to32s(cp, 6) | (uint32_t)(s[k] & 0x3f);
          }

          u32[i++] = cp;
          s += extra + 1;
      }

      u32[i] = 0;
      if(outLen) *outLen = i;
      return u32;

  MALFORMED:
      fprintf(stderr, "Malformed UTF-8 sequence.\n");
      free(u32);
      return NULL;
  }
  // Encodes a single code point as UTF-8.
  //   u32    - code point to encode
  //   u8_out - output buffer; it is the caller's responsibility to provide at
  //            least 4 bytes of output memory
  // Returns the number of bytes written (1 to 4). A value that does not fit in a
  // four-byte sequence writes nothing and returns 5 as the "invalid" marker.
  // No Unicode range policy is applied (surrogates and values above 0x10ffff
  // are encoded as long as they fit).
  int utf32_to_utf8(uint32_t u32, uint8_t* u8_out) {
      if(u32 < 0x80) { // one byte
          u8_out[0] = (uint8_t)u32;
          return 1;
      }

      if(u32 < 0x800) { // two bytes
          u8_out[0] = (uint8_t)(((u32 >> 6) & 0x1f) | 0xc0);
          u8_out[1] = (uint8_t)((u32 & 0x3f) | 0x80);
          return 2;
      }

      if(u32 < 0x10000) { // three bytes
          u8_out[0] = (uint8_t)(((u32 >> 12) & 0x0f) | 0xe0);
          u8_out[1] = (uint8_t)(((u32 >> 6) & 0x3f) | 0x80);
          u8_out[2] = (uint8_t)((u32 & 0x3f) | 0x80);
          return 3;
      }

      // four bytes. we encode up to the physical limit of the form, which is
      // 21 payload bits (3 in the lead byte + 3 * 6), i.e. 0x1fffff.
      if(u32 <= 0x1fffff) {
          u8_out[0] = (uint8_t)(((u32 >> 18) & 0x07) | 0xf0);
          u8_out[1] = (uint8_t)(((u32 >> 12) & 0x3f) | 0x80);
          u8_out[2] = (uint8_t)(((u32 >> 6) & 0x3f) | 0x80);
          u8_out[3] = (uint8_t)((u32 & 0x3f) | 0x80);
          return 4;
      }

      return 5;
  }
  // Reports how many UTF-8 bytes are required to encode a code point.
  //   u32 - code point to measure
  // Returns 1 to 4, or 5 when the value does not fit in a four-byte sequence.
  // The four-byte form carries 21 payload bits, so its upper limit is 0x1fffff.
  int utf8_bytes_needed(uint32_t u32) {
      if(u32 < 0x80) return 1;      // one byte
      if(u32 < 0x800) return 2;     // two bytes
      if(u32 < 0x10000) return 3;   // three bytes
      if(u32 <= 0x1fffff) return 4; // four bytes, up to the physical limit of the form
      return 5;
  }
  // Returns the number of characters in a NUL-terminated utf8 string.
  // Counting stops early (the partial character is not counted) when a
  // multi-byte sequence is cut short by the terminator. A byte that is not a
  // valid lead byte (a stray continuation byte, or 0xf8 and above) is counted
  // as one character and skipped, so the scan always makes progress.
  size_t charlen8(const char* u8) {
      const uint8_t* s = (const uint8_t*)u8;
      size_t len = 0;

      while(*s) {
          if((s[0] & 0x80) == 0x00) { // single byte
              s += 1;
          }
          else if((s[0] & 0xe0) == 0xc0) { // two bytes
              if(s[1] == 0) break; // malformed sequence
              s += 2;
          }
          else if((s[0] & 0xf0) == 0xe0) { // three bytes
              if(s[1] == 0 || s[2] == 0) break; // malformed sequence
              s += 3;
          }
          else if((s[0] & 0xf8) == 0xf0) { // four bytes
              if(s[1] == 0 || s[2] == 0 || s[3] == 0) break; // malformed sequence
              s += 4;
          }
          else {
              // not a recognised lead byte; eat one byte and hope for recovery
              s += 1;
          }

          len++;
      }

      return len;
  }
  // Scans a NUL-terminated byte string for any byte with the high bit set.
  // Returns 1 as soon as one is found (the string contains multi-byte
  // sequences), 0 if every byte before the terminator is plain 7-bit.
  int utf8_has_multibyte(const uint8_t* u8) {
      for(const uint8_t* p = u8; *p != 0; p++) {
          if(*p >= 0x80) return 1;
      }
      return 0;
  }
  // Byte length of the single utf8 character starting at u8.
  // The lead byte selects the expected length (1 to 4). Each following byte is
  // then checked to be a continuation byte (10xxxxxx); a NUL fails that check
  // as well. If byte k is the first one that fails, k is returned, i.e. the
  // number of bytes that were acceptable. A lead byte outside the known
  // encodings yields 1 so callers can step over it one byte at a time.
  int utf8_char_size(const char* u8) {
      const unsigned char lead = (unsigned char)u8[0];
      int expected;

      if(lead < 0x80) return 1; // single byte

      if((lead & 0xe0) == 0xc0) expected = 2;
      else if((lead & 0xf0) == 0xe0) expected = 3;
      else if((lead & 0xf8) == 0xf0) expected = 4;
      else return 1; // not a known lead byte

      for(int k = 1; k < expected; k++) {
          if(((unsigned char)u8[k] & 0xc0) != 0x80) return k;
      }

      return expected;
  }
  // Appends at most clen utf8 characters from src to the end of dst.
  // Character widths come from utf8_char_size(). Copying stops early at the
  // end of src. dst is always NUL-terminated afterwards and is returned.
  // No destination size is passed in, so the caller must make sure dst has
  // room for the appended bytes plus the terminator.
  char* strkcat8(char* dst, const char* src, size_t clen) {
      const uint8_t* in = (const uint8_t*)src;
      uint8_t* out = (uint8_t*)dst + strlen(dst); // write position: the old terminator
      size_t pos = 0; // byte offset shared by in and out

      for(size_t done = 0; done < clen; done++) {
          int width = utf8_char_size((const char*)(in + pos));

          while(width-- > 0) {
              if(in[pos] == 0) {
                  // ran out of source before the character count was reached
                  out[pos] = 0;
                  return dst;
              }
              out[pos] = in[pos];
              pos++;
          }
      }

      out[pos] = 0;
      return dst;
  }
  // Finds the first occurrence of code point c32 in the utf8 string s.
  // The code point is encoded with utf32_to_utf8(); a one-byte result is
  // looked up with strchr(), a two- to four-byte result with strstr().
  // returns NULL on not found or if codepoint is invalid
  char* strchr8(const char* s, uint32_t c32) {
      uint8_t encoded[5];
      int width = utf32_to_utf8(c32, encoded);

      if(width == 1) return strchr(s, c32);
      if(width < 2 || width > 4) return NULL;

      encoded[width] = 0; // make the encoded bytes a C string for strstr
      return strstr(s, (char*)encoded);
  }
  // Finds the last occurrence of code point c32 in the utf8 string s.
  // The code point is encoded with utf32_to_utf8() and the string is scanned
  // front to back, remembering the most recent full match.
  // returns NULL on not found or if codepoint is invalid
  char* strrchr8(const char* s, uint32_t c32) {
      uint8_t c8[5];
      const uint8_t* us = (const uint8_t*)s;
      const uint8_t* p = NULL;

      int sz = utf32_to_utf8(c32, c8);
      // anything outside 1..4 means nothing was encoded, so c8 holds no usable bytes
      if(sz < 1 || sz > 4) return NULL;

      for(; *us; us++) {
          // The encoded bytes of a non-zero code point are never 0, so a
          // mismatch is hit at the terminator at the latest and the
          // comparison cannot run past the end of s.
          int n = 0;
          while(n < sz && us[n] == c8[n]) n++;

          if(n == sz) p = us; // full match; keep the latest starting spot
      }

      return (char*)p;
  }
  // Finds the first occurrence in s of the utf8 character that c points to.
  // c is a pointer to a single utf8 character, up to 4 bytes; its width is
  // taken from utf8_char_size(c), and only that many bytes are searched for.
  // Returns a pointer to the match inside s, or NULL when strstr() finds none.
  char* strchr8p(const char* s, const char* c) {
      uint8_t needle[5];
      int width = utf8_char_size(c);

      memcpy(needle, c, (size_t)width);
      needle[width] = 0;

      return strstr(s, (char*)needle);
  }
  // Finds the last occurrence in s of the utf8 character that c points to.
  // c is a pointer to a single utf8 character, up to 4 bytes; its width is
  // taken from utf8_char_size(c).
  // Returns a pointer to the start of the last match, or NULL on not found.
  char* strrchr8p(const char* s, const char* c) {
      // Compare as unsigned bytes on both sides. Comparing a uint8_t with a
      // plain char breaks for bytes >= 0x80 where char is signed.
      const uint8_t* us = (const uint8_t*)s;
      const uint8_t* uc = (const uint8_t*)c;
      const uint8_t* p = NULL;

      int sz = utf8_char_size(c);

      for(; *us; us++) {
          // Bytes 1..sz-1 of c were verified as continuation bytes by
          // utf8_char_size(), so they are non-zero and the comparison stops
          // at the terminator of s at the latest.
          int n = 0;
          while(n < sz && us[n] == uc[n]) n++;

          if(n == sz) p = us; // full match; keep the latest starting spot
      }

      return (char*)p;
  }
  // Finds the first occurrence of code point c32 within the first blen bytes
  // of src.
  //   src  - buffer to search; exactly blen bytes are considered and a NUL
  //          byte does not end the search
  //   c32  - code point to look for, encoded with utf32_to_utf8()
  //   blen - number of bytes of src that may be read
  // Returns a pointer to the start of the match, or NULL when the code point
  // is invalid or no complete match lies inside the first blen bytes.
  char* strnchr8(const char* src, uint32_t c32, size_t blen) {
      uint8_t c8[5];
      const uint8_t* us = (const uint8_t*)src;

      int sz = utf32_to_utf8(c32, c8);
      // anything outside 1..4 means nothing was encoded
      if(sz < 1 || sz > 4) return NULL;

      size_t need = (size_t)sz;
      if(need > blen) return NULL;

      // the bound is checked before any byte is read, so nothing at or past us[blen] is touched
      for(size_t b = 0; b + need <= blen; b++) {
          if(memcmp(us + b, c8, need) == 0) return (char*)(us + b);
      }

      return NULL;
  }
  // Copies at most clen utf8 characters from src into dst.
  // Character widths come from utf8_char_size(). Copying stops early at the
  // end of src. dst is always NUL-terminated afterwards and is returned.
  // No destination size is passed in, so the caller must make sure dst has
  // room for the copied bytes plus the terminator.
  char* strkcpy8(char* dst, const char* src, size_t clen) {
      const uint8_t* in = (const uint8_t*)src;
      uint8_t* out = (uint8_t*)dst;
      size_t pos = 0; // byte offset shared by in and out

      for(size_t done = 0; done < clen; done++) {
          int width = utf8_char_size((const char*)(in + pos));

          while(width-- > 0) {
              if(in[pos] == 0) {
                  // ran out of source before the character count was reached
                  out[pos] = 0;
                  return dst;
              }
              out[pos] = in[pos];
              pos++;
          }
      }

      out[pos] = 0;
      return dst;
  }
  // Length of a 0-terminated UTF-32 string
  // in bytes, not including (4-byte) null terminator
  size_t strlen32(const uint32_t* s) {
      const uint32_t* e = s;
      while(*e) e++;
      // pointer subtraction counts elements; scale by the element size to get bytes
      return (size_t)(e - s) * sizeof(*s);
  }
  // Length of a 0-terminated UTF-32 string
  // in characters, not including the null terminator.
  // Counts the elements directly so the result does not depend on the unit
  // that any other length helper reports.
  size_t charlen32(const uint32_t* s) {
      size_t count = 0;
      while(s[count]) count++;
      return count;
  }
  // Appends the 0-terminated UTF-32 string src to the end of dst.
  // dst must already be 0-terminated and large enough for the result plus
  // the terminator; no size is checked here. Returns dst.
  uint32_t* strcat32(uint32_t* dst, const uint32_t* src) {
      size_t at = 0;
      while(dst[at] != 0) at++; // find the current end of dst

      for(size_t k = 0; src[k] != 0; k++) {
          dst[at++] = src[k];
      }

      dst[at] = 0;
      return dst;
  }
  // Appends at most len elements of the 0-terminated UTF-32 string src to
  // the end of dst, then writes a terminating 0. dst must already be
  // 0-terminated and have room for the appended elements plus the
  // terminator; no size is checked here. Returns dst.
  uint32_t* strncat32(uint32_t* dst, const uint32_t* src, size_t len) {
      size_t at = 0;
      while(dst[at] != 0) at++; // find the current end of dst

      size_t k = 0;
      for(; src[k] != 0 && k < len; k++) {
          dst[at + k] = src[k];
      }

      dst[at + k] = 0;
      return dst;
  }
  // Copies the 0-terminated UTF-32 string src, terminator included, into dst.
  // dst must be large enough; no size is checked here. Returns dst.
  uint32_t* strcpy32(uint32_t* dst, const uint32_t* src) {
      size_t k = 0;

      for(; src[k] != 0; k++) {
          dst[k] = src[k];
      }

      dst[k] = 0;
      return dst;
  }
  // Copies at most len elements of the 0-terminated UTF-32 string src into
  // dst and then always writes a terminating 0, so up to len + 1 elements of
  // dst are written. dst must be large enough; no size is checked here.
  // Returns dst.
  uint32_t* strncpy32(uint32_t* dst, const uint32_t* src, size_t len) {
      size_t k = 0;

      for(; src[k] != 0 && k < len; k++) {
          dst[k] = src[k];
      }

      dst[k] = 0;
      return dst;
  }
  // Finds the first element equal to c in the 0-terminated UTF-32 string s.
  // The terminator itself is never matched, so searching for 0 yields NULL.
  // Returns a pointer to the element, or NULL if it is not present.
  uint32_t* strchr32(const uint32_t* s, uint32_t c) {
      while(*s != 0) {
          if(*s == c) return (uint32_t*)s;
          s++;
      }
      return NULL;
  }
  // Finds the last element equal to c in the 0-terminated UTF-32 string s.
  // The terminator itself is never matched, so searching for 0 yields NULL.
  // Returns a pointer to the element, or NULL if it is not present.
  uint32_t* strrchr32(const uint32_t* s, uint32_t c) {
      const uint32_t* last = NULL;

      while(*s != 0) {
          if(*s == c) last = s; // keep overwriting; the final hit wins
          s++;
      }

      return (uint32_t*)last;
  }
  // Like strchr32, but never returns NULL: yields a pointer to the first
  // element equal to c, or to the terminating 0 if c does not occur in s.
  uint32_t* strchrnul32(uint32_t* s, uint32_t c) {
      while(*s != 0 && *s != c) {
          s++;
      }
      return s;
  }
  // Compares two 0-terminated UTF-32 strings element by element.
  // Returns -1 if a sorts before b, 1 if a sorts after b, and 0 if they are
  // equal. Elements are compared as unsigned 32-bit values; a string that is
  // a proper prefix of the other sorts first.
  int strcmp32(const uint32_t* a, const uint32_t* b) {
      size_t k = 0;

      while(a[k] != 0 || b[k] != 0) {
          if(a[k] != b[k]) {
              return (a[k] < b[k]) ? -1 : 1;
          }
          k++;
      }

      return 0;
  }
  // Compares at most len elements of two 0-terminated UTF-32 strings.
  // Returns -1 if a sorts before b within that window, 1 if it sorts after,
  // and 0 if the first len elements (or both whole strings) are equal.
  // Elements are compared as unsigned 32-bit values.
  int strncmp32(const uint32_t* a, const uint32_t* b, size_t len) {
      for(size_t k = 0; (a[k] != 0 || b[k] != 0) && k < len; k++) {
          if(a[k] != b[k]) {
              return (a[k] < b[k]) ? -1 : 1;
          }
      }

      return 0;
  }
  // Length, in elements, of the leading run of s made up entirely of
  // elements that appear in accept. Both arguments are 0-terminated UTF-32
  // strings.
  size_t strspn32(const uint32_t* s, const uint32_t* accept) {
      size_t run = 0;

      while(s[run] != 0) {
          const uint32_t* a = accept;
          while(*a != 0 && *a != s[run]) a++;

          // s[run] is non-zero here, so landing on the terminator means "not in accept"
          if(*a == 0) break;

          run++;
      }

      return run;
  }
  // Length, in elements, of the leading run of s that contains none of the
  // elements listed in accept. Both arguments are 0-terminated UTF-32
  // strings. If no listed element occurs, the full length of s is returned.
  size_t strcspn32(const uint32_t* s, const uint32_t* accept) {
      size_t run = 0;

      while(s[run] != 0) {
          const uint32_t* a = accept;
          while(*a != 0 && *a != s[run]) a++;

          // stopping before the terminator means s[run] is one of the listed elements
          if(*a != 0) break;

          run++;
      }

      return run;
  }
  // Duplicates a 0-terminated UTF-32 string into newly malloc'd memory.
  // Returns the copy (the caller frees it), or NULL if the allocation fails.
  // The length is counted here in elements, so the allocation size and the
  // copy size are computed in the same unit.
  uint32_t* strdup32(const uint32_t* const s) {
      size_t count = 0;
      while(s[count]) count++;

      size_t bytes = (count + 1) * sizeof(*s); // +1 for the null terminator
      uint32_t* o = malloc(bytes);
      if(!o) return NULL;

      memcpy(o, s, bytes);
      return o;
  }