utf8.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. //@+leo-ver=5-thin
  2. //@+node:caminhante.20200309141148.1: * @file ./utf8.c
  3. //@@tabwidth -2
  4. #include "utf8.h"
  5. //@+others
  6. //@+node:caminhante.20200309141148.3: ** static ascii_char
  7. static bool ascii_char (const char *source) {
  8. switch (source[0]) {
  9. case '\x09': case '\x0A': case '\x0D': return true;
  10. case '\x20'...'\x7E': return true;
  11. default: return false; }
  12. }
  13. //@+node:caminhante.20231212193346.1: ** static utf8_multibyte_ending
  14. static bool utf8_multibyte_ending (const char *source) {
  15. switch (source[0]) {
  16. case '\x80'...'\xBF': return true;
  17. default: return false; }
  18. }
  19. //@+node:caminhante.20231212193009.1: ** static utf8_2bytes_notoverlong
  20. static bool utf8_2bytes_notoverlong (const char *source) {
  21. switch (source[0]) {
  22. case '\xC2'...'\xDF': return utf8_multibyte_ending(source+1);
  23. default: return false; }
  24. }
  25. //@+node:caminhante.20231212193621.1: ** static utf8_3bytes_notoverlong
  26. static bool utf8_3bytes_notoverlong (const char *source) {
  27. switch (source[0]) {
  28. case '\xE0':
  29. switch (source[1]) {
  30. case '\xA0'...'\xBF': return utf8_multibyte_ending(source+2);
  31. default: return false; }
  32. default: return false; }
  33. }
  34. //@+node:caminhante.20231212193939.1: ** static utf8_3bytes
  35. static bool utf8_3bytes (const char *source) {
  36. switch (source[0]) {
  37. case '\xE1'...'\xEC': case '\xEE': case '\xEF':
  38. return utf8_multibyte_ending(source+1) && utf8_multibyte_ending(source+2);
  39. default: return false; }
  40. }
  41. //@+node:caminhante.20231212194252.1: ** static utf8_3bytes_notsurrogate
  42. static bool utf8_3bytes_notsurrogate (const char *source) {
  43. switch (source[0]) {
  44. case '\xED':
  45. switch (source[1]) {
  46. case '\x80'...'\x9F': return utf8_multibyte_ending(source+2);
  47. default: return false; }
  48. default: return false; }
  49. }
  50. //@+node:caminhante.20231212194457.1: ** static utf8_4bytes_planes1to3
  51. static bool utf8_4bytes_planes1to3 (const char *source) {
  52. switch (source[0]) {
  53. case '\xF0':
  54. switch (source[1]) {
  55. case '\x90'...'\xBF':
  56. return utf8_multibyte_ending(source+2) && utf8_multibyte_ending(source+3);
  57. default: return false; }
  58. default: return false; }
  59. }
  60. //@+node:caminhante.20231212194735.1: ** static utf8_4bytes_planes4to15
  61. static bool utf8_4bytes_planes4to15 (const char *source) {
  62. switch (source[0]) {
  63. case '\xF1'...'\xF3':
  64. return utf8_multibyte_ending(source+1) && utf8_multibyte_ending(source+2) && utf8_multibyte_ending(source+3);
  65. default: return false; }
  66. }
  67. //@+node:caminhante.20231212195030.1: ** static utf8_4bytes_plane16
  68. static bool utf8_4bytes_plane16 (const char *source) {
  69. switch (source[0]) {
  70. case '\xF4':
  71. switch (source[1]) {
  72. case '\x80'...'\x8F':
  73. return utf8_multibyte_ending(source+2) && utf8_multibyte_ending(source+3);
  74. default: return false; }
  75. default: return false; }
  76. }
  77. //@+node:caminhante.20200309141148.7: ** uchar_valid
  78. bool uchar_valid (char* source) {
  79. return ascii_char(source) ||
  80. utf8_2bytes_notoverlong(source) ||
  81. utf8_3bytes_notoverlong(source) || utf8_3bytes(source) || utf8_3bytes_notsurrogate(source) ||
  82. utf8_4bytes_planes1to3(source) || utf8_4bytes_planes4to15(source) || utf8_4bytes_plane16(source);
  83. }
  84. //@+node:caminhante.20200309141148.8: ** uchar_bytes
  85. size_t uchar_bytes (char* source) {
  86. if (ascii_char(source)) {return 1;}
  87. if (utf8_2bytes_notoverlong(source)) {return 2;}
  88. if (utf8_3bytes_notoverlong(source) || utf8_3bytes(source) || utf8_3bytes_notsurrogate(source)) {return 3;}
  89. if (utf8_4bytes_planes1to3(source) || utf8_4bytes_planes4to15(source) || utf8_4bytes_plane16(source)) {return 4;}
  90. return 0;
  91. }
  92. //@+node:caminhante.20200309141148.9: ** ustring_length
  93. size_t ustring_length (char* source) {
  94. size_t length = 0, a, p=0;
  95. do {
  96. a = uchar_bytes(source+p);
  97. p += a;
  98. if (a) {length++;}
  99. } while (a);
  100. return length;
  101. }
  102. //@+node:caminhante.20200309141148.10: ** ustring_bytes
  103. size_t ustring_bytes (char* source) {
  104. size_t a, p=0;
  105. do {
  106. a = uchar_bytes(source+p);
  107. p += a;
  108. } while (a);
  109. return p;
  110. }
  111. //@+node:caminhante.20200309141148.11: ** cstring_bytes
  112. size_t cstring_bytes (struct uchar* source) {
  113. size_t a = 0;
  114. while(source->bytes) {
  115. a += source->bytes;
  116. source++;
  117. }
  118. return a;
  119. }
  120. //@+node:caminhante.20200309141148.12: ** next_uchar
  121. struct uchar next_uchar (char* source) {
  122. size_t bytes = uchar_bytes(source);
  123. if (bytes == 0) {return (struct uchar){0};}
  124. struct uchar uc = (struct uchar){
  125. .bytes=bytes,
  126. .chars[0]=source[0],
  127. .chars[1]=(bytes>=2 ? source[1] : 0),
  128. .chars[2]=(bytes>=3 ? source[2] : 0),
  129. .chars[3]=(bytes==4 ? source[3] : 0)
  130. };
  131. return uc;
  132. }
  133. //@+node:caminhante.20200309141148.13: ** c_to_ustring
  134. void c_to_ustring (char* source, struct uchar* destination) {
  135. char *p = source;
  136. struct uchar uc, *us = destination;
  137. do {
  138. uc = next_uchar(p);
  139. p += uc.bytes;
  140. *us = uc;
  141. us ++;
  142. } while (uc.bytes);
  143. }
  144. //@+node:caminhante.20200309141148.14: ** u_to_cstring
  145. void u_to_cstring (struct uchar* source, char* destination) {
  146. struct uchar *us = source;
  147. char *p = destination;
  148. while (us->bytes) {
  149. memcpy(p,us->chars,us->bytes);
  150. p += us->bytes;
  151. us ++;
  152. }
  153. p[0] = '\0';
  154. }
  155. //@+node:caminhante.20200309141148.15: ** uchar_puts
  156. size_t uchar_puts (int fileno, struct uchar *uc) {
  157. return write(fileno, uc->chars, uc->bytes);
  158. }
  159. //@+node:caminhante.20200309141148.16: ** ustring_puts
  160. size_t ustring_puts (int fileno, struct uchar *ustring) {
  161. size_t written = 0;
  162. while (ustring->bytes != 0) {
  163. size_t a = uchar_puts(fileno,ustring);
  164. written +=a;
  165. if (a < ustring->bytes) {break;}
  166. ustring++;
  167. }
  168. return written;
  169. }
  170. //@-others
  171. //@-leo