utf8.h 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. //@+leo-ver=5-thin
  2. //@+node:caminhante.20200309141158.1: * @file ./utf8.h
  3. //@@tabwidth -2
  4. #ifndef _UTF8_H_
  5. #define _UTF8_H_
  6. #include <stdlib.h>
  7. #include <stdio.h>
  8. #include <stdint.h>
  9. #include <string.h>
  10. #include <stdbool.h>
  11. #include <unistd.h>
  12. //@+others
  13. //@+node:caminhante.20231212192422.1: ** struct uchar
  14. // A Unicode string is an array of `struct uchar` objects, terminated by a `struct uchar` whose `.bytes == 0`.
  15. // A `\0` byte isn't considered a valid Unicode character.
  16. struct uchar {
  17. uint8_t bytes; // byte length of the encoded character; 0 marks the end of a string
  18. union { // anonymous union: both members share the same 4 bytes of storage
  19. char chars[4]; // presumably the raw UTF-8 bytes, in order -- confirm against the implementation
  20. uint32_t ichars; // the same storage viewed as a single 32-bit integer
  21. };
  22. };
  23. //@+node:caminhante.20200309141158.2: ** uchar_valid
  24. // [ valid UTF-8 byte sequence at `source` => true | anything else => false ]
  25. bool uchar_valid (char* source);
  26. //@+node:caminhante.20200309141158.3: ** uchar_bytes
  27. // [ valid UTF-8 byte sequence at `source` =>
  28. // number of bytes that sequence occupies, between 1 and 4 | anything else => 0 ]
  29. size_t uchar_bytes (char* source);
  30. //@+node:caminhante.20200309141158.4: ** ustring_length
  31. // [ run of valid UTF-8 byte sequences starting at `source` =>
  32. // number of consecutive valid UTF-8 byte sequences, greater than or equal to 1 | anything else => 0 ]
  33. size_t ustring_length (char* source);
  34. //@+node:caminhante.20200309141158.5: ** ustring_bytes
  35. // [ run of valid UTF-8 byte sequences starting at `source` =>
  36. // total number of bytes occupied by the consecutive valid UTF-8 byte sequences,
  37. // greater than or equal to 1 | anything else => 0 ]
  38. size_t ustring_bytes (char* source);
  39. //@+node:caminhante.20200309141158.6: ** cstring_bytes
  40. // [ array of `struct uchar` objects holding UTF-8 byte sequences =>
  41. // number of bytes required to convert it to a conventional `\0`-terminated `char` array ]
  42. size_t cstring_bytes (struct uchar* source); // NOTE(review): not stated whether the count includes the trailing `\0` -- confirm
  43. //@+node:caminhante.20200309141158.7: ** next_uchar
  44. // [ valid UTF-8 byte sequence at `source` =>
  45. // a correctly initialized `struct uchar` object |
  46. // anything else => a `struct uchar` object with `.bytes == 0` ]
  47. struct uchar next_uchar (char* source);
  48. //@+node:caminhante.20200309141158.8: ** c_to_ustring
  49. // [ a `char` array containing potentially valid UTF-8 text =>
  50. // a `struct uchar` array holding all consecutive valid UTF-8 byte sequences is written at `*destination` ]
  51. // You need to calculate the required `struct uchar` array length beforehand,
  52. // with `ustring_length(source)`.
  53. void c_to_ustring (char* source, struct uchar* destination); // NOTE(review): confirm whether `destination` also needs room for the `.bytes == 0` terminator element
  54. //@+node:caminhante.20200309141158.9: ** u_to_cstring
  55. // [ a `struct uchar` array containing potentially valid UTF-8 text =>
  56. // a `\0`-terminated `char` array is written at `*destination` ]
  57. // You need to calculate the required `char` array length beforehand, by summing all
  58. // `struct uchar` `.bytes` members plus 1 (accounting for an extra `\0` byte at the end).
  59. void u_to_cstring (struct uchar* source, char* destination);
  60. //@+node:caminhante.20200309141158.10: ** uchar_puts
  61. // [ a `struct uchar` object =>
  62. // side effect: writes its UTF-8 byte sequence to the file descriptor `fileno`; returns the number of bytes written ]
  63. size_t uchar_puts (int fileno, struct uchar* uc); // NOTE(review): parameter name `fileno` is also the name of the POSIX fileno() function from <stdio.h>
  64. //@+node:caminhante.20200309141158.11: ** ustring_puts
  65. // [ array of `struct uchar` objects =>
  66. // side effect: writes all of its UTF-8 byte sequences to the file descriptor `fileno`; returns the number of bytes written ]
  67. size_t ustring_puts (int fileno, struct uchar* ustring);
  68. //@-others
  69. #endif
  70. //@-leo