123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- //@+leo-ver=5-thin
- //@+node:caminhante.20200309141148.1: * @file ./utf8.c
- //@@tabwidth -2
- #include "utf8.h"
- //@+others
- //@+node:caminhante.20200309141148.3: ** static ascii_char
/* Tells whether the first byte of `source` is an accepted single-byte
 * character: horizontal tab (0x09), line feed (0x0A), carriage return (0x0D),
 * or a printable ASCII byte in the range 0x20..0x7E.
 * Every other byte, including NUL and DEL (0x7F), is rejected.
 * Only source[0] is read. */
static bool ascii_char (const char *source) {
  const unsigned char byte = (unsigned char)source[0];
  if (byte == 0x09 || byte == 0x0A || byte == 0x0D) {
    return true;
  }
  return byte >= 0x20 && byte <= 0x7E;
}
- //@+node:caminhante.20231212193346.1: ** static utf8_multibyte_ending
/* Tells whether the first byte of `source` is a UTF-8 continuation byte,
 * i.e. lies in the range 0x80..0xBF. Only source[0] is read, and a NUL
 * byte is never accepted. */
static bool utf8_multibyte_ending (const char *source) {
  const unsigned char byte = (unsigned char)source[0];
  return byte >= 0x80 && byte <= 0xBF;
}
- //@+node:caminhante.20231212193009.1: ** static utf8_2bytes_notoverlong
/* Recognises a two-byte UTF-8 sequence at `source`: a lead byte in
 * 0xC2..0xDF followed by one continuation byte.
 * Lead bytes 0xC0 and 0xC1 are rejected; they could only start overlong
 * encodings. source[1] is read only after the lead byte has matched. */
static bool utf8_2bytes_notoverlong (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  if (lead < 0xC2 || lead > 0xDF) {
    return false;
  }
  return utf8_multibyte_ending(source + 1);
}
- //@+node:caminhante.20231212193621.1: ** static utf8_3bytes_notoverlong
/* Recognises a three-byte UTF-8 sequence whose lead byte is 0xE0.
 * The second byte must lie in 0xA0..0xBF (0x80..0x9F would be an overlong
 * encoding) and the third must be a continuation byte.
 * Bytes are examined left to right; a later byte is read only once every
 * earlier byte has matched. */
static bool utf8_3bytes_notoverlong (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  if (lead != 0xE0) {
    return false;
  }
  const unsigned char second = (unsigned char)source[1];
  if (second < 0xA0 || second > 0xBF) {
    return false;
  }
  return utf8_multibyte_ending(source + 2);
}
- //@+node:caminhante.20231212193939.1: ** static utf8_3bytes
/* Recognises a three-byte UTF-8 sequence whose lead byte is in 0xE1..0xEC,
 * or is 0xEE or 0xEF, followed by two continuation bytes.
 * Lead bytes 0xE0 and 0xED are not accepted here.
 * Bytes are examined left to right; a later byte is read only once every
 * earlier byte has matched. */
static bool utf8_3bytes (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  const bool lead_ok = (lead >= 0xE1 && lead <= 0xEC) || lead == 0xEE || lead == 0xEF;
  if (!lead_ok) {
    return false;
  }
  if (!utf8_multibyte_ending(source + 1)) {
    return false;
  }
  return utf8_multibyte_ending(source + 2);
}
- //@+node:caminhante.20231212194252.1: ** static utf8_3bytes_notsurrogate
/* Recognises a three-byte UTF-8 sequence whose lead byte is 0xED.
 * The second byte must lie in 0x80..0x9F (0xA0..0xBF would encode the
 * UTF-16 surrogate range U+D800..U+DFFF) and the third must be a
 * continuation byte.
 * Bytes are examined left to right; a later byte is read only once every
 * earlier byte has matched. */
static bool utf8_3bytes_notsurrogate (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  if (lead != 0xED) {
    return false;
  }
  const unsigned char second = (unsigned char)source[1];
  if (second < 0x80 || second > 0x9F) {
    return false;
  }
  return utf8_multibyte_ending(source + 2);
}
- //@+node:caminhante.20231212194457.1: ** static utf8_4bytes_planes1to3
/* Recognises a four-byte UTF-8 sequence whose lead byte is 0xF0.
 * The second byte must lie in 0x90..0xBF (0x80..0x8F would be an overlong
 * encoding); the third and fourth must be continuation bytes.
 * Bytes are examined left to right; a later byte is read only once every
 * earlier byte has matched. */
static bool utf8_4bytes_planes1to3 (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  if (lead != 0xF0) {
    return false;
  }
  const unsigned char second = (unsigned char)source[1];
  if (second < 0x90 || second > 0xBF) {
    return false;
  }
  if (!utf8_multibyte_ending(source + 2)) {
    return false;
  }
  return utf8_multibyte_ending(source + 3);
}
- //@+node:caminhante.20231212194735.1: ** static utf8_4bytes_planes4to15
/* Recognises a four-byte UTF-8 sequence whose lead byte is in 0xF1..0xF3,
 * followed by three continuation bytes.
 * Bytes are examined left to right; a later byte is read only once every
 * earlier byte has matched. */
static bool utf8_4bytes_planes4to15 (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  if (lead < 0xF1 || lead > 0xF3) {
    return false;
  }
  for (int offset = 1; offset <= 3; offset++) {
    if (!utf8_multibyte_ending(source + offset)) {
      return false;
    }
  }
  return true;
}
- //@+node:caminhante.20231212195030.1: ** static utf8_4bytes_plane16
/* Recognises a four-byte UTF-8 sequence whose lead byte is 0xF4.
 * The second byte must lie in 0x80..0x8F (anything higher would encode a
 * code point above U+10FFFF); the third and fourth must be continuation
 * bytes.
 * Bytes are examined left to right; a later byte is read only once every
 * earlier byte has matched. */
static bool utf8_4bytes_plane16 (const char *source) {
  const unsigned char lead = (unsigned char)source[0];
  if (lead != 0xF4) {
    return false;
  }
  const unsigned char second = (unsigned char)source[1];
  if (second < 0x80 || second > 0x8F) {
    return false;
  }
  if (!utf8_multibyte_ending(source + 2)) {
    return false;
  }
  return utf8_multibyte_ending(source + 3);
}
- //@+node:caminhante.20200309141148.7: ** uchar_valid
/* Tells whether `source` starts with one accepted character: either an
 * accepted single-byte character or a well-formed two-, three- or four-byte
 * UTF-8 sequence. The classifiers are tried from the shortest encoding to
 * the longest, and the first match wins.
 * Returns false for a NUL byte and for any byte sequence no classifier
 * accepts. */
bool uchar_valid (char* source) {
  /* one- and two-byte forms */
  if (ascii_char(source) || utf8_2bytes_notoverlong(source)) {
    return true;
  }
  /* three-byte forms */
  if (utf8_3bytes_notoverlong(source) || utf8_3bytes(source) || utf8_3bytes_notsurrogate(source)) {
    return true;
  }
  /* four-byte forms */
  return utf8_4bytes_planes1to3(source)
      || utf8_4bytes_planes4to15(source)
      || utf8_4bytes_plane16(source);
}
- //@+node:caminhante.20200309141148.8: ** uchar_bytes
/* Returns the encoded length, in bytes (1 to 4), of the character that
 * starts at `source`.
 * Returns 0 when no classifier accepts the bytes at `source`; that includes
 * a NUL byte, so 0 also marks the end of a string. */
size_t uchar_bytes (char* source) {
  size_t width = 0;
  if (ascii_char(source)) {
    width = 1;
  } else if (utf8_2bytes_notoverlong(source)) {
    width = 2;
  } else if (utf8_3bytes_notoverlong(source)
          || utf8_3bytes(source)
          || utf8_3bytes_notsurrogate(source)) {
    width = 3;
  } else if (utf8_4bytes_planes1to3(source)
          || utf8_4bytes_planes4to15(source)
          || utf8_4bytes_plane16(source)) {
    width = 4;
  }
  return width;
}
- //@+node:caminhante.20200309141148.9: ** ustring_length
/* Counts the characters at the start of `source`.
 * Scanning stops at the first position where uchar_bytes() returns 0, so
 * the count covers only the leading run of accepted characters. */
size_t ustring_length (char* source) {
  size_t count = 0;
  char *cursor = source;
  size_t step;
  while ((step = uchar_bytes(cursor)) != 0) {
    cursor += step;
    count++;
  }
  return count;
}
- //@+node:caminhante.20200309141148.10: ** ustring_bytes
- size_t ustring_bytes (char* source) {
- size_t a, p=0;
- do {
- a = uchar_bytes(source+p);
- p += a;
- } while (a);
- return p;
- }
- //@+node:caminhante.20200309141148.11: ** cstring_bytes
- size_t cstring_bytes (struct uchar* source) {
- size_t a = 0;
- while(source->bytes) {
- a += source->bytes;
- source++;
- }
- return a;
- }
- //@+node:caminhante.20200309141148.12: ** next_uchar
- struct uchar next_uchar (char* source) {
- size_t bytes = uchar_bytes(source);
- if (bytes == 0) {return (struct uchar){0};}
- struct uchar uc = (struct uchar){
- .bytes=bytes,
- .chars[0]=source[0],
- .chars[1]=(bytes>=2 ? source[1] : 0),
- .chars[2]=(bytes>=3 ? source[2] : 0),
- .chars[3]=(bytes==4 ? source[3] : 0)
- };
- return uc;
- }
- //@+node:caminhante.20200309141148.13: ** c_to_ustring
- void c_to_ustring (char* source, struct uchar* destination) {
- char *p = source;
- struct uchar uc, *us = destination;
- do {
- uc = next_uchar(p);
- p += uc.bytes;
- *us = uc;
- us ++;
- } while (uc.bytes);
- }
- //@+node:caminhante.20200309141148.14: ** u_to_cstring
- void u_to_cstring (struct uchar* source, char* destination) {
- struct uchar *us = source;
- char *p = destination;
- while (us->bytes) {
- memcpy(p,us->chars,us->bytes);
- p += us->bytes;
- us ++;
- }
- p[0] = '\0';
- }
- //@+node:caminhante.20200309141148.15: ** uchar_puts
- size_t uchar_puts (int fileno, struct uchar *uc) {
- return write(fileno, uc->chars, uc->bytes);
- }
- //@+node:caminhante.20200309141148.16: ** ustring_puts
- size_t ustring_puts (int fileno, struct uchar *ustring) {
- size_t written = 0;
- while (ustring->bytes != 0) {
- size_t a = uchar_puts(fileno,ustring);
- written +=a;
- if (a < ustring->bytes) {break;}
- ustring++;
- }
- return written;
- }
- //@-others
- //@-leo
|