fractal.arcana
/
sti
forked de yzziizzy/sti


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260
							#ifndef __sti__utf_h__
#define __sti__utf_h__

/////////////////////////////////////////////////////////////////////////
//
//         NOTE:
//
//  This is a basic, general utf8/32 string handling library.
//  It is written for speed and efficiency before "correctness".
//  It will pass through any characters it does not know about.
//  It is aimed at *reasonable* support for *most* modern languages.
//  It IS NOT a full-featured, complete, correct unicode library. 
//  It DOES NOT handle every intricacy of every language, potentially
//     including your favorite language.
//  It makes no attempt, and never will, to support special features of
//     ancient, obscure, or unusual languages.
//  Character properties above FFFF are completely unsupported.
//  
/////////////////////////////////////////////////////////////////////////

// https://www.unicode.org/reports/tr44/#UnicodeData.txt


#include <string.h>

// utf8 functions

/*
Naming Convention
Mostly derived from string.h

str [n|k] r? case? {operation} [8|32] p? _inplace?

str: consistent prefix
n: limited by bytes
k: limited by codepoints (characters)
r: reverse; starts at the end of the string
case: case-insensitive
operation: see below
8: operates on utf8
32: operates on utf32
p: (utf8 only) accepts a pointer to a potentially multibyte encoded sequence instead of a 32-bit codepoint
_inplace: modifies the source buffer

Operations:
	cat: append onto existing string; strcat
	chr: search for a character; strchr
	cmp: compare; strcmp
	colwsp: collapse whitespace. All sequences of whitespace are converted into a single copy of the provided character
	cpy: copy; strcpy
	cspn: return length of inverse prefix substring (postfix for 'r' version)
	dup: duplicate into newly allocated memory; strdup
	dupa: duplicate onto the stack using alloca
	len: calculate length; strlen
	pbrk: search for the first of any of a set of characters
	rev: reverse the string
	spn: return length of prefix substring (postfix for 'r' version)
	skip: search for the first character not in a set of characters (strspn, but returns a pointer)
	str: search for a substring; strstr
	tolower: convert the string to lowercase
	toupper: convert the string to uppercase
	ltrim: remove a prefix of characters in a set from the beginning of the string
	rtrim: remove a suffix of characters in a set from the end of the string
	trim: remove a sequence characters in a set from the beginning and end of the string
	
	
Arguments:
	char* c8; a pointer to a single utf8 character, up to 4 bytes
	uint32_t c32; a unicode codepoint (utf32)
	size_t blen; operation limit, in bytes
	size_t clen; operation limit, in codepoints
	
"n" versions should not leave behind mutilated multi-byte characters if they hit the limit in the middle of one.
*/

// spn functions return the number of characters spanned


// TODO: char* strnchr(const char* s, int c, size_t n); // returns a pointer to the found char, or NULL
// TODO: size_t strncspn(const char* s, const char* reject, size_t n);
// TODO: char* strnpbrk(const char* s, const char* accept, size_t n);
// TODO: size_t strnspn(const char* s, const char* accept, size_t n);
// TODO: char* strrev(char* s); // returns s
// TODO: char* strnrev(const char* s, size_t n); // returns s

// TODO: char* strrpbrk(const char* s, const char* accept); // returns a pointer to the first match character
// TODO: size_t strrcspn(const char* s, const char* reject);
// TODO: size_t strrspn(const char* s, const char* accept); 
// TODO: char* strnrpbrk(const char* s, const char* accept, size_t n); // returns a pointer to the first match character
// TODO: size_t strnrcspn(const char* s, const char* reject, size_t n);
// TODO: size_t strnrspn(const char* s, const char* accept, size_t n);

// TODO: char* strltrim(char* s, const char* charset); // moves chars to left, returns s
// TODO: char* strnltrim(char* s, const char* charset, size_t n); // moves chars to left, returns s
// TODO: char* strrtrim(char* s, const char* charset); // moves the null byte to the left, returns s
// TODO: char* strtrim(char* s, const char* charset); // both above, returns s
// TODO: char* strcolwsp(char* s); // does not trim, returns s
// TODO: char* strncolwsp(char* s, size_t n); // does not trim, returns s

// TODO: char* strcolwsptrim(char* s); // also trims, returns s
// TODO: char* strcapwords(char* s); // capitalize the first letter following whitespace, and the beginning of the string, returns s
// TODO: char* strcapsentences(char* s); //  capitalize the first letter following terminal punctuation, and the beginning of the string, returns s
// TODO: char* strncapwords(char* s, size_t n) -- capitalize the first letter following whitespace, and the beginning of the string, returns s
// TODO: char* strncapsentences(char* s, size_t n) -- capitalize the first letter following terminal punctuation, and the beginning of the string, returns s


// replace functions should be optimized for very long src strings and short needle and replacement strings 
//  strreplace(char* src, char* dst, char* needle, char* replacement) // src != dst, replaces all occurrences of needle with replacement
//  strnreplace(char* src, char* dst, char* needle, char* replacement, size_t dst_alloc_len) // src != dst
//  strreplace_inplace(char* src, char* needle, char* replacement) // modifies src in-place. (must be done in reverse if strlen(replacement) > strlen(needle), counting needles on the forward pass)
//  strnreplace_inplace(char* src, char* needle, char* replacement, size_t src_alloc_len) // modifies src in-place. (must be done in reverse if strlen(replacement) > strlen(needle), counting needles on the forward pass)

// format numbers, bytes, money

// limited ('n') versions stop at the null byte or the limit, whichever is first

// edge cases:
//   null pointers result in segfault
//   empty strings
//   1-char strings
//   strings limited at 0
//   strings limited at 1
//   limited strings that hit the null byte


// returns the number of characters in a utf8 string
size_t charlen8(const char* u8);

// returns a new buffer, caller must free, or NULL when the string is malformed
uint32_t* utf8_to_utf32(uint8_t* u8, size_t* outLen);


// It is the caller's responsibility to provide at least 4 bytes of output memory
// returns the number of bytes used.
int utf32_to_utf8(uint32_t u32, uint8_t* u8_out);

// returns the number of bytes needed to encode this codepoint in utf8
int utf8_bytes_needed(uint32_t u32);

// byte length of a single utf8 character, with subsequent btye format verification and null checks
int utf8_char_size(const char* u8);

// returns 1 if there are multi-byte sequences, 0 otherwise
int utf8_has_multibyte(const uint8_t* u8);


// size_t strcspn8(const char* a, const char* b);
// char* strpbrk8(const char* a, const char* b);
// char* strtok8_r(char* s, const char* delim, char** saveptr);
// char* strnstr8(const char* a, const char* b, size_t len) { return strnstr(a, b, len); }


// char* strcasestr8(const char* a, const char* b);


// some normal string functions are utf8 safe. *8 versions are provided here so you don't have
//   to remember which ones are which


inline static size_t strlen8(const char* s) { return strlen(s); }

inline static char* strcat8(char* dst, const char* src) { return strcat(dst, src); }
inline static char* strncat8(char* dst, const char* src, size_t len) { return strncat(dst, src, len); }
              char* strkcat8(char* dst, const char* src, size_t clen);


// returns NULL on not found or if codepoint is invalid
char* strchr8(const char* s, uint32_t c32);

// c8 is a pointer to a single utf8 character, up to 4 bytes
// returns NULL on not found or if codepoint is invalid
char* strchr8p(const char* s, const char* c8);
char* strrchr8(const char* s, uint32_t c32);
char* strrchr8p(const char* s, const char* c8);

char* strnchr8(const char* s, uint32_t c32, size_t blen);


inline static char* strcpy8(char* dst, const char* src) { return strcpy(dst, src); }
inline static char* strncpy8(char* dst, const char* src, size_t blen) { return strncpy(dst, src, blen); }
              char* strkcpy8(char* dst, const char* src, size_t clen);


inline static int   strcmp8(const char* a, const char* b) { return strcmp(a, b); }
inline static int   strncmp8(const char* a, const char* b, size_t len) { return strncmp(a, b, len); }
inline static char* strstr8(const char* a, const char* b) { return strstr(a, b); }
inline static char* strdup8(const char* const s) { return strdup(s); }
inline static char* strndup8(const char* const s, size_t len) { return strndup(s, len); }
// inline static char* strtok_r8(const char* const s, size_t len) { return strndup(s, len); }


// strtok intentionally not implemented
inline static char* strtok8(char* s, const char* delim) {
	(void)s;
	(void)delim;
	*((int*)0) = 0xBadBad; // segfault on purpose because non-reentrant fns are bad. use strtok_r. 
	return NULL;
}


// why even bother with utf16? all downsides, no upsides.


// utf32 functions


// in bytes, not including (4-byte) null terminator
size_t strlen32(const uint32_t* const s);

// in characters
size_t charlen32(const uint32_t* const s);


uint32_t* strcat32(uint32_t* dst, const uint32_t* src);
uint32_t* strncat32(uint32_t* dst, const uint32_t* src, size_t len);
uint32_t* strcpy32(uint32_t* dst, const uint32_t* src);
uint32_t* strncpy32(uint32_t* dst, const uint32_t* src, size_t len);
uint32_t* strchr32(const uint32_t* s, uint32_t c);
uint32_t* strrchr32(const uint32_t* s, uint32_t c);
uint32_t* strchrnul32(uint32_t* s, uint32_t c);
int       strcmp32(const uint32_t* a, const uint32_t* b);
int       strncmp32(const uint32_t* a, const uint32_t* b, size_t len);
size_t    strspn32(const uint32_t* s, const uint32_t* accept);
size_t    strcspn32(const uint32_t* s, const uint32_t* reject);
// uint32_t* strpbrk32(const uint32_t* a, const uint32_t* b);
// uint32_t* strstr32(const uint32_t* a, const uint32_t* b);
// uint32_t* strnstr32(const uint32_t* a, const uint32_t* b);
uint32_t* strdup32(const uint32_t* const s);
// uint32_t* strndup32(const uint32_t* const s);
// uint32_t* strtok_r32(uint32_t* s, const uint32_t* delim, uint32_t** saveptr);


// strtok intentionally not implemented
inline static uint32_t* strtok32(uint32_t* s, const uint32_t* delim) {
	(void)s;
	(void)delim;
	*((int*)0) = 0xBadBad; // segfault on purpose because non-reentrant fns are bad. use strtok_r. 
	return NULL;
}


// http://www.unicode.org/reports/tr44/#UnicodeData.txt

// TODO:
//   toupper/tolower for whole strings
//   Capitalize words
//   Character class info
//   Strip emoji
//   Convert all emoji to random list of supplied emoji
//   printf
//   trim

#endif // __sti__utf_h__