utf.c 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /*
  2. * UTF
  3. */
  4. #include "utf.h"
  5. /*
  6. * These routines avoid using arithmetic operations, relying
  7. * on bit operations and tests for equality for the conversions.
  8. *
  9. * Arithmetic is used only in the book-keeping for the *n* versions.
  10. *
  11. */
  12. /*
  13. * convnutf() returns the number of bytes of valid utf
  14. * decoded at p, or 0 if p does not point to a valid utf
  15. * encoding. It additionally puts the decoded value into
  16. * the integer reference z, or 0 if utf is not valid.
  17. *
  18. * If the sequence exceeds n bytes, and hasn't yet validated,
  19. * 0 is returned, and z is set to the number of missing bytes.
  20. *
  21. * convnucode() is the inverse operation, taking a unicode
  22. * codepoint c, and storing the utf8 representation in (at most)
  23. * n bytes of p.
  24. *
  25. * The number of bytes used is the return value. If the sequence
  26. * does not fit in the buffer, a negative value is returned, representing
  27. * the number of bytes short of a valid sequence the buffer is.
  28. *
  29. */
  30. int
  31. convnutf(byte *p, int *z, int n)
  32. {
  33. int c,u,r;
  34. r=1;
  35. if(r>n)
  36. goto prem;
  37. u=c=*p++;
  38. if (0==(c & 0200))
  39. goto valid; /* 0bbbbbbb : valid ascii */
  40. u&=~0200;
  41. if (0==((c<<=1) & 0200))
  42. goto invalid; /* 10bbbbbb : error (aux byte in worng place) */
  43. u&=~0100;
  44. r++;
  45. if (0==((c<<=1) & 0200))
  46. goto two; /* 110bbbbb : start of two-byte sequence */
  47. u&=~040;
  48. r++;
  49. if (0==((c<<=1) & 0200))
  50. goto three; /* 1110bbbb : start of three-byte sequence */
  51. u&=~020;
  52. r++;
  53. if (0==((c<<=1) & 0200))
  54. goto four; /* 11110bbb : start of four-byte sequence */
  55. goto invalid; /* 11111bbb : invalid range (unless we want five..) */
  56. four:
  57. if(r>n)
  58. goto prem;
  59. if (0==((c=*p++) & 0200))
  60. goto invalid; /* 0bbbbbbb : ascii character in wrong place */
  61. u <<= 6;
  62. u|=(c&~0200);
  63. if (0==((c<<=1) & 0200))
  64. goto three; /* 10bbbbbb : aux byte in expected place */
  65. goto invalid; /* 11bbbbbb : start byte in wrong place */
  66. three:
  67. if(r>n)
  68. goto prem;
  69. if (0==((c=*p++) & 0200))
  70. goto invalid; /* 0bbbbbbb : ascii character in wrong place */
  71. u <<= 6;
  72. u|=(c&~0200);
  73. if (0==((c<<=1) & 0200))
  74. goto two; /* 10bbbbbb : aux byte in expected place */
  75. goto invalid; /* 11bbbbbb : start byte in wrong place */
  76. two:
  77. if(r>n)
  78. goto prem;
  79. if (0==((c=*p++) & 0200))
  80. goto invalid; /* 0bbbbbbb : ascii character in wrong place */
  81. u <<= 6;
  82. u|=(c&~0200);
  83. if (0==((c<<=1) & 0200))
  84. goto valid; /* 10bbbbbb : final aux byte */
  85. goto invalid; /* 11bbbbbb : start byte in wrong place */
  86. prem:
  87. *z=(r-n);
  88. return 0;
  89. invalid:
  90. *z=0;
  91. return 0;
  92. valid:
  93. *z=u;
  94. return r;
  95. }
  96. int
  97. convnucode(int c, byte *p, int n)
  98. {
  99. *p='\0';
  100. if(0==(c>>7))
  101. goto ascii;
  102. if(0==(c>>11))
  103. goto twobytes;
  104. if(0==(c>>16))
  105. goto threebytes;
  106. if(0==(c>>21))
  107. goto fourbytes;
  108. /* too big */
  109. return -5;
  110. ascii:
  111. if(n<1)
  112. return n-1;
  113. p+=1;
  114. *p--='\0';
  115. *p=(byte)c;
  116. return 1;
  117. twobytes:
  118. if(n<2)
  119. return n-2;
  120. p+=2;
  121. *p--='\0';
  122. *p--=(byte)((0200)|(c&077));
  123. *p=(byte)((0300)|((c>>6)&077));
  124. return 2;
  125. threebytes:
  126. if(n<3)
  127. return n-3;
  128. p+=3;
  129. *p--='\0';
  130. *p--=(byte)((0200)|(c&077));
  131. *p--=(byte)((0200)|((c>>=6)&077));
  132. *p=(byte)((0340)|((c>>6)&077));
  133. return 3;
  134. fourbytes:
  135. if(n<4)
  136. return n-4;
  137. p+=4;
  138. *p--='\0';
  139. *p--=(byte)((0200)|(c&077));
  140. *p--=(byte)((0200)|((c>>=6)&077));
  141. *p--=(byte)((0200)|((c>>=6)&077));
  142. *p=(byte)((0360)|((c>>6)&077));
  143. return 4;
  144. }
  145. /* dangerous! ...
  146. byte *
  147. utf8string(int *u, byte *b)
  148. {
  149. while(*u)
  150. b+=convucode(*u++,b);
  151. *b++='\0';
  152. return b;
  153. }
  154. int *
  155. ucodestring(byte *b, int *u)
  156. {
  157. while(*b)
  158. b+=convutf(b,u++);
  159. *u++='\0';
  160. return u;
  161. }
  162. ... */
  163. int
  164. utf8nstring(int *u, byte *b, unsigned int n)
  165. {
  166. byte *a;
  167. a=b;
  168. while(n--&&*u)
  169. b+=convucode(*u++,b);
  170. *b='\0';
  171. return b-a;
  172. }
  173. int
  174. ucodenstring(byte *b, int *u, unsigned int n)
  175. {
  176. int* v;
  177. v=u;
  178. while(n--&&*b)
  179. b+=convutf(b,u++);
  180. *u='\0';
  181. return u-v;
  182. }