wcwidth.c 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
  1. /*
  2. * This is an implementation of wcwidth() and wcswidth() (defined in
  3. * IEEE Std 1003.1-2001) for Unicode.
  4. *
  5. * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
  6. * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
  7. *
  8. * In fixed-width output devices, Latin characters all occupy a single
  9. * "cell" position of equal width, whereas ideographic CJK characters
  10. * occupy two such cells. Interoperability between terminal-line
  11. * applications and (teletype-style) character terminals using the
  12. * UTF-8 encoding requires agreement on which character should advance
  13. * the cursor by how many cell positions. No established formal
  14. * standards exist at present on which Unicode character shall occupy
  15. * how many cell positions on character terminals. These routines are
  16. * a first attempt at defining such behavior based on simple rules
  17. * applied to data provided by the Unicode Consortium.
  18. *
  19. * For some graphical characters, the Unicode standard explicitly
  20. * defines a character-cell width via the definition of the East Asian
  21. * FullWidth (F), Wide (W), Half-width (H), and Narrow (Na) classes.
  22. * In all these cases, there is no ambiguity about which width a
  23. * terminal shall use. For characters in the East Asian Ambiguous (A)
  24. * class, the width choice depends purely on a preference of backward
  25. * compatibility with either historic CJK or Western practice.
  26. * Choosing single-width for these characters is easy to justify as
  27. * the appropriate long-term solution, as the CJK practice of
  28. * displaying these characters as double-width comes from historic
  29. * implementation simplicity (8-bit encoded characters were displayed
  30. * single-width and 16-bit ones double-width, even for Greek,
  31. * Cyrillic, etc.) and not any typographic considerations.
  32. *
  33. * Much less clear is the choice of width for the Not East Asian
  34. * (Neutral) class. Existing practice does not dictate a width for any
  35. * of these characters. It would nevertheless make sense
  36. * typographically to allocate two character cells to characters such
  37. * as for instance EM SPACE or VOLUME INTEGRAL, which cannot be
  38. * represented adequately with a single-width glyph. The following
  39. * routines at present merely assign a single-cell width to all
  40. * neutral characters, in the interest of simplicity. This is not
  41. * entirely satisfactory and should be reconsidered before
  42. * establishing a formal standard in this area. At the moment, the
  43. * decision which Not East Asian (Neutral) characters should be
  44. * represented by double-width glyphs cannot yet be answered by
  45. * applying a simple rule from the Unicode database content. Setting
  46. * up a proper standard for the behavior of UTF-8 character terminals
  47. * will require a careful analysis not only of each Unicode character,
  48. * but also of each presentation form, something the author of these
  49. * routines has so far avoided doing.
  50. *
  51. * http://www.unicode.org/unicode/reports/tr11/
  52. *
  53. * Markus Kuhn -- 2007-05-26 (Unicode 5.0)
  54. *
  55. * Permission to use, copy, modify, and distribute this software
  56. * for any purpose and without fee is hereby granted. The author
  57. * disclaims all warranties with regard to this software.
  58. *
  59. * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
  60. */
  61. #include <wchar.h>
  62. #include "putty.h" /* for prototypes */
  /* Closed range [first, last] of code points. */
  struct interval {
      unsigned int first;
      unsigned int last;
  };

  /*
   * Auxiliary function: binary search in an interval table.
   *
   * ucs:   code point to look up.
   * table: intervals sorted in ascending order and non-overlapping
   *        (the search relies on this ordering).
   * max:   index of the last entry in table, i.e. entry count minus one.
   *        A negative value denotes an empty table.
   *
   * Returns true if ucs lies within one of the intervals, false otherwise.
   */
  static bool bisearch(unsigned int ucs, const struct interval *table, int max)
  {
      int min = 0;

      /* An empty table contains nothing; never touch table[0] or table[max]. */
      if (max < 0)
          return false;

      /* Quick rejection of values outside the range covered by the table. */
      if (ucs < table[0].first || ucs > table[max].last)
          return false;

      while (max >= min) {
          /* min + (max - min) / 2 cannot overflow, unlike (min + max) / 2. */
          int mid = min + (max - min) / 2;

          if (ucs > table[mid].last)
              min = mid + 1;
          else if (ucs < table[mid].first)
              max = mid - 1;
          else
              return true;
      }
      return false;
  }
  84. /* The following two functions define the column width of an ISO 10646
  85. * character as follows:
  86. *
  87. * - The null character (U+0000) has a column width of 0.
  88. *
  89. * - Other C0/C1 control characters and DEL will lead to a return
  90. * value of -1.
  91. *
  92. * - Non-spacing and enclosing combining characters (general
  93. * category code Mn or Me in the Unicode database) have a
  94. * column width of 0.
  95. *
  96. * - SOFT HYPHEN (U+00AD) has a column width of 1.
  97. *
  98. * - Other format characters (general category code Cf in the Unicode
  99. * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
  100. *
  101. * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
  102. * have a column width of 0.
  103. *
  104. * - Spacing characters in the East Asian Wide (W) or East Asian
  105. * Full-width (F) category as defined in Unicode Technical
  106. * Report #11 have a column width of 2.
  107. *
  108. * - All remaining characters (including all printable
  109. * ISO 8859-1 and WGL4 characters, Unicode control characters,
  110. * etc.) have a column width of 1.
  111. *
  112. * This implementation assumes that wchar_t characters are encoded
  113. * in ISO 10646.
  114. */
  115. int mk_wcwidth(unsigned int ucs)
  116. {
  117. /* sorted list of non-overlapping intervals of non-spacing characters */
  118. static const struct interval combining[] = {
  119. #include "unicode/nonspacing_chars.h"
  120. };
  121. /* A sorted list of intervals of double-width characters */
  122. static const struct interval wide[] = {
  123. #include "unicode/wide_chars.h"
  124. };
  125. /* test for 8-bit control characters */
  126. if (ucs == 0)
  127. return 0;
  128. if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
  129. return -1;
  130. /* binary search in table of non-spacing characters */
  131. if (bisearch(ucs, combining,
  132. sizeof(combining) / sizeof(struct interval) - 1))
  133. return 0;
  134. /* if we arrive here, ucs is not a combining or C0/C1 control character */
  135. /* binary search in table of double-width characters */
  136. if (bisearch(ucs, wide,
  137. sizeof(wide) / sizeof(struct interval) - 1))
  138. return 2;
  139. /* normal width character */
  140. return 1;
  141. }
  /*
   * Total column width of a sequence of code points.
   *
   * pwcs: array of code points; processing stops early at a zero element.
   * n:    maximum number of elements of pwcs to examine.
   *
   * Returns the sum of mk_wcwidth() over the examined elements, or -1 as
   * soon as any element has a negative width.
   */
  int mk_wcswidth(const unsigned int *pwcs, size_t n)
  {
      int width = 0;

      /*
       * Test the remaining count BEFORE dereferencing: the buffer may hold
       * exactly n elements with no terminator, so pwcs[n] must not be read
       * (and nothing at all may be read when n == 0).
       */
      for (; n > 0 && *pwcs; n--, pwcs++) {
          int w = mk_wcwidth(*pwcs);

          if (w < 0)
              return -1;
          width += w;
      }
      return width;
  }
  152. /*
  153. * The following functions are the same as mk_wcwidth() and
  154. * mk_wcswidth(), except that spacing characters in the East Asian
  155. * Ambiguous (A) category as defined in Unicode Technical Report #11
  156. * have a column width of 2. This variant might be useful for users of
  157. * CJK legacy encodings who want to migrate to UCS without changing
  158. * the traditional terminal character-width behaviour. It is not
  159. * otherwise recommended for general use.
  160. */
  161. int mk_wcwidth_cjk(unsigned int ucs)
  162. {
  163. /* A sorted list of intervals of ambiguous width characters */
  164. static const struct interval ambiguous[] = {
  165. #include "unicode/ambiguous_wide_chars.h"
  166. };
  167. /* binary search in table of non-spacing characters */
  168. if (bisearch(ucs, ambiguous,
  169. sizeof(ambiguous) / sizeof(struct interval) - 1))
  170. return 2;
  171. return mk_wcwidth(ucs);
  172. }
  /*
   * Total column width of a sequence of code points, using the CJK-legacy
   * per-character widths from mk_wcwidth_cjk().
   *
   * pwcs: array of code points; processing stops early at a zero element.
   * n:    maximum number of elements of pwcs to examine.
   *
   * Returns the sum of mk_wcwidth_cjk() over the examined elements, or -1
   * as soon as any element has a negative width.
   */
  int mk_wcswidth_cjk(const unsigned int *pwcs, size_t n)
  {
      int width = 0;

      /*
       * Test the remaining count BEFORE dereferencing: the buffer may hold
       * exactly n elements with no terminator, so pwcs[n] must not be read
       * (and nothing at all may be read when n == 0).
       */
      for (; n > 0 && *pwcs; n--, pwcs++) {
          int w = mk_wcwidth_cjk(*pwcs);

          if (w < 0)
              return -1;
          width += w;
      }
      return width;
  }