/*
 * charset.h - header file for general character set conversion
 * routines.
 */
  5. #ifndef charset_charset_h
  6. #define charset_charset_h
  7. #include <stddef.h>
  8. /*
  9. * Enumeration that lists all the multibyte or single-byte
  10. * character sets known to this library.
  11. */
  12. typedef enum {
  13. CS_NONE, /* used for reporting errors, etc */
  14. CS_ISO8859_1,
  15. CS_ISO8859_1_X11, /* X font encoding with VT100 glyphs */
  16. CS_ISO8859_2,
  17. CS_ISO8859_3,
  18. CS_ISO8859_4,
  19. CS_ISO8859_5,
  20. CS_ISO8859_6,
  21. CS_ISO8859_7,
  22. CS_ISO8859_8,
  23. CS_ISO8859_9,
  24. CS_ISO8859_10,
  25. CS_ISO8859_11,
  26. CS_ISO8859_13,
  27. CS_ISO8859_14,
  28. CS_ISO8859_15,
  29. CS_ISO8859_16,
  30. CS_CP437,
  31. CS_CP850,
  32. CS_CP852,
  33. CS_CP866,
  34. CS_CP1250,
  35. CS_CP1251,
  36. CS_CP1252,
  37. CS_CP1253,
  38. CS_CP1254,
  39. CS_CP1255,
  40. CS_CP1256,
  41. CS_CP1257,
  42. CS_CP1258,
  43. CS_KOI8_R,
  44. CS_KOI8_U,
  45. CS_MAC_ROMAN,
  46. CS_MAC_TURKISH,
  47. CS_MAC_CROATIAN,
  48. CS_MAC_ICELAND,
  49. CS_MAC_ROMANIAN,
  50. CS_MAC_GREEK,
  51. CS_MAC_CYRILLIC,
  52. CS_MAC_THAI,
  53. CS_MAC_CENTEURO,
  54. CS_MAC_SYMBOL,
  55. CS_MAC_DINGBATS,
  56. CS_MAC_ROMAN_OLD,
  57. CS_MAC_CROATIAN_OLD,
  58. CS_MAC_ICELAND_OLD,
  59. CS_MAC_ROMANIAN_OLD,
  60. CS_MAC_GREEK_OLD,
  61. CS_MAC_CYRILLIC_OLD,
  62. CS_MAC_UKRAINE,
  63. CS_MAC_VT100,
  64. CS_MAC_VT100_OLD,
  65. CS_VISCII,
  66. CS_HP_ROMAN8,
  67. CS_DEC_MCS,
  68. CS_UTF8
  69. } charset_t;
  70. typedef struct {
  71. unsigned long s0;
  72. } charset_state;
  73. /*
  74. * Routine to convert a MB/SB character set to Unicode.
  75. *
  76. * This routine accepts some number of bytes, updates a state
  77. * variable, and outputs some number of Unicode characters. There
  78. * are no guarantees. You can't even guarantee that at most one
  79. * Unicode character will be output per byte you feed in; for
  80. * example, suppose you're reading UTF-8, you've seen E1 80, and
  81. * then you suddenly see FE. Now you need to output _two_ error
  82. * characters - one for the incomplete sequence E1 80, and one for
  83. * the completely invalid UTF-8 byte FE.
  84. *
  85. * Returns the number of wide characters output; will never output
  86. * more than the size of the buffer (as specified on input).
  87. * Advances the `input' pointer and decrements `inlen', to indicate
  88. * how far along the input string it got.
  89. *
  90. * The sequence of `errlen' wide characters pointed to by `errstr'
  91. * will be used to indicate a conversion error. If `errstr' is
  92. * NULL, `errlen' will be ignored, and the library will choose
  93. * something sensible to do on its own. For Unicode, this will be
  94. * U+FFFD (REPLACEMENT CHARACTER).
  95. */
  96. int charset_to_unicode(const char **input, int *inlen,
  97. wchar_t *output, int outlen,
  98. int charset, charset_state *state,
  99. const wchar_t *errstr, int errlen);
  100. /*
  101. * Routine to convert Unicode to an MB/SB character set.
  102. *
  103. * This routine accepts some number of Unicode characters, updates
  104. * a state variable, and outputs some number of bytes.
  105. *
  106. * Returns the number of bytes characters output; will never output
  107. * more than the size of the buffer (as specified on input), and
  108. * will never output a partial MB character. Advances the `input'
  109. * pointer and decrements `inlen', to indicate how far along the
  110. * input string it got.
  111. *
  112. * The sequence of `errlen' characters pointed to by `errstr' will
  113. * be used to indicate a conversion error. If `errstr' is NULL,
  114. * `errlen' will be ignored, and the library will choose something
  115. * sensible to do on its own (which will vary depending on the
  116. * output charset).
  117. */
  118. int charset_from_unicode(const wchar_t **input, int *inlen,
  119. char *output, int outlen,
  120. int charset, charset_state *state,
  121. const char *errstr, int errlen);
  122. /*
  123. * Convert X11 encoding names to and from our charset identifiers.
  124. */
  125. const char *charset_to_xenc(int charset);
  126. int charset_from_xenc(const char *name);
  127. /*
  128. * Convert MIME encoding names to and from our charset identifiers.
  129. */
  130. const char *charset_to_mimeenc(int charset);
  131. int charset_from_mimeenc(const char *name);
  132. /*
  133. * Convert our own encoding names to and from our charset
  134. * identifiers.
  135. */
  136. const char *charset_to_localenc(int charset);
  137. int charset_from_localenc(const char *name);
  138. int charset_localenc_nth(int n);
  139. /*
  140. * Convert Mac OS script/region/font to our charset identifiers.
  141. */
  142. int charset_from_macenc(int script, int region, int sysvers,
  143. const char *fontname);
  144. #endif /* charset_charset_h */