localcharset.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. /* Determine a canonical name for the current locale's character encoding.
  2. Copyright (C) 2000-2003 Free Software Foundation, Inc.
  3. This program is free software; you can redistribute it and/or modify it
  4. under the terms of the GNU Library General Public License as published
  5. by the Free Software Foundation; either version 2, or (at your option)
  6. any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Library General Public License for more details.
  11. You should have received a copy of the GNU Library General Public
  12. License along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
  14. USA. */
  15. /* Written by Bruno Haible <bruno@clisp.org>. */
  16. #ifdef HAVE_CONFIG_H
  17. # include <config.h>
  18. #endif
  19. /* Specification. */
  20. #include "localcharset.h"
  21. #if HAVE_STDDEF_H
  22. # include <stddef.h>
  23. #endif
  24. #include <stdio.h>
  25. #if HAVE_STRING_H
  26. # include <string.h>
  27. #else
  28. # include <strings.h>
  29. #endif
  30. #if HAVE_STDLIB_H
  31. # include <stdlib.h>
  32. #endif
  33. #if defined _WIN32 || defined __WIN32__
  34. # undef WIN32 /* avoid warning on mingw32 */
  35. # define WIN32
  36. #endif
  37. #if defined __EMX__
  38. /* Assume EMX program runs on OS/2, even if compiled under DOS. */
  39. # define OS2
  40. #endif
  41. #if !defined WIN32
  42. # if HAVE_LANGINFO_CODESET
  43. # include <langinfo.h>
  44. # else
  45. # if HAVE_SETLOCALE
  46. # include <locale.h>
  47. # endif
  48. # endif
  49. #elif defined WIN32
  50. # define WIN32_LEAN_AND_MEAN
  51. # include <windows.h>
  52. #endif
  53. #if defined OS2
  54. # define INCL_DOS
  55. # include <os2.h>
  56. #endif
  57. #if ENABLE_RELOCATABLE
  58. # include "relocatable.h"
  59. #else
  60. # define relocate(pathname) (pathname)
  61. #endif
  62. #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
  63. /* Win32, OS/2, DOS */
  64. # define ISSLASH(C) ((C) == '/' || (C) == '\\')
  65. #endif
  66. #ifndef DIRECTORY_SEPARATOR
  67. # define DIRECTORY_SEPARATOR '/'
  68. #endif
  69. #ifndef ISSLASH
  70. # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
  71. #endif
  72. #ifdef HAVE_GETC_UNLOCKED
  73. # undef getc
  74. # define getc getc_unlocked
  75. #endif
  76. /* The following static variable is declared 'volatile' to avoid a
  77. possible multithread problem in the function get_charset_aliases. If we
  78. are running in a threaded environment, and if two threads initialize
  79. 'charset_aliases' simultaneously, both will produce the same value,
  80. and everything will be ok if the two assignments to 'charset_aliases'
  81. are atomic. But I don't know what will happen if the two assignments mix. */
  82. #if __STDC__ != 1
  83. # define volatile /* empty */
  84. #endif
  85. /* Pointer to the contents of the charset.alias file, if it has already been
  86. read, else NULL. Its format is:
  87. ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
  88. static const char * volatile charset_aliases;
  89. /* Return a pointer to the contents of the charset.alias file. */
  90. static const char *
  91. get_charset_aliases ()
  92. {
  93. const char *cp;
  94. cp = charset_aliases;
  95. if (cp == NULL)
  96. {
  97. #if !(defined VMS || defined WIN32)
  98. FILE *fp;
  99. const char *dir = relocate (LIBDIR);
  100. const char *base = "charset.alias";
  101. char *file_name;
  102. /* Concatenate dir and base into freshly allocated file_name. */
  103. {
  104. size_t dir_len = strlen (dir);
  105. size_t base_len = strlen (base);
  106. int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
  107. file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
  108. if (file_name != NULL)
  109. {
  110. memcpy (file_name, dir, dir_len);
  111. if (add_slash)
  112. file_name[dir_len] = DIRECTORY_SEPARATOR;
  113. memcpy (file_name + dir_len + add_slash, base, base_len + 1);
  114. }
  115. }
  116. if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
  117. /* Out of memory or file not found, treat it as empty. */
  118. cp = "";
  119. else
  120. {
  121. /* Parse the file's contents. */
  122. int c;
  123. char buf1[50+1];
  124. char buf2[50+1];
  125. char *res_ptr = NULL;
  126. size_t res_size = 0;
  127. size_t l1, l2;
  128. for (;;)
  129. {
  130. c = getc (fp);
  131. if (c == EOF)
  132. break;
  133. if (c == '\n' || c == ' ' || c == '\t')
  134. continue;
  135. if (c == '#')
  136. {
  137. /* Skip comment, to end of line. */
  138. do
  139. c = getc (fp);
  140. while (!(c == EOF || c == '\n'));
  141. if (c == EOF)
  142. break;
  143. continue;
  144. }
  145. ungetc (c, fp);
  146. if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
  147. break;
  148. l1 = strlen (buf1);
  149. l2 = strlen (buf2);
  150. if (res_size == 0)
  151. {
  152. res_size = l1 + 1 + l2 + 1;
  153. res_ptr = (char *) malloc (res_size + 1);
  154. }
  155. else
  156. {
  157. res_size += l1 + 1 + l2 + 1;
  158. res_ptr = (char *) realloc (res_ptr, res_size + 1);
  159. }
  160. if (res_ptr == NULL)
  161. {
  162. /* Out of memory. */
  163. res_size = 0;
  164. break;
  165. }
  166. strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
  167. strcpy (res_ptr + res_size - (l2 + 1), buf2);
  168. }
  169. fclose (fp);
  170. if (res_size == 0)
  171. cp = "";
  172. else
  173. {
  174. *(res_ptr + res_size) = '\0';
  175. cp = res_ptr;
  176. }
  177. }
  178. if (file_name != NULL)
  179. free (file_name);
  180. #else
  181. # if defined VMS
  182. /* To avoid the troubles of an extra file charset.alias_vms in the
  183. sources of many GNU packages, simply inline the aliases here. */
  184. /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
  185. "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
  186. section 10.7 "Handling Different Character Sets". */
  187. cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
  188. "ISO8859-2" "\0" "ISO-8859-2" "\0"
  189. "ISO8859-5" "\0" "ISO-8859-5" "\0"
  190. "ISO8859-7" "\0" "ISO-8859-7" "\0"
  191. "ISO8859-8" "\0" "ISO-8859-8" "\0"
  192. "ISO8859-9" "\0" "ISO-8859-9" "\0"
  193. /* Japanese */
  194. "eucJP" "\0" "EUC-JP" "\0"
  195. "SJIS" "\0" "SHIFT_JIS" "\0"
  196. "DECKANJI" "\0" "DEC-KANJI" "\0"
  197. "SDECKANJI" "\0" "EUC-JP" "\0"
  198. /* Chinese */
  199. "eucTW" "\0" "EUC-TW" "\0"
  200. "DECHANYU" "\0" "DEC-HANYU" "\0"
  201. "DECHANZI" "\0" "GB2312" "\0"
  202. /* Korean */
  203. "DECKOREAN" "\0" "EUC-KR" "\0";
  204. # endif
  205. # if defined WIN32
  206. /* To avoid the troubles of installing a separate file in the same
  207. directory as the DLL and of retrieving the DLL's directory at
  208. runtime, simply inline the aliases here. */
  209. cp = "CP936" "\0" "GBK" "\0"
  210. "CP1361" "\0" "JOHAB" "\0"
  211. "CP20127" "\0" "ASCII" "\0"
  212. "CP20866" "\0" "KOI8-R" "\0"
  213. "CP21866" "\0" "KOI8-RU" "\0"
  214. "CP28591" "\0" "ISO-8859-1" "\0"
  215. "CP28592" "\0" "ISO-8859-2" "\0"
  216. "CP28593" "\0" "ISO-8859-3" "\0"
  217. "CP28594" "\0" "ISO-8859-4" "\0"
  218. "CP28595" "\0" "ISO-8859-5" "\0"
  219. "CP28596" "\0" "ISO-8859-6" "\0"
  220. "CP28597" "\0" "ISO-8859-7" "\0"
  221. "CP28598" "\0" "ISO-8859-8" "\0"
  222. "CP28599" "\0" "ISO-8859-9" "\0"
  223. "CP28605" "\0" "ISO-8859-15" "\0";
  224. # endif
  225. #endif
  226. charset_aliases = cp;
  227. }
  228. return cp;
  229. }
  230. /* Determine the current locale's character encoding, and canonicalize it
  231. into one of the canonical names listed in config.charset.
  232. The result must not be freed; it is statically allocated.
  233. If the canonical name cannot be determined, the result is a non-canonical
  234. name. */
  235. #ifdef STATIC
  236. STATIC
  237. #endif
  238. const char *
  239. locale_charset ()
  240. {
  241. const char *codeset;
  242. const char *aliases;
  243. #if !(defined WIN32 || defined OS2)
  244. # if HAVE_LANGINFO_CODESET
  245. /* Most systems support nl_langinfo (CODESET) nowadays. */
  246. codeset = nl_langinfo (CODESET);
  247. # else
  248. /* On old systems which lack it, use setlocale or getenv. */
  249. const char *locale = NULL;
  250. /* But most old systems don't have a complete set of locales. Some
  251. (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
  252. use setlocale here; it would return "C" when it doesn't support the
  253. locale name the user has set. */
  254. # if HAVE_SETLOCALE && 0
  255. locale = setlocale (LC_CTYPE, NULL);
  256. # endif
  257. if (locale == NULL || locale[0] == '\0')
  258. {
  259. locale = getenv ("LC_ALL");
  260. if (locale == NULL || locale[0] == '\0')
  261. {
  262. locale = getenv ("LC_CTYPE");
  263. if (locale == NULL || locale[0] == '\0')
  264. locale = getenv ("LANG");
  265. }
  266. }
  267. /* On some old systems, one used to set locale = "iso8859_1". On others,
  268. you set it to "language_COUNTRY.charset". In any case, we resolve it
  269. through the charset.alias file. */
  270. codeset = locale;
  271. # endif
  272. #elif defined WIN32
  273. static char buf[2 + 10 + 1];
  274. /* Woe32 has a function returning the locale's codepage as a number. */
  275. sprintf (buf, "CP%u", GetACP ());
  276. codeset = buf;
  277. #elif defined OS2
  278. const char *locale;
  279. static char buf[2 + 10 + 1];
  280. ULONG cp[3];
  281. ULONG cplen;
  282. /* Allow user to override the codeset, as set in the operating system,
  283. with standard language environment variables. */
  284. locale = getenv ("LC_ALL");
  285. if (locale == NULL || locale[0] == '\0')
  286. {
  287. locale = getenv ("LC_CTYPE");
  288. if (locale == NULL || locale[0] == '\0')
  289. locale = getenv ("LANG");
  290. }
  291. if (locale != NULL && locale[0] != '\0')
  292. {
  293. /* If the locale name contains an encoding after the dot, return it. */
  294. const char *dot = strchr (locale, '.');
  295. if (dot != NULL)
  296. {
  297. const char *modifier;
  298. dot++;
  299. /* Look for the possible @... trailer and remove it, if any. */
  300. modifier = strchr (dot, '@');
  301. if (modifier == NULL)
  302. return dot;
  303. if (modifier - dot < sizeof (buf))
  304. {
  305. memcpy (buf, dot, modifier - dot);
  306. buf [modifier - dot] = '\0';
  307. return buf;
  308. }
  309. }
  310. /* Resolve through the charset.alias file. */
  311. codeset = locale;
  312. }
  313. else
  314. {
  315. /* OS/2 has a function returning the locale's codepage as a number. */
  316. if (DosQueryCp (sizeof (cp), cp, &cplen))
  317. codeset = "";
  318. else
  319. {
  320. sprintf (buf, "CP%u", cp[0]);
  321. codeset = buf;
  322. }
  323. }
  324. #endif
  325. if (codeset == NULL)
  326. /* The canonical name cannot be determined. */
  327. codeset = "";
  328. /* Resolve alias. */
  329. for (aliases = get_charset_aliases ();
  330. *aliases != '\0';
  331. aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
  332. if (strcmp (codeset, aliases) == 0
  333. || (aliases[0] == '*' && aliases[1] == '\0'))
  334. {
  335. codeset = aliases + strlen (aliases) + 1;
  336. break;
  337. }
  338. /* Don't return an empty string. GNU libc and GNU libiconv interpret
  339. the empty string as denoting "the locale's character encoding",
  340. thus GNU libiconv would call this function a second time. */
  341. if (codeset[0] == '\0')
  342. codeset = "ASCII";
  343. return codeset;
  344. }