localcharset.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501
  1. /* Determine a canonical name for the current locale's character encoding.
  2. Copyright (C) 2000-2006, 2008-2009 Free Software Foundation, Inc.
  3. This program is free software; you can redistribute it and/or modify it
  4. under the terms of the GNU Library General Public License as published
  5. by the Free Software Foundation; either version 2, or (at your option)
  6. any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Library General Public License for more details.
  11. You should have received a copy of the GNU Library General Public
  12. License along with this program; if not, write to the Free Software
  13. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
  14. USA. */
  15. /* Written by Bruno Haible <bruno@clisp.org>. */
  16. #include <config.h>
  17. /* Specification. */
  18. #include "localcharset.h"
  19. #include <stddef.h>
  20. #include <stdio.h>
  21. #include <string.h>
  22. #include <stdlib.h>
  23. #if defined __APPLE__ && defined __MACH__ && HAVE_LANGINFO_CODESET
  24. # define DARWIN7 /* Darwin 7 or newer, i.e. MacOS X 10.3 or newer */
  25. #endif
  26. #if defined _WIN32 || defined __WIN32__
  27. # define WIN32_NATIVE
  28. #endif
  29. #if defined __EMX__
  30. /* Assume EMX program runs on OS/2, even if compiled under DOS. */
  31. # ifndef OS2
  32. # define OS2
  33. # endif
  34. #endif
  35. #if !defined WIN32_NATIVE
  36. # if HAVE_LANGINFO_CODESET
  37. # include <langinfo.h>
  38. # else
  39. # if 0 /* see comment below */
  40. # include <locale.h>
  41. # endif
  42. # endif
  43. # ifdef __CYGWIN__
  44. # define WIN32_LEAN_AND_MEAN
  45. # include <windows.h>
  46. # endif
  47. #elif defined WIN32_NATIVE
  48. # define WIN32_LEAN_AND_MEAN
  49. # include <windows.h>
  50. #endif
  51. #if defined OS2
  52. # define INCL_DOS
  53. # include <os2.h>
  54. #endif
  55. #if ENABLE_RELOCATABLE
  56. # include "relocatable.h"
  57. #else
  58. # define relocate(pathname) (pathname)
  59. #endif
  60. /* Get LIBDIR. */
  61. #ifndef LIBDIR
  62. # include "configmake.h"
  63. #endif
  64. #if defined _WIN32 || defined __WIN32__ || defined __CYGWIN__ || defined __EMX__ || defined __DJGPP__
  65. /* Win32, Cygwin, OS/2, DOS */
  66. # define ISSLASH(C) ((C) == '/' || (C) == '\\')
  67. #endif
  68. #ifndef DIRECTORY_SEPARATOR
  69. # define DIRECTORY_SEPARATOR '/'
  70. #endif
  71. #ifndef ISSLASH
  72. # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
  73. #endif
  74. #if HAVE_DECL_GETC_UNLOCKED
  75. # undef getc
  76. # define getc getc_unlocked
  77. #endif
  78. /* The following static variable is declared 'volatile' to avoid a
  79. possible multithread problem in the function get_charset_aliases. If we
  80. are running in a threaded environment, and if two threads initialize
  81. 'charset_aliases' simultaneously, both will produce the same value,
  82. and everything will be ok if the two assignments to 'charset_aliases'
  83. are atomic. But I don't know what will happen if the two assignments mix. */
  84. #if __STDC__ != 1
  85. # define volatile /* empty */
  86. #endif
  87. /* Pointer to the contents of the charset.alias file, if it has already been
  88. read, else NULL. Its format is:
  89. ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0' */
  90. static const char * volatile charset_aliases;
  91. /* Return a pointer to the contents of the charset.alias file. */
  92. static const char *
  93. get_charset_aliases (void)
  94. {
  95. int c;
  96. const char *cp;
  97. cp = charset_aliases;
  98. if (cp == NULL)
  99. {
  100. #if !(defined DARWIN7 || defined VMS || defined WIN32_NATIVE || defined __CYGWIN__)
  101. FILE *fp;
  102. const char *dir;
  103. const char *base = "charset.alias";
  104. char *file_name;
  105. /* Make it possible to override the charset.alias location. This is
  106. necessary for running the testsuite before "make install". */
  107. dir = getenv ("CHARSETALIASDIR");
  108. if (dir == NULL || dir[0] == '\0')
  109. dir = relocate (LIBDIR);
  110. /* Concatenate dir and base into freshly allocated file_name. */
  111. {
  112. size_t dir_len = strlen (dir);
  113. size_t base_len = strlen (base);
  114. int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
  115. file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
  116. if (file_name != NULL)
  117. {
  118. memcpy (file_name, dir, dir_len);
  119. if (add_slash)
  120. file_name[dir_len] = DIRECTORY_SEPARATOR;
  121. memcpy (file_name + dir_len + add_slash, base, base_len + 1);
  122. }
  123. }
  124. if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
  125. /* Out of memory or file not found, treat it as empty. */
  126. cp = "";
  127. else
  128. {
  129. /* Parse the file's contents. */
  130. char *res_ptr = NULL;
  131. size_t res_size = 0;
  132. for (;;)
  133. {
  134. char buf1[50+1];
  135. char buf2[50+1];
  136. size_t l1, l2;
  137. char *old_res_ptr;
  138. c = getc (fp);
  139. if (c == EOF)
  140. break;
  141. if (c == '\n' || c == ' ' || c == '\t')
  142. continue;
  143. if (c == '#')
  144. {
  145. /* Skip comment, to end of line. */
  146. do
  147. c = getc (fp);
  148. while (!(c == EOF || c == '\n'));
  149. if (c == EOF)
  150. break;
  151. continue;
  152. }
  153. ungetc (c, fp);
  154. if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
  155. break;
  156. l1 = strlen (buf1);
  157. l2 = strlen (buf2);
  158. old_res_ptr = res_ptr;
  159. if (res_size == 0)
  160. {
  161. res_size = l1 + 1 + l2 + 1;
  162. res_ptr = (char *) malloc (res_size + 1);
  163. }
  164. else
  165. {
  166. res_size += l1 + 1 + l2 + 1;
  167. res_ptr = (char *) realloc (res_ptr, res_size + 1);
  168. }
  169. if (res_ptr == NULL)
  170. {
  171. /* Out of memory. */
  172. res_size = 0;
  173. if (old_res_ptr != NULL)
  174. free (old_res_ptr);
  175. break;
  176. }
  177. strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
  178. strcpy (res_ptr + res_size - (l2 + 1), buf2);
  179. }
  180. fclose (fp);
  181. if (res_size == 0)
  182. cp = "";
  183. else
  184. {
  185. *(res_ptr + res_size) = '\0';
  186. cp = res_ptr;
  187. }
  188. }
  189. if (file_name != NULL)
  190. free (file_name);
  191. #else
  192. # if defined DARWIN7
  193. /* To avoid the trouble of installing a file that is shared by many
  194. GNU packages -- many packaging systems have problems with this --,
  195. simply inline the aliases here. */
  196. cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
  197. "ISO8859-2" "\0" "ISO-8859-2" "\0"
  198. "ISO8859-4" "\0" "ISO-8859-4" "\0"
  199. "ISO8859-5" "\0" "ISO-8859-5" "\0"
  200. "ISO8859-7" "\0" "ISO-8859-7" "\0"
  201. "ISO8859-9" "\0" "ISO-8859-9" "\0"
  202. "ISO8859-13" "\0" "ISO-8859-13" "\0"
  203. "ISO8859-15" "\0" "ISO-8859-15" "\0"
  204. "KOI8-R" "\0" "KOI8-R" "\0"
  205. "KOI8-U" "\0" "KOI8-U" "\0"
  206. "CP866" "\0" "CP866" "\0"
  207. "CP949" "\0" "CP949" "\0"
  208. "CP1131" "\0" "CP1131" "\0"
  209. "CP1251" "\0" "CP1251" "\0"
  210. "eucCN" "\0" "GB2312" "\0"
  211. "GB2312" "\0" "GB2312" "\0"
  212. "eucJP" "\0" "EUC-JP" "\0"
  213. "eucKR" "\0" "EUC-KR" "\0"
  214. "Big5" "\0" "BIG5" "\0"
  215. "Big5HKSCS" "\0" "BIG5-HKSCS" "\0"
  216. "GBK" "\0" "GBK" "\0"
  217. "GB18030" "\0" "GB18030" "\0"
  218. "SJIS" "\0" "SHIFT_JIS" "\0"
  219. "ARMSCII-8" "\0" "ARMSCII-8" "\0"
  220. "PT154" "\0" "PT154" "\0"
  221. /*"ISCII-DEV" "\0" "?" "\0"*/
  222. "*" "\0" "UTF-8" "\0";
  223. # endif
  224. # if defined VMS
  225. /* To avoid the troubles of an extra file charset.alias_vms in the
  226. sources of many GNU packages, simply inline the aliases here. */
  227. /* The list of encodings is taken from the OpenVMS 7.3-1 documentation
  228. "Compaq C Run-Time Library Reference Manual for OpenVMS systems"
  229. section 10.7 "Handling Different Character Sets". */
  230. cp = "ISO8859-1" "\0" "ISO-8859-1" "\0"
  231. "ISO8859-2" "\0" "ISO-8859-2" "\0"
  232. "ISO8859-5" "\0" "ISO-8859-5" "\0"
  233. "ISO8859-7" "\0" "ISO-8859-7" "\0"
  234. "ISO8859-8" "\0" "ISO-8859-8" "\0"
  235. "ISO8859-9" "\0" "ISO-8859-9" "\0"
  236. /* Japanese */
  237. "eucJP" "\0" "EUC-JP" "\0"
  238. "SJIS" "\0" "SHIFT_JIS" "\0"
  239. "DECKANJI" "\0" "DEC-KANJI" "\0"
  240. "SDECKANJI" "\0" "EUC-JP" "\0"
  241. /* Chinese */
  242. "eucTW" "\0" "EUC-TW" "\0"
  243. "DECHANYU" "\0" "DEC-HANYU" "\0"
  244. "DECHANZI" "\0" "GB2312" "\0"
  245. /* Korean */
  246. "DECKOREAN" "\0" "EUC-KR" "\0";
  247. # endif
  248. # if defined WIN32_NATIVE || defined __CYGWIN__
  249. /* To avoid the troubles of installing a separate file in the same
  250. directory as the DLL and of retrieving the DLL's directory at
  251. runtime, simply inline the aliases here. */
  252. cp = "CP936" "\0" "GBK" "\0"
  253. "CP1361" "\0" "JOHAB" "\0"
  254. "CP20127" "\0" "ASCII" "\0"
  255. "CP20866" "\0" "KOI8-R" "\0"
  256. "CP20936" "\0" "GB2312" "\0"
  257. "CP21866" "\0" "KOI8-RU" "\0"
  258. "CP28591" "\0" "ISO-8859-1" "\0"
  259. "CP28592" "\0" "ISO-8859-2" "\0"
  260. "CP28593" "\0" "ISO-8859-3" "\0"
  261. "CP28594" "\0" "ISO-8859-4" "\0"
  262. "CP28595" "\0" "ISO-8859-5" "\0"
  263. "CP28596" "\0" "ISO-8859-6" "\0"
  264. "CP28597" "\0" "ISO-8859-7" "\0"
  265. "CP28598" "\0" "ISO-8859-8" "\0"
  266. "CP28599" "\0" "ISO-8859-9" "\0"
  267. "CP28605" "\0" "ISO-8859-15" "\0"
  268. "CP38598" "\0" "ISO-8859-8" "\0"
  269. "CP51932" "\0" "EUC-JP" "\0"
  270. "CP51936" "\0" "GB2312" "\0"
  271. "CP51949" "\0" "EUC-KR" "\0"
  272. "CP51950" "\0" "EUC-TW" "\0"
  273. "CP54936" "\0" "GB18030" "\0"
  274. "CP65001" "\0" "UTF-8" "\0";
  275. # endif
  276. #endif
  277. charset_aliases = cp;
  278. }
  279. return cp;
  280. }
  281. /* Determine the current locale's character encoding, and canonicalize it
  282. into one of the canonical names listed in config.charset.
  283. The result must not be freed; it is statically allocated.
  284. If the canonical name cannot be determined, the result is a non-canonical
  285. name. */
  286. #ifdef STATIC
  287. STATIC
  288. #endif
  289. const char *
  290. locale_charset (void)
  291. {
  292. const char *codeset;
  293. const char *aliases;
  294. #if !(defined WIN32_NATIVE || defined OS2)
  295. # if HAVE_LANGINFO_CODESET
  296. /* Most systems support nl_langinfo (CODESET) nowadays. */
  297. codeset = nl_langinfo (CODESET);
  298. # ifdef __CYGWIN__
  299. /* Cygwin 2006 does not have locales. nl_langinfo (CODESET) always
  300. returns "US-ASCII". As long as this is not fixed, return the suffix
  301. of the locale name from the environment variables (if present) or
  302. the codepage as a number. */
  303. if (codeset != NULL && strcmp (codeset, "US-ASCII") == 0)
  304. {
  305. const char *locale;
  306. static char buf[2 + 10 + 1];
  307. locale = getenv ("LC_ALL");
  308. if (locale == NULL || locale[0] == '\0')
  309. {
  310. locale = getenv ("LC_CTYPE");
  311. if (locale == NULL || locale[0] == '\0')
  312. locale = getenv ("LANG");
  313. }
  314. if (locale != NULL && locale[0] != '\0')
  315. {
  316. /* If the locale name contains an encoding after the dot, return
  317. it. */
  318. const char *dot = strchr (locale, '.');
  319. if (dot != NULL)
  320. {
  321. const char *modifier;
  322. dot++;
  323. /* Look for the possible @... trailer and remove it, if any. */
  324. modifier = strchr (dot, '@');
  325. if (modifier == NULL)
  326. return dot;
  327. if (modifier - dot < sizeof (buf))
  328. {
  329. memcpy (buf, dot, modifier - dot);
  330. buf [modifier - dot] = '\0';
  331. return buf;
  332. }
  333. }
  334. }
  335. /* Woe32 has a function returning the locale's codepage as a number. */
  336. sprintf (buf, "CP%u", GetACP ());
  337. codeset = buf;
  338. }
  339. # endif
  340. # else
  341. /* On old systems which lack it, use setlocale or getenv. */
  342. const char *locale = NULL;
  343. /* But most old systems don't have a complete set of locales. Some
  344. (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
  345. use setlocale here; it would return "C" when it doesn't support the
  346. locale name the user has set. */
  347. # if 0
  348. locale = setlocale (LC_CTYPE, NULL);
  349. # endif
  350. if (locale == NULL || locale[0] == '\0')
  351. {
  352. locale = getenv ("LC_ALL");
  353. if (locale == NULL || locale[0] == '\0')
  354. {
  355. locale = getenv ("LC_CTYPE");
  356. if (locale == NULL || locale[0] == '\0')
  357. locale = getenv ("LANG");
  358. }
  359. }
  360. /* On some old systems, one used to set locale = "iso8859_1". On others,
  361. you set it to "language_COUNTRY.charset". In any case, we resolve it
  362. through the charset.alias file. */
  363. codeset = locale;
  364. # endif
  365. #elif defined WIN32_NATIVE
  366. static char buf[2 + 10 + 1];
  367. /* Woe32 has a function returning the locale's codepage as a number. */
  368. sprintf (buf, "CP%u", GetACP ());
  369. codeset = buf;
  370. #elif defined OS2
  371. const char *locale;
  372. static char buf[2 + 10 + 1];
  373. ULONG cp[3];
  374. ULONG cplen;
  375. /* Allow user to override the codeset, as set in the operating system,
  376. with standard language environment variables. */
  377. locale = getenv ("LC_ALL");
  378. if (locale == NULL || locale[0] == '\0')
  379. {
  380. locale = getenv ("LC_CTYPE");
  381. if (locale == NULL || locale[0] == '\0')
  382. locale = getenv ("LANG");
  383. }
  384. if (locale != NULL && locale[0] != '\0')
  385. {
  386. /* If the locale name contains an encoding after the dot, return it. */
  387. const char *dot = strchr (locale, '.');
  388. if (dot != NULL)
  389. {
  390. const char *modifier;
  391. dot++;
  392. /* Look for the possible @... trailer and remove it, if any. */
  393. modifier = strchr (dot, '@');
  394. if (modifier == NULL)
  395. return dot;
  396. if (modifier - dot < sizeof (buf))
  397. {
  398. memcpy (buf, dot, modifier - dot);
  399. buf [modifier - dot] = '\0';
  400. return buf;
  401. }
  402. }
  403. /* Resolve through the charset.alias file. */
  404. codeset = locale;
  405. }
  406. else
  407. {
  408. /* OS/2 has a function returning the locale's codepage as a number. */
  409. if (DosQueryCp (sizeof (cp), cp, &cplen))
  410. codeset = "";
  411. else
  412. {
  413. sprintf (buf, "CP%u", cp[0]);
  414. codeset = buf;
  415. }
  416. }
  417. #endif
  418. if (codeset == NULL)
  419. /* The canonical name cannot be determined. */
  420. codeset = "";
  421. /* Resolve alias. */
  422. for (aliases = get_charset_aliases ();
  423. *aliases != '\0';
  424. aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
  425. if (strcmp (codeset, aliases) == 0
  426. || (aliases[0] == '*' && aliases[1] == '\0'))
  427. {
  428. codeset = aliases + strlen (aliases) + 1;
  429. break;
  430. }
  431. /* Don't return an empty string. GNU libc and GNU libiconv interpret
  432. the empty string as denoting "the locale's character encoding",
  433. thus GNU libiconv would call this function a second time. */
  434. if (codeset[0] == '\0')
  435. codeset = "ASCII";
  436. return codeset;
  437. }