cp932.h 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. /*
  2. * Copyright (C) 1999-2002, 2005 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either version 2
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
  18. * Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. /*
  21. * CP932
  22. */
  23. /*
  24. * Microsoft CP932 is a slightly extended version of SHIFT_JIS.
  25. * The differences between the EASTASIA/JIS/SHIFTJIS.TXT and the
  26. * VENDORS/MICSFT/WINDOWS/CP932.TXT tables found on ftp.unicode.org are
  27. * as follows:
  28. *
  29. * 1. CP932 uses ASCII, not JISX0201 Roman.
  30. *
  31. * 2. Some characters in the JISX0208 range are defined differently:
  32. *
  33. * code SHIFTJIS.TXT CP932.TXT
  34. * 0x815F 0x005C # REVERSE SOLIDUS 0xFF3C # FULLWIDTH REVERSE SOLIDUS
  35. * 0x8160 0x301C # WAVE DASH 0xFF5E # FULLWIDTH TILDE
  36. * 0x8161 0x2016 # DOUBLE VERTICAL LINE 0x2225 # PARALLEL TO
  37. * 0x817C 0x2212 # MINUS SIGN 0xFF0D # FULLWIDTH HYPHEN-MINUS
  38. * 0x8191 0x00A2 # CENT SIGN 0xFFE0 # FULLWIDTH CENT SIGN
  39. * 0x8192 0x00A3 # POUND SIGN 0xFFE1 # FULLWIDTH POUND SIGN
  40. * 0x81CA 0x00AC # NOT SIGN 0xFFE2 # FULLWIDTH NOT SIGN
  41. *
  42. * We don't implement the latter 6 of these changes, only the first one.
  43. * SHIFTJIS.TXT makes more sense. However, as a compromise with user
  44. * expectation, we implement the middle 5 of these changes in the
  45. * Unicode to CP932 direction. We don't implement the last one at all,
  46. * because it would collide with the mapping of 0xFA54.
  47. *
  48. * 3. A few new rows. See cp932ext.h.
  49. *
  50. * Many variants of CP932 (in GNU libc, JDK, OSF/1, Windows-2000, ICU) also
  51. * add:
  52. *
  53. * 4. Private area mappings:
  54. *
  55. * code Unicode
  56. * 0x{F0..F9}{40..7E,80..FC} U+E000..U+E757
  57. *
  58. * We add them too because, although there are backward compatibility problems
  59. * when a character from a private area is moved to an official Unicode code
  60. * point, they are useful for some people in practice.
  61. */
  62. #include "cp932ext.h"
  63. /*
  64. Conversion between SJIS codes (s1,s2) and JISX0208 codes (c1,c2):
  65. Example. (s1,s2) = 0x8140, (c1,c2) = 0x2121.
  66. 0x81 <= s1 <= 0x9F || 0xE0 <= s1 <= 0xEA,
  67. 0x40 <= s2 <= 0x7E || 0x80 <= s2 <= 0xFC,
  68. 0x21 <= c1 <= 0x74, 0x21 <= c2 <= 0x7E.
  69. Invariant:
  70. 94*2*(s1 < 0xE0 ? s1-0x81 : s1-0xC1) + (s2 < 0x80 ? s2-0x40 : s2-0x41)
  71. = 94*(c1-0x21)+(c2-0x21)
  72. Conversion (s1,s2) -> (c1,c2):
  73. t1 := (s1 < 0xE0 ? s1-0x81 : s1-0xC1)
  74. t2 := (s2 < 0x80 ? s2-0x40 : s2-0x41)
  75. c1 := 2*t1 + (t2 < 0x5E ? 0 : 1) + 0x21
  76. c2 := (t2 < 0x5E ? t2 : t2-0x5E) + 0x21
  77. Conversion (c1,c2) -> (s1,s2):
  78. t1 := (c1 - 0x21) >> 1
  79. t2 := ((c1 - 0x21) & 1) * 0x5E + (c2 - 0x21)
  80. s1 := (t1 < 0x1F ? t1+0x81 : t1+0xC1)
  81. s2 := (t2 < 0x3F ? t2+0x40 : t2+0x41)
  82. */
  83. static int
  84. cp932_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
  85. {
  86. unsigned char c = *s;
  87. if (c < 0x80)
  88. return ascii_mbtowc(conv,pwc,s,n);
  89. else if (c >= 0xa1 && c <= 0xdf)
  90. return jisx0201_mbtowc(conv,pwc,s,n);
  91. else {
  92. unsigned char s1, s2;
  93. s1 = c;
  94. if ((s1 >= 0x81 && s1 <= 0x9f && s1 != 0x87) || (s1 >= 0xe0 && s1 <= 0xea)) {
  95. if (n < 2)
  96. return RET_TOOFEW(0);
  97. s2 = s[1];
  98. if ((s2 >= 0x40 && s2 <= 0x7e) || (s2 >= 0x80 && s2 <= 0xfc)) {
  99. unsigned char t1 = (s1 < 0xe0 ? s1-0x81 : s1-0xc1);
  100. unsigned char t2 = (s2 < 0x80 ? s2-0x40 : s2-0x41);
  101. unsigned char buf[2];
  102. buf[0] = 2*t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
  103. buf[1] = (t2 < 0x5e ? t2 : t2-0x5e) + 0x21;
  104. return jisx0208_mbtowc(conv,pwc,buf,2);
  105. }
  106. } else if ((s1 == 0x87) || (s1 >= 0xed && s1 <= 0xee) || (s1 >= 0xfa)) {
  107. if (n < 2)
  108. return RET_TOOFEW(0);
  109. return cp932ext_mbtowc(conv,pwc,s,2);
  110. } else if (s1 >= 0xf0 && s1 <= 0xf9) {
  111. /* User-defined range. See
  112. * Ken Lunde's "CJKV Information Processing", table 4-66, p. 206. */
  113. if (n < 2)
  114. return RET_TOOFEW(0);
  115. s2 = s[1];
  116. if ((s2 >= 0x40 && s2 <= 0x7e) || (s2 >= 0x80 && s2 <= 0xfc)) {
  117. *pwc = 0xe000 + 188*(s1 - 0xf0) + (s2 < 0x80 ? s2-0x40 : s2-0x41);
  118. return 2;
  119. }
  120. }
  121. return RET_ILSEQ;
  122. }
  123. }
  124. static int
  125. cp932_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
  126. {
  127. unsigned char buf[2];
  128. int ret;
  129. /* Try ASCII. */
  130. ret = ascii_wctomb(conv,buf,wc,1);
  131. if (ret != RET_ILUNI) {
  132. unsigned char c;
  133. if (ret != 1) abort();
  134. c = buf[0];
  135. if (c < 0x80) {
  136. r[0] = c;
  137. return 1;
  138. }
  139. }
  140. /* Try JIS X 0201-1976 Katakana. */
  141. ret = jisx0201_wctomb(conv,buf,wc,1);
  142. if (ret != RET_ILUNI) {
  143. unsigned char c;
  144. if (ret != 1) abort();
  145. c = buf[0];
  146. if (c >= 0xa1 && c <= 0xdf) {
  147. r[0] = c;
  148. return 1;
  149. }
  150. }
  151. /* Try JIS X 0208-1990. */
  152. ret = jisx0208_wctomb(conv,buf,wc,2);
  153. if (ret != RET_ILUNI) {
  154. unsigned char c1, c2;
  155. if (ret != 2) abort();
  156. if (n < 2)
  157. return RET_TOOSMALL;
  158. c1 = buf[0];
  159. c2 = buf[1];
  160. if ((c1 >= 0x21 && c1 <= 0x74) && (c2 >= 0x21 && c2 <= 0x7e)) {
  161. unsigned char t1 = (c1 - 0x21) >> 1;
  162. unsigned char t2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
  163. r[0] = (t1 < 0x1f ? t1+0x81 : t1+0xc1);
  164. r[1] = (t2 < 0x3f ? t2+0x40 : t2+0x41);
  165. return 2;
  166. }
  167. }
  168. /* Try CP932 extensions. */
  169. ret = cp932ext_wctomb(conv,buf,wc,2);
  170. if (ret != RET_ILUNI) {
  171. if (ret != 2) abort();
  172. if (n < 2)
  173. return RET_TOOSMALL;
  174. r[0] = buf[0];
  175. r[1] = buf[1];
  176. return 2;
  177. }
  178. /* User-defined range. See
  179. * Ken Lunde's "CJKV Information Processing", table 4-66, p. 206. */
  180. if (wc >= 0xe000 && wc < 0xe758) {
  181. unsigned char c1, c2;
  182. if (n < 2)
  183. return RET_TOOSMALL;
  184. c1 = (unsigned int) (wc - 0xe000) / 188;
  185. c2 = (unsigned int) (wc - 0xe000) % 188;
  186. r[0] = c1+0xf0;
  187. r[1] = (c2 < 0x3f ? c2+0x40 : c2+0x41);
  188. return 2;
  189. }
  190. /* Irreversible mappings. */
  191. if (wc == 0xff5e) {
  192. if (n < 2)
  193. return RET_TOOSMALL;
  194. r[0] = 0x81;
  195. r[1] = 0x60;
  196. return 2;
  197. }
  198. if (wc == 0x2225) {
  199. if (n < 2)
  200. return RET_TOOSMALL;
  201. r[0] = 0x81;
  202. r[1] = 0x61;
  203. return 2;
  204. }
  205. if (wc == 0xff0d) {
  206. if (n < 2)
  207. return RET_TOOSMALL;
  208. r[0] = 0x81;
  209. r[1] = 0x7c;
  210. return 2;
  211. }
  212. if (wc == 0xffe0) {
  213. if (n < 2)
  214. return RET_TOOSMALL;
  215. r[0] = 0x81;
  216. r[1] = 0x91;
  217. return 2;
  218. }
  219. if (wc == 0xffe1) {
  220. if (n < 2)
  221. return RET_TOOSMALL;
  222. r[0] = 0x81;
  223. r[1] = 0x92;
  224. return 2;
  225. }
  226. return RET_ILUNI;
  227. }