loop_unicode.h 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. /*
  2. * Copyright (C) 1999-2003, 2005-2006, 2008 Free Software Foundation, Inc.
  3. * This file is part of the GNU LIBICONV Library.
  4. *
  5. * The GNU LIBICONV Library is free software; you can redistribute it
  6. * and/or modify it under the terms of the GNU Library General Public
  7. * License as published by the Free Software Foundation; either version 2
  8. * of the License, or (at your option) any later version.
  9. *
  10. * The GNU LIBICONV Library is distributed in the hope that it will be
  11. * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Library General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Library General Public
  16. * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
  17. * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
  18. * Fifth Floor, Boston, MA 02110-1301, USA.
  19. */
  20. /* This file defines the conversion loop via Unicode as a pivot encoding. */
  21. /* Attempt to transliterate wc. Return code as in xxx_wctomb. */
  22. static int unicode_transliterate (conv_t cd, ucs4_t wc,
  23. unsigned char* outptr, size_t outleft)
  24. {
  25. if (cd->oflags & HAVE_HANGUL_JAMO) {
  26. /* Decompose Hangul into Jamo. Use double-width Jamo (contained
  27. in all Korean encodings and ISO-2022-JP-2), not half-width Jamo
  28. (contained in Unicode only). */
  29. ucs4_t buf[3];
  30. int ret = johab_hangul_decompose(cd,buf,wc);
  31. if (ret != RET_ILUNI) {
  32. /* we know 1 <= ret <= 3 */
  33. state_t backup_state = cd->ostate;
  34. unsigned char* backup_outptr = outptr;
  35. size_t backup_outleft = outleft;
  36. int i, sub_outcount;
  37. for (i = 0; i < ret; i++) {
  38. if (outleft == 0) {
  39. sub_outcount = RET_TOOSMALL;
  40. goto johab_hangul_failed;
  41. }
  42. sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
  43. if (sub_outcount <= RET_ILUNI)
  44. goto johab_hangul_failed;
  45. if (!(sub_outcount <= outleft)) abort();
  46. outptr += sub_outcount; outleft -= sub_outcount;
  47. }
  48. return outptr-backup_outptr;
  49. johab_hangul_failed:
  50. cd->ostate = backup_state;
  51. outptr = backup_outptr;
  52. outleft = backup_outleft;
  53. if (sub_outcount != RET_ILUNI)
  54. return RET_TOOSMALL;
  55. }
  56. }
  57. {
  58. /* Try to use a variant, but postfix it with
  59. U+303E IDEOGRAPHIC VARIATION INDICATOR
  60. (cf. Ken Lunde's "CJKV information processing", p. 188). */
  61. int indx = -1;
  62. if (wc == 0x3006)
  63. indx = 0;
  64. else if (wc == 0x30f6)
  65. indx = 1;
  66. else if (wc >= 0x4e00 && wc < 0xa000)
  67. indx = cjk_variants_indx[wc-0x4e00];
  68. if (indx >= 0) {
  69. for (;; indx++) {
  70. ucs4_t buf[2];
  71. unsigned short variant = cjk_variants[indx];
  72. unsigned short last = variant & 0x8000;
  73. variant &= 0x7fff;
  74. variant += 0x3000;
  75. buf[0] = variant; buf[1] = 0x303e;
  76. {
  77. state_t backup_state = cd->ostate;
  78. unsigned char* backup_outptr = outptr;
  79. size_t backup_outleft = outleft;
  80. int i, sub_outcount;
  81. for (i = 0; i < 2; i++) {
  82. if (outleft == 0) {
  83. sub_outcount = RET_TOOSMALL;
  84. goto variant_failed;
  85. }
  86. sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,buf[i],outleft);
  87. if (sub_outcount <= RET_ILUNI)
  88. goto variant_failed;
  89. if (!(sub_outcount <= outleft)) abort();
  90. outptr += sub_outcount; outleft -= sub_outcount;
  91. }
  92. return outptr-backup_outptr;
  93. variant_failed:
  94. cd->ostate = backup_state;
  95. outptr = backup_outptr;
  96. outleft = backup_outleft;
  97. if (sub_outcount != RET_ILUNI)
  98. return RET_TOOSMALL;
  99. }
  100. if (last)
  101. break;
  102. }
  103. }
  104. }
  105. if (wc >= 0x2018 && wc <= 0x201a) {
  106. /* Special case for quotation marks 0x2018, 0x2019, 0x201a */
  107. ucs4_t substitute =
  108. (cd->oflags & HAVE_QUOTATION_MARKS
  109. ? (wc == 0x201a ? 0x2018 : wc)
  110. : (cd->oflags & HAVE_ACCENTS
  111. ? (wc==0x2019 ? 0x00b4 : 0x0060) /* use accents */
  112. : 0x0027 /* use apostrophe */
  113. ) );
  114. int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,substitute,outleft);
  115. if (outcount != RET_ILUNI)
  116. return outcount;
  117. }
  118. {
  119. /* Use the transliteration table. */
  120. int indx = translit_index(wc);
  121. if (indx >= 0) {
  122. const unsigned int * cp = &translit_data[indx];
  123. unsigned int num = *cp++;
  124. state_t backup_state = cd->ostate;
  125. unsigned char* backup_outptr = outptr;
  126. size_t backup_outleft = outleft;
  127. unsigned int i;
  128. int sub_outcount;
  129. for (i = 0; i < num; i++) {
  130. if (outleft == 0) {
  131. sub_outcount = RET_TOOSMALL;
  132. goto translit_failed;
  133. }
  134. sub_outcount = cd->ofuncs.xxx_wctomb(cd,outptr,cp[i],outleft);
  135. if (sub_outcount == RET_ILUNI)
  136. /* Recursive transliteration. */
  137. sub_outcount = unicode_transliterate(cd,cp[i],outptr,outleft);
  138. if (sub_outcount <= RET_ILUNI)
  139. goto translit_failed;
  140. if (!(sub_outcount <= outleft)) abort();
  141. outptr += sub_outcount; outleft -= sub_outcount;
  142. }
  143. return outptr-backup_outptr;
  144. translit_failed:
  145. cd->ostate = backup_state;
  146. outptr = backup_outptr;
  147. outleft = backup_outleft;
  148. if (sub_outcount != RET_ILUNI)
  149. return RET_TOOSMALL;
  150. }
  151. }
  152. return RET_ILUNI;
  153. }
  154. #ifndef LIBICONV_PLUG
  /* Bookkeeping passed to uc_to_mb_write_replacement through its
     callback_arg. */
  struct uc_to_mb_fallback_locals {
    unsigned char* l_outbuf;   /* where the next replacement bytes go */
    size_t l_outbytesleft;     /* room remaining at l_outbuf */
    int l_errno;               /* 0, or the error of the first failed write */
  };

  /* Append the BUFLEN bytes at BUF to the output described by CALLBACK_ARG
     (a struct uc_to_mb_fallback_locals).  If they do not fit, nothing is
     copied and l_errno becomes E2BIG.  Once l_errno is non-zero, every
     further call is a no-op. */
  static void uc_to_mb_write_replacement (const char *buf, size_t buflen,
                                          void* callback_arg)
  {
    struct uc_to_mb_fallback_locals * sink =
      (struct uc_to_mb_fallback_locals *) callback_arg;

    /* A previous call already failed: stay failed. */
    if (sink->l_errno != 0)
      return;

    /* Not enough room for the whole replacement. */
    if (buflen > sink->l_outbytesleft) {
      sink->l_errno = E2BIG;
      return;
    }

    memcpy(sink->l_outbuf, buf, buflen);
    sink->l_outbytesleft -= buflen;
    sink->l_outbuf += buflen;
  }
  177. struct mb_to_uc_fallback_locals {
  178. conv_t l_cd;
  179. unsigned char* l_outbuf;
  180. size_t l_outbytesleft;
  181. int l_errno;
  182. };
  183. static void mb_to_uc_write_replacement (const unsigned int *buf, size_t buflen,
  184. void* callback_arg)
  185. {
  186. struct mb_to_uc_fallback_locals * plocals =
  187. (struct mb_to_uc_fallback_locals *) callback_arg;
  188. /* Do nothing if already encountered an error in a previous call. */
  189. if (plocals->l_errno == 0) {
  190. /* Attempt to convert the passed buffer to the target encoding. */
  191. conv_t cd = plocals->l_cd;
  192. unsigned char* outptr = plocals->l_outbuf;
  193. size_t outleft = plocals->l_outbytesleft;
  194. for (; buflen > 0; buf++, buflen--) {
  195. ucs4_t wc = *buf;
  196. int outcount;
  197. if (outleft == 0) {
  198. plocals->l_errno = E2BIG;
  199. break;
  200. }
  201. outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
  202. if (outcount != RET_ILUNI)
  203. goto outcount_ok;
  204. /* Handle Unicode tag characters (range U+E0000..U+E007F). */
  205. if ((wc >> 7) == (0xe0000 >> 7))
  206. goto outcount_zero;
  207. /* Try transliteration. */
  208. if (cd->transliterate) {
  209. outcount = unicode_transliterate(cd,wc,outptr,outleft);
  210. if (outcount != RET_ILUNI)
  211. goto outcount_ok;
  212. }
  213. if (cd->discard_ilseq) {
  214. outcount = 0;
  215. goto outcount_ok;
  216. }
  217. #ifndef LIBICONV_PLUG
  218. else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
  219. struct uc_to_mb_fallback_locals locals;
  220. locals.l_outbuf = outptr;
  221. locals.l_outbytesleft = outleft;
  222. locals.l_errno = 0;
  223. cd->fallbacks.uc_to_mb_fallback(wc,
  224. uc_to_mb_write_replacement,
  225. &locals,
  226. cd->fallbacks.data);
  227. if (locals.l_errno != 0) {
  228. plocals->l_errno = locals.l_errno;
  229. break;
  230. }
  231. outptr = locals.l_outbuf;
  232. outleft = locals.l_outbytesleft;
  233. outcount = 0;
  234. goto outcount_ok;
  235. }
  236. #endif
  237. outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
  238. if (outcount != RET_ILUNI)
  239. goto outcount_ok;
  240. plocals->l_errno = EILSEQ;
  241. break;
  242. outcount_ok:
  243. if (outcount < 0) {
  244. plocals->l_errno = E2BIG;
  245. break;
  246. }
  247. #ifndef LIBICONV_PLUG
  248. if (cd->hooks.uc_hook)
  249. (*cd->hooks.uc_hook)(wc, cd->hooks.data);
  250. #endif
  251. if (!(outcount <= outleft)) abort();
  252. outptr += outcount; outleft -= outcount;
  253. outcount_zero: ;
  254. }
  255. plocals->l_outbuf = outptr;
  256. plocals->l_outbytesleft = outleft;
  257. }
  258. }
  259. #endif /* !LIBICONV_PLUG */
  260. static size_t unicode_loop_convert (iconv_t icd,
  261. const char* * inbuf, size_t *inbytesleft,
  262. char* * outbuf, size_t *outbytesleft)
  263. {
  264. conv_t cd = (conv_t) icd;
  265. size_t result = 0;
  266. const unsigned char* inptr = (const unsigned char*) *inbuf;
  267. size_t inleft = *inbytesleft;
  268. unsigned char* outptr = (unsigned char*) *outbuf;
  269. size_t outleft = *outbytesleft;
  270. while (inleft > 0) {
  271. state_t last_istate = cd->istate;
  272. ucs4_t wc;
  273. int incount;
  274. int outcount;
  275. incount = cd->ifuncs.xxx_mbtowc(cd,&wc,inptr,inleft);
  276. if (incount < 0) {
  277. if ((unsigned int)(-1-incount) % 2 == (unsigned int)(-1-RET_ILSEQ) % 2) {
  278. /* Case 1: invalid input, possibly after a shift sequence */
  279. incount = DECODE_SHIFT_ILSEQ(incount);
  280. if (cd->discard_ilseq) {
  281. switch (cd->iindex) {
  282. case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
  283. case ei_utf32: case ei_utf32be: case ei_utf32le:
  284. case ei_ucs4internal: case ei_ucs4swapped:
  285. incount += 4; break;
  286. case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
  287. case ei_utf16: case ei_utf16be: case ei_utf16le:
  288. case ei_ucs2internal: case ei_ucs2swapped:
  289. incount += 2; break;
  290. default:
  291. incount += 1; break;
  292. }
  293. goto outcount_zero;
  294. }
  295. #ifndef LIBICONV_PLUG
  296. else if (cd->fallbacks.mb_to_uc_fallback != NULL) {
  297. unsigned int incount2;
  298. struct mb_to_uc_fallback_locals locals;
  299. switch (cd->iindex) {
  300. case ei_ucs4: case ei_ucs4be: case ei_ucs4le:
  301. case ei_utf32: case ei_utf32be: case ei_utf32le:
  302. case ei_ucs4internal: case ei_ucs4swapped:
  303. incount2 = 4; break;
  304. case ei_ucs2: case ei_ucs2be: case ei_ucs2le:
  305. case ei_utf16: case ei_utf16be: case ei_utf16le:
  306. case ei_ucs2internal: case ei_ucs2swapped:
  307. incount2 = 2; break;
  308. default:
  309. incount2 = 1; break;
  310. }
  311. locals.l_cd = cd;
  312. locals.l_outbuf = outptr;
  313. locals.l_outbytesleft = outleft;
  314. locals.l_errno = 0;
  315. cd->fallbacks.mb_to_uc_fallback((const char*)inptr+incount, incount2,
  316. mb_to_uc_write_replacement,
  317. &locals,
  318. cd->fallbacks.data);
  319. if (locals.l_errno != 0) {
  320. inptr += incount; inleft -= incount;
  321. errno = locals.l_errno;
  322. result = -1;
  323. break;
  324. }
  325. incount += incount2;
  326. outptr = locals.l_outbuf;
  327. outleft = locals.l_outbytesleft;
  328. result += 1;
  329. goto outcount_zero;
  330. }
  331. #endif
  332. inptr += incount; inleft -= incount;
  333. errno = EILSEQ;
  334. result = -1;
  335. break;
  336. }
  337. if (incount == RET_TOOFEW(0)) {
  338. /* Case 2: not enough bytes available to detect anything */
  339. errno = EINVAL;
  340. result = -1;
  341. break;
  342. }
  343. /* Case 3: k bytes read, but only a shift sequence */
  344. incount = DECODE_TOOFEW(incount);
  345. } else {
  346. /* Case 4: k bytes read, making up a wide character */
  347. if (outleft == 0) {
  348. cd->istate = last_istate;
  349. errno = E2BIG;
  350. result = -1;
  351. break;
  352. }
  353. outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
  354. if (outcount != RET_ILUNI)
  355. goto outcount_ok;
  356. /* Handle Unicode tag characters (range U+E0000..U+E007F). */
  357. if ((wc >> 7) == (0xe0000 >> 7))
  358. goto outcount_zero;
  359. /* Try transliteration. */
  360. result++;
  361. if (cd->transliterate) {
  362. outcount = unicode_transliterate(cd,wc,outptr,outleft);
  363. if (outcount != RET_ILUNI)
  364. goto outcount_ok;
  365. }
  366. if (cd->discard_ilseq) {
  367. outcount = 0;
  368. goto outcount_ok;
  369. }
  370. #ifndef LIBICONV_PLUG
  371. else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
  372. struct uc_to_mb_fallback_locals locals;
  373. locals.l_outbuf = outptr;
  374. locals.l_outbytesleft = outleft;
  375. locals.l_errno = 0;
  376. cd->fallbacks.uc_to_mb_fallback(wc,
  377. uc_to_mb_write_replacement,
  378. &locals,
  379. cd->fallbacks.data);
  380. if (locals.l_errno != 0) {
  381. cd->istate = last_istate;
  382. errno = locals.l_errno;
  383. return -1;
  384. }
  385. outptr = locals.l_outbuf;
  386. outleft = locals.l_outbytesleft;
  387. outcount = 0;
  388. goto outcount_ok;
  389. }
  390. #endif
  391. outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
  392. if (outcount != RET_ILUNI)
  393. goto outcount_ok;
  394. cd->istate = last_istate;
  395. errno = EILSEQ;
  396. result = -1;
  397. break;
  398. outcount_ok:
  399. if (outcount < 0) {
  400. cd->istate = last_istate;
  401. errno = E2BIG;
  402. result = -1;
  403. break;
  404. }
  405. #ifndef LIBICONV_PLUG
  406. if (cd->hooks.uc_hook)
  407. (*cd->hooks.uc_hook)(wc, cd->hooks.data);
  408. #endif
  409. if (!(outcount <= outleft)) abort();
  410. outptr += outcount; outleft -= outcount;
  411. }
  412. outcount_zero:
  413. if (!(incount <= inleft)) abort();
  414. inptr += incount; inleft -= incount;
  415. }
  416. *inbuf = (const char*) inptr;
  417. *inbytesleft = inleft;
  418. *outbuf = (char*) outptr;
  419. *outbytesleft = outleft;
  420. return result;
  421. }
  422. static size_t unicode_loop_reset (iconv_t icd,
  423. char* * outbuf, size_t *outbytesleft)
  424. {
  425. conv_t cd = (conv_t) icd;
  426. if (outbuf == NULL || *outbuf == NULL) {
  427. /* Reset the states. */
  428. memset(&cd->istate,'\0',sizeof(state_t));
  429. memset(&cd->ostate,'\0',sizeof(state_t));
  430. return 0;
  431. } else {
  432. size_t result = 0;
  433. if (cd->ifuncs.xxx_flushwc) {
  434. state_t last_istate = cd->istate;
  435. ucs4_t wc;
  436. if (cd->ifuncs.xxx_flushwc(cd, &wc)) {
  437. unsigned char* outptr = (unsigned char*) *outbuf;
  438. size_t outleft = *outbytesleft;
  439. int outcount = cd->ofuncs.xxx_wctomb(cd,outptr,wc,outleft);
  440. if (outcount != RET_ILUNI)
  441. goto outcount_ok;
  442. /* Handle Unicode tag characters (range U+E0000..U+E007F). */
  443. if ((wc >> 7) == (0xe0000 >> 7))
  444. goto outcount_zero;
  445. /* Try transliteration. */
  446. result++;
  447. if (cd->transliterate) {
  448. outcount = unicode_transliterate(cd,wc,outptr,outleft);
  449. if (outcount != RET_ILUNI)
  450. goto outcount_ok;
  451. }
  452. if (cd->discard_ilseq) {
  453. outcount = 0;
  454. goto outcount_ok;
  455. }
  456. #ifndef LIBICONV_PLUG
  457. else if (cd->fallbacks.uc_to_mb_fallback != NULL) {
  458. struct uc_to_mb_fallback_locals locals;
  459. locals.l_outbuf = outptr;
  460. locals.l_outbytesleft = outleft;
  461. locals.l_errno = 0;
  462. cd->fallbacks.uc_to_mb_fallback(wc,
  463. uc_to_mb_write_replacement,
  464. &locals,
  465. cd->fallbacks.data);
  466. if (locals.l_errno != 0) {
  467. cd->istate = last_istate;
  468. errno = locals.l_errno;
  469. return -1;
  470. }
  471. outptr = locals.l_outbuf;
  472. outleft = locals.l_outbytesleft;
  473. outcount = 0;
  474. goto outcount_ok;
  475. }
  476. #endif
  477. outcount = cd->ofuncs.xxx_wctomb(cd,outptr,0xFFFD,outleft);
  478. if (outcount != RET_ILUNI)
  479. goto outcount_ok;
  480. cd->istate = last_istate;
  481. errno = EILSEQ;
  482. return -1;
  483. outcount_ok:
  484. if (outcount < 0) {
  485. cd->istate = last_istate;
  486. errno = E2BIG;
  487. return -1;
  488. }
  489. #ifndef LIBICONV_PLUG
  490. if (cd->hooks.uc_hook)
  491. (*cd->hooks.uc_hook)(wc, cd->hooks.data);
  492. #endif
  493. if (!(outcount <= outleft)) abort();
  494. outptr += outcount;
  495. outleft -= outcount;
  496. outcount_zero:
  497. *outbuf = (char*) outptr;
  498. *outbytesleft = outleft;
  499. }
  500. }
  501. if (cd->ofuncs.xxx_reset) {
  502. unsigned char* outptr = (unsigned char*) *outbuf;
  503. size_t outleft = *outbytesleft;
  504. int outcount = cd->ofuncs.xxx_reset(cd,outptr,outleft);
  505. if (outcount < 0) {
  506. errno = E2BIG;
  507. return -1;
  508. }
  509. if (!(outcount <= outleft)) abort();
  510. *outbuf = (char*) (outptr + outcount);
  511. *outbytesleft = outleft - outcount;
  512. }
  513. memset(&cd->istate,'\0',sizeof(state_t));
  514. memset(&cd->ostate,'\0',sizeof(state_t));
  515. return result;
  516. }
  517. }