striconveh.c 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200
  1. /* Character set conversion with error handling.
  2. Copyright (C) 2001-2021 Free Software Foundation, Inc.
  3. Written by Bruno Haible and Simon Josefsson.
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU Lesser General Public License as published by
  6. the Free Software Foundation; either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public License
  13. along with this program. If not, see <https://www.gnu.org/licenses/>. */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "striconveh.h"
  17. #include <errno.h>
  18. #include <stdbool.h>
  19. #include <stdlib.h>
  20. #include <string.h>
  21. #if HAVE_ICONV
  22. # include <iconv.h>
  23. # include "unistr.h"
  24. #endif
  25. #include "c-strcase.h"
  26. #include "c-strcaseeq.h"
  27. #ifndef SIZE_MAX
  28. # define SIZE_MAX ((size_t) -1)
  29. #endif
  30. #if HAVE_ICONV
  31. /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  32. conversion error occurs, we may have to determine the Unicode representation
  33. of the inconvertible character. */
  34. int
  35. iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  36. {
  37. iconv_t cd;
  38. iconv_t cd1;
  39. iconv_t cd2;
  40. /* Avoid glibc-2.1 bug with EUC-KR. */
  41. # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  42. && !defined _LIBICONV_VERSION
  43. if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  44. || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  45. {
  46. errno = EINVAL;
  47. return -1;
  48. }
  49. # endif
  50. cd = iconv_open (to_codeset, from_codeset);
  51. if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  52. cd1 = (iconv_t)(-1);
  53. else
  54. {
  55. cd1 = iconv_open ("UTF-8", from_codeset);
  56. if (cd1 == (iconv_t)(-1))
  57. {
  58. int saved_errno = errno;
  59. if (cd != (iconv_t)(-1))
  60. iconv_close (cd);
  61. errno = saved_errno;
  62. return -1;
  63. }
  64. }
  65. if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  66. # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  67. && !defined __UCLIBC__) \
  68. || _LIBICONV_VERSION >= 0x0105
  69. || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  70. # endif
  71. )
  72. cd2 = (iconv_t)(-1);
  73. else
  74. {
  75. cd2 = iconv_open (to_codeset, "UTF-8");
  76. if (cd2 == (iconv_t)(-1))
  77. {
  78. int saved_errno = errno;
  79. if (cd1 != (iconv_t)(-1))
  80. iconv_close (cd1);
  81. if (cd != (iconv_t)(-1))
  82. iconv_close (cd);
  83. errno = saved_errno;
  84. return -1;
  85. }
  86. }
  87. cdp->cd = cd;
  88. cdp->cd1 = cd1;
  89. cdp->cd2 = cd2;
  90. return 0;
  91. }
  92. int
  93. iconveh_close (const iconveh_t *cd)
  94. {
  95. if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
  96. {
  97. /* Return -1, but preserve the errno from iconv_close. */
  98. int saved_errno = errno;
  99. if (cd->cd1 != (iconv_t)(-1))
  100. iconv_close (cd->cd1);
  101. if (cd->cd != (iconv_t)(-1))
  102. iconv_close (cd->cd);
  103. errno = saved_errno;
  104. return -1;
  105. }
  106. if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
  107. {
  108. /* Return -1, but preserve the errno from iconv_close. */
  109. int saved_errno = errno;
  110. if (cd->cd != (iconv_t)(-1))
  111. iconv_close (cd->cd);
  112. errno = saved_errno;
  113. return -1;
  114. }
  115. if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
  116. return -1;
  117. return 0;
  118. }
  /* iconv_carefully behaves like iconv, except that it stops as soon as it
     encounters a conversion error, and that it stores in *INCREMENTED a
     boolean telling whether the input pointer has already been advanced past
     the error location.  */
  # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  /* Irix iconv() inserts a NUL byte if it cannot convert.
     NetBSD iconv() inserts a question mark if it cannot convert.
     Only GNU libiconv and GNU libc are known to prefer to fail rather
     than doing a lossy conversion.
     Therefore the input is handed to iconv() in the smallest pieces it
     accepts, and a positive return value (the number of irreversible
     conversions) is turned into an EILSEQ failure.  */
  static size_t
  iconv_carefully (iconv_t cd,
                   const char **inbuf, size_t *inbytesleft,
                   char **outbuf, size_t *outbytesleft,
                   bool *incremented)
  {
    const char *const src_end = *inbuf + *inbytesleft;
    const char *src = *inbuf;
    const char *piece_start;
    char *dst = *outbuf;
    size_t dst_left = *outbytesleft;
    size_t rc;

    for (;;)
      {
        size_t piece_len = 1;

        piece_start = src;
        rc = (size_t)(-1);
        /* Offer 1, 2, 3, ... bytes until iconv() no longer reports an
           incomplete input sequence.  */
        while (src + piece_len <= src_end)
          {
            rc = iconv (cd,
                        (ICONV_CONST char **) &src, &piece_len,
                        &dst, &dst_left);
            if (rc != (size_t)(-1) || errno != EINVAL)
              break;
            /* iconv can eat up a shift sequence but give EINVAL while
               attempting to convert the first character.  E.g. libiconv
               does this.  */
            if (src > piece_start)
              {
                rc = 0;
                break;
              }
            piece_len++;
          }

        if (rc != 0)
          break;

        /* Only a clean step is committed to the caller's output state.  */
        *outbuf = dst;
        *outbytesleft = dst_left;

        if (src >= src_end)
          break;
      }

    *inbuf = src;
    *inbytesleft = src_end - src;

    if (rc == (size_t)(-1) || rc == 0)
      {
        *incremented = false;
        return rc;
      }

    /* iconv() has already incremented the input pointer.  We cannot go back
       to a previous position, otherwise the state inside CD would become
       invalid, if FROM_CODESET is a stateful encoding.  So, tell the caller
       that *INBUF has already been incremented.  */
    *incremented = (src > piece_start);
    errno = EILSEQ;
    return (size_t)(-1);
  }
  # else
  #  define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
       (*(incremented) = false, \
        iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
  # endif
  189. /* iconv_carefully_1 is like iconv_carefully, except that it stops after
  190. converting one character or one shift sequence. */
  191. static size_t
  192. iconv_carefully_1 (iconv_t cd,
  193. const char **inbuf, size_t *inbytesleft,
  194. char **outbuf, size_t *outbytesleft,
  195. bool *incremented)
  196. {
  197. const char *inptr_before = *inbuf;
  198. const char *inptr = inptr_before;
  199. const char *inptr_end = inptr_before + *inbytesleft;
  200. char *outptr = *outbuf;
  201. size_t outsize = *outbytesleft;
  202. size_t res = (size_t)(-1);
  203. size_t insize;
  204. for (insize = 1; inptr_before + insize <= inptr_end; insize++)
  205. {
  206. inptr = inptr_before;
  207. res = iconv (cd,
  208. (ICONV_CONST char **) &inptr, &insize,
  209. &outptr, &outsize);
  210. if (!(res == (size_t)(-1) && errno == EINVAL))
  211. break;
  212. /* iconv can eat up a shift sequence but give EINVAL while attempting
  213. to convert the first character. E.g. libiconv does this. */
  214. if (inptr > inptr_before)
  215. {
  216. res = 0;
  217. break;
  218. }
  219. }
  220. *inbuf = inptr;
  221. *inbytesleft = inptr_end - inptr;
  222. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  223. /* Irix iconv() inserts a NUL byte if it cannot convert.
  224. NetBSD iconv() inserts a question mark if it cannot convert.
  225. Only GNU libiconv and GNU libc are known to prefer to fail rather
  226. than doing a lossy conversion. */
  227. if (res != (size_t)(-1) && res > 0)
  228. {
  229. /* iconv() has already incremented INPTR. We cannot go back to a
  230. previous INPTR, otherwise the state inside CD would become invalid,
  231. if FROM_CODESET is a stateful encoding. So, tell the caller that
  232. *INBUF has already been incremented. */
  233. *incremented = (inptr > inptr_before);
  234. errno = EILSEQ;
  235. return (size_t)(-1);
  236. }
  237. # endif
  238. if (res != (size_t)(-1))
  239. {
  240. *outbuf = outptr;
  241. *outbytesleft = outsize;
  242. }
  243. *incremented = false;
  244. return res;
  245. }
  246. /* utf8conv_carefully is like iconv, except that
  247. - it converts from UTF-8 to UTF-8,
  248. - it stops as soon as it encounters a conversion error, and it returns
  249. in *INCREMENTED a boolean telling whether it has incremented the input
  250. pointers past the error location,
  251. - if one_character_only is true, it stops after converting one
  252. character. */
  253. static size_t
  254. utf8conv_carefully (bool one_character_only,
  255. const char **inbuf, size_t *inbytesleft,
  256. char **outbuf, size_t *outbytesleft,
  257. bool *incremented)
  258. {
  259. const char *inptr = *inbuf;
  260. size_t insize = *inbytesleft;
  261. char *outptr = *outbuf;
  262. size_t outsize = *outbytesleft;
  263. size_t res;
  264. res = 0;
  265. do
  266. {
  267. ucs4_t uc;
  268. int n;
  269. int m;
  270. n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
  271. if (n < 0)
  272. {
  273. errno = (n == -2 ? EINVAL : EILSEQ);
  274. n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
  275. inptr += n;
  276. insize -= n;
  277. res = (size_t)(-1);
  278. *incremented = true;
  279. break;
  280. }
  281. if (outsize == 0)
  282. {
  283. errno = E2BIG;
  284. res = (size_t)(-1);
  285. *incremented = false;
  286. break;
  287. }
  288. m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
  289. if (m == -2)
  290. {
  291. errno = E2BIG;
  292. res = (size_t)(-1);
  293. *incremented = false;
  294. break;
  295. }
  296. inptr += n;
  297. insize -= n;
  298. if (m == -1)
  299. {
  300. errno = EILSEQ;
  301. res = (size_t)(-1);
  302. *incremented = true;
  303. break;
  304. }
  305. outptr += m;
  306. outsize -= m;
  307. }
  308. while (!one_character_only && insize > 0);
  309. *inbuf = inptr;
  310. *inbytesleft = insize;
  311. *outbuf = outptr;
  312. *outbytesleft = outsize;
  313. return res;
  314. }
  /* Convert the SRCLEN bytes at SRC and return the result through *RESULTP
     and *LENGTHP.
     CD is the direct FROM_CODESET -> TO_CODESET descriptor; CD1 converts
     FROM_CODESET -> UTF-8 and CD2 converts UTF-8 -> TO_CODESET.  Each of
     them may be (iconv_t)(-1): without CD the two-step path through UTF-8
     is used right away; without CD1 the input is treated as UTF-8; without
     CD2 the output is treated as UTF-8.
     HANDLER selects what happens with a character that cannot be converted
     to TO_CODESET: iconveh_error makes the function fail,
     iconveh_escape_sequence emits a backslash, 'u' or 'U', and 4 or 8
     hexadecimal digits, any other value emits a question mark.  Input that
     is invalid in FROM_CODESET is replaced by a question mark unless
     HANDLER is iconveh_error.
     EXTRA_ALLOC is a number of bytes kept free after the converted data in
     the result buffer; they are not counted in *LENGTHP.
     OFFSETS, if not NULL, must have room for SRCLEN entries.  All entries
     are first set to (size_t)(-1); then, at input positions where a
     conversion step starts, the entry receives the output length reached
     so far (only when that length differs from the previously recorded
     one).
     On entry, *RESULTP and *LENGTHP may describe a caller-provided buffer,
     or *RESULTP may be NULL.  On success the function returns 0 and
     *RESULTP is either that same buffer or malloc'ed memory.  On failure it
     returns -1 with errno set and does not assign *RESULTP or *LENGTHP.  */
  315. static int
  316. mem_cd_iconveh_internal (const char *src, size_t srclen,
  317. iconv_t cd, iconv_t cd1, iconv_t cd2,
  318. enum iconv_ilseq_handler handler,
  319. size_t extra_alloc,
  320. size_t *offsets,
  321. char **resultp, size_t *lengthp)
  322. {
  323. /* When a conversion error occurs, we cannot start using CD1 and CD2 at
  324. this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
  325. Instead, we have to start afresh from the beginning of SRC. */
  326. /* Use a temporary buffer, so that for small strings, a single malloc()
  327. call will be sufficient. */
  328. # define tmpbufsize 4096
  329. /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
  330. libiconv's UCS-4-INTERNAL encoding. */
  331. union { unsigned int align; char buf[tmpbufsize]; } tmp;
  332. # define tmpbuf tmp.buf
  333. char *initial_result;
  334. char *result;
  335. size_t allocated;
  336. size_t length;
  337. size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
  /* Start in the caller's buffer only if it is at least as large as the
     stack buffer; otherwise start in the stack buffer.  */
  338. if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
  339. {
  340. initial_result = *resultp;
  341. allocated = *lengthp;
  342. }
  343. else
  344. {
  345. initial_result = tmpbuf;
  346. allocated = sizeof (tmpbuf);
  347. }
  348. result = initial_result;
  349. /* Test whether a direct conversion is possible at all. */
  350. if (cd == (iconv_t)(-1))
  351. goto indirectly;
  352. if (offsets != NULL)
  353. {
  354. size_t i;
  355. for (i = 0; i < srclen; i++)
  356. offsets[i] = (size_t)(-1);
  357. last_length = (size_t)(-1);
  358. }
  359. length = 0;
  360. /* First, try a direct conversion, and see whether a conversion error
  361. occurs at all. */
  362. {
  363. const char *inptr = src;
  364. size_t insize = srclen;
  365. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  366. # if defined _LIBICONV_VERSION \
  367. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  368. || defined __sun)
  369. /* Set to the initial state. */
  370. iconv (cd, NULL, NULL, NULL, NULL);
  371. # endif
  372. while (insize > 0)
  373. {
  374. char *outptr = result + length;
  375. size_t outsize = allocated - extra_alloc - length;
  376. bool incremented;
  377. size_t res;
  378. bool grow;
  379. if (offsets != NULL)
  380. {
  381. if (length != last_length) /* ensure that offset[] be increasing */
  382. {
  383. offsets[inptr - src] = length;
  384. last_length = length;
  385. }
  386. res = iconv_carefully_1 (cd,
  387. &inptr, &insize,
  388. &outptr, &outsize,
  389. &incremented);
  390. }
  391. else
  392. /* Use iconv_carefully instead of iconv here, because:
  393. - If TO_CODESET is UTF-8, we can do the error handling in this
  394. loop, no need for a second loop,
  395. - With iconv() implementations other than GNU libiconv and GNU
  396. libc, if we use iconv() in a big swoop, checking for an E2BIG
  397. return, we lose the number of irreversible conversions. */
  398. res = iconv_carefully (cd,
  399. &inptr, &insize,
  400. &outptr, &outsize,
  401. &incremented);
  402. length = outptr - result;
  /* Grow pre-emptively once more than half of the buffer is in use.  */
  403. grow = (length + extra_alloc > allocated / 2);
  404. if (res == (size_t)(-1))
  405. {
  406. if (errno == E2BIG)
  407. grow = true;
  408. else if (errno == EINVAL)
  409. break;
  410. else if (errno == EILSEQ && handler != iconveh_error)
  411. {
  412. if (cd2 == (iconv_t)(-1))
  413. {
  414. /* TO_CODESET is UTF-8. */
  415. /* Error handling can produce up to 1 byte of output. */
  416. if (length + 1 + extra_alloc > allocated)
  417. {
  418. char *memory;
  419. allocated = 2 * allocated;
  420. if (length + 1 + extra_alloc > allocated)
  421. abort ();
  422. if (result == initial_result)
  423. memory = (char *) malloc (allocated);
  424. else
  425. memory = (char *) realloc (result, allocated);
  426. if (memory == NULL)
  427. {
  428. if (result != initial_result)
  429. free (result);
  430. errno = ENOMEM;
  431. return -1;
  432. }
  433. if (result == initial_result)
  434. memcpy (memory, initial_result, length);
  435. result = memory;
  436. grow = false;
  437. }
  438. /* The input is invalid in FROM_CODESET. Eat up one byte
  439. and emit a question mark. */
  440. if (!incremented)
  441. {
  442. if (insize == 0)
  443. abort ();
  444. inptr++;
  445. insize--;
  446. }
  447. result[length] = '?';
  448. length++;
  449. }
  450. else
  451. goto indirectly;
  452. }
  453. else
  454. {
  455. if (result != initial_result)
  456. {
  457. int saved_errno = errno;
  458. free (result);
  459. errno = saved_errno;
  460. }
  461. return -1;
  462. }
  463. }
  464. if (insize == 0)
  465. break;
  466. if (grow)
  467. {
  468. char *memory;
  /* Double the buffer.  The initial buffer (stack or caller-owned) is
     never passed to realloc; its contents are copied into new heap
     memory instead.  */
  469. allocated = 2 * allocated;
  470. if (result == initial_result)
  471. memory = (char *) malloc (allocated);
  472. else
  473. memory = (char *) realloc (result, allocated);
  474. if (memory == NULL)
  475. {
  476. if (result != initial_result)
  477. free (result);
  478. errno = ENOMEM;
  479. return -1;
  480. }
  481. if (result == initial_result)
  482. memcpy (memory, initial_result, length);
  483. result = memory;
  484. }
  485. }
  486. }
  487. /* Now get the conversion state back to the initial state.
  488. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  489. #if defined _LIBICONV_VERSION \
  490. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  491. || defined __sun)
  492. for (;;)
  493. {
  494. char *outptr = result + length;
  495. size_t outsize = allocated - extra_alloc - length;
  496. size_t res;
  497. res = iconv (cd, NULL, NULL, &outptr, &outsize);
  498. length = outptr - result;
  499. if (res == (size_t)(-1))
  500. {
  501. if (errno == E2BIG)
  502. {
  503. char *memory;
  504. allocated = 2 * allocated;
  505. if (result == initial_result)
  506. memory = (char *) malloc (allocated);
  507. else
  508. memory = (char *) realloc (result, allocated);
  509. if (memory == NULL)
  510. {
  511. if (result != initial_result)
  512. free (result);
  513. errno = ENOMEM;
  514. return -1;
  515. }
  516. if (result == initial_result)
  517. memcpy (memory, initial_result, length);
  518. result = memory;
  519. }
  520. else
  521. {
  522. if (result != initial_result)
  523. {
  524. int saved_errno = errno;
  525. free (result);
  526. errno = saved_errno;
  527. }
  528. return -1;
  529. }
  530. }
  531. else
  532. break;
  533. }
  534. #endif
  535. /* The direct conversion succeeded. */
  536. goto done;
  537. indirectly:
  538. /* The direct conversion failed.
  539. Use a conversion through UTF-8. */
  540. if (offsets != NULL)
  541. {
  542. size_t i;
  543. for (i = 0; i < srclen; i++)
  544. offsets[i] = (size_t)(-1);
  545. last_length = (size_t)(-1);
  546. }
  547. length = 0;
  548. {
  /* SLOWLY selects one-character-at-a-time conversion in step 1; it is set
     when offsets are requested or when HANDLER is iconveh_error.  */
  549. const bool slowly = (offsets != NULL || handler == iconveh_error);
  550. # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
  /* The extra byte gives room for the question mark that step 1 appends
     after invalid input.  */
  551. char utf8buf[utf8bufsize + 1];
  552. size_t utf8len = 0;
  553. const char *in1ptr = src;
  554. size_t in1size = srclen;
  555. bool do_final_flush1 = true;
  556. bool do_final_flush2 = true;
  557. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  558. # if defined _LIBICONV_VERSION \
  559. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  560. || defined __sun)
  561. /* Set to the initial state. */
  562. if (cd1 != (iconv_t)(-1))
  563. iconv (cd1, NULL, NULL, NULL, NULL);
  564. if (cd2 != (iconv_t)(-1))
  565. iconv (cd2, NULL, NULL, NULL, NULL);
  566. # endif
  567. while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
  568. {
  569. char *out1ptr = utf8buf + utf8len;
  570. size_t out1size = utf8bufsize - utf8len;
  571. bool incremented1;
  572. size_t res1;
  573. int errno1;
  574. /* Conversion step 1: from FROM_CODESET to UTF-8. */
  575. if (in1size > 0)
  576. {
  577. if (offsets != NULL
  578. && length != last_length) /* ensure that offset[] be increasing */
  579. {
  580. offsets[in1ptr - src] = length;
  581. last_length = length;
  582. }
  583. if (cd1 != (iconv_t)(-1))
  584. {
  585. if (slowly)
  586. res1 = iconv_carefully_1 (cd1,
  587. &in1ptr, &in1size,
  588. &out1ptr, &out1size,
  589. &incremented1);
  590. else
  591. res1 = iconv_carefully (cd1,
  592. &in1ptr, &in1size,
  593. &out1ptr, &out1size,
  594. &incremented1);
  595. }
  596. else
  597. {
  598. /* FROM_CODESET is UTF-8. */
  599. res1 = utf8conv_carefully (slowly,
  600. &in1ptr, &in1size,
  601. &out1ptr, &out1size,
  602. &incremented1);
  603. }
  604. }
  605. else if (do_final_flush1)
  606. {
  607. /* Now get the conversion state of CD1 back to the initial state.
  608. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  609. # if defined _LIBICONV_VERSION \
  610. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  611. || defined __sun)
  612. if (cd1 != (iconv_t)(-1))
  613. res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
  614. else
  615. # endif
  616. res1 = 0;
  617. do_final_flush1 = false;
  618. incremented1 = true;
  619. }
  620. else
  621. {
  622. res1 = 0;
  623. incremented1 = true;
  624. }
  625. if (res1 == (size_t)(-1)
  626. && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
  627. {
  628. if (result != initial_result)
  629. {
  630. int saved_errno = errno;
  631. free (result);
  632. errno = saved_errno;
  633. }
  634. return -1;
  635. }
  636. if (res1 == (size_t)(-1)
  637. && errno == EILSEQ && handler != iconveh_error)
  638. {
  639. /* The input is invalid in FROM_CODESET. Eat up one byte and
  640. emit a question mark. Room for the question mark was allocated
  641. at the end of utf8buf. */
  642. if (!incremented1)
  643. {
  644. if (in1size == 0)
  645. abort ();
  646. in1ptr++;
  647. in1size--;
  648. }
  649. *out1ptr++ = '?';
  650. res1 = 0;
  651. }
  /* Save errno from step 1; step 2 below may overwrite it.  */
  652. errno1 = errno;
  653. utf8len = out1ptr - utf8buf;
  654. if (offsets != NULL
  655. || in1size == 0
  656. || utf8len > utf8bufsize / 2
  657. || (res1 == (size_t)(-1) && errno1 == E2BIG))
  658. {
  659. /* Conversion step 2: from UTF-8 to TO_CODESET. */
  660. const char *in2ptr = utf8buf;
  661. size_t in2size = utf8len;
  662. while (in2size > 0
  663. || (in1size == 0 && !do_final_flush1 && do_final_flush2))
  664. {
  665. char *out2ptr = result + length;
  666. size_t out2size = allocated - extra_alloc - length;
  667. bool incremented2;
  668. size_t res2;
  669. bool grow;
  670. if (in2size > 0)
  671. {
  672. if (cd2 != (iconv_t)(-1))
  673. res2 = iconv_carefully (cd2,
  674. &in2ptr, &in2size,
  675. &out2ptr, &out2size,
  676. &incremented2);
  677. else
  678. /* TO_CODESET is UTF-8. */
  679. res2 = utf8conv_carefully (false,
  680. &in2ptr, &in2size,
  681. &out2ptr, &out2size,
  682. &incremented2);
  683. }
  684. else /* in1size == 0 && !do_final_flush1
  685. && in2size == 0 && do_final_flush2 */
  686. {
  687. /* Now get the conversion state of CD2 back to the initial
  688. state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  689. # if defined _LIBICONV_VERSION \
  690. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  691. || defined __sun)
  692. if (cd2 != (iconv_t)(-1))
  693. res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
  694. else
  695. # endif
  696. res2 = 0;
  697. do_final_flush2 = false;
  698. incremented2 = true;
  699. }
  700. length = out2ptr - result;
  701. grow = (length + extra_alloc > allocated / 2);
  702. if (res2 == (size_t)(-1))
  703. {
  704. if (errno == E2BIG)
  705. grow = true;
  706. else if (errno == EINVAL)
  707. break;
  708. else if (errno == EILSEQ && handler != iconveh_error)
  709. {
  710. /* Error handling can produce up to 10 bytes of ASCII
  711. output. But TO_CODESET may be UCS-2, UTF-16 or
  712. UCS-4, so use CD2 here as well. */
  713. char scratchbuf[10];
  714. size_t scratchlen;
  715. ucs4_t uc;
  716. const char *inptr;
  717. size_t insize;
  718. size_t res;
  /* Fetch the offending character: it lies just before IN2PTR if the
     converter already stepped over it, otherwise at IN2PTR.  */
  719. if (incremented2)
  720. {
  721. if (u8_prev (&uc, (const uint8_t *) in2ptr,
  722. (const uint8_t *) utf8buf)
  723. == NULL)
  724. abort ();
  725. }
  726. else
  727. {
  728. int n;
  729. if (in2size == 0)
  730. abort ();
  731. n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
  732. in2size);
  733. in2ptr += n;
  734. in2size -= n;
  735. }
  736. if (handler == iconveh_escape_sequence)
  737. {
  738. static char hex[16] = "0123456789ABCDEF";
  739. scratchlen = 0;
  740. scratchbuf[scratchlen++] = '\\';
  741. if (uc < 0x10000)
  742. scratchbuf[scratchlen++] = 'u';
  743. else
  744. {
  745. scratchbuf[scratchlen++] = 'U';
  746. scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
  747. scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
  748. scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
  749. scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
  750. }
  751. scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
  752. scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
  753. scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
  754. scratchbuf[scratchlen++] = hex[uc & 15];
  755. }
  756. else
  757. {
  758. scratchbuf[0] = '?';
  759. scratchlen = 1;
  760. }
  761. inptr = scratchbuf;
  762. insize = scratchlen;
  763. if (cd2 != (iconv_t)(-1))
  764. res = iconv (cd2,
  765. (ICONV_CONST char **) &inptr, &insize,
  766. &out2ptr, &out2size);
  767. else
  768. {
  769. /* TO_CODESET is UTF-8. */
  770. if (out2size >= insize)
  771. {
  772. memcpy (out2ptr, inptr, insize);
  773. out2ptr += insize;
  774. out2size -= insize;
  775. inptr += insize;
  776. insize = 0;
  777. res = 0;
  778. }
  779. else
  780. {
  781. errno = E2BIG;
  782. res = (size_t)(-1);
  783. }
  784. }
  785. length = out2ptr - result;
  /* The replacement did not fit: enlarge the buffer once and retry.  */
  786. if (res == (size_t)(-1) && errno == E2BIG)
  787. {
  788. char *memory;
  789. allocated = 2 * allocated;
  790. if (length + 1 + extra_alloc > allocated)
  791. abort ();
  792. if (result == initial_result)
  793. memory = (char *) malloc (allocated);
  794. else
  795. memory = (char *) realloc (result, allocated);
  796. if (memory == NULL)
  797. {
  798. if (result != initial_result)
  799. free (result);
  800. errno = ENOMEM;
  801. return -1;
  802. }
  803. if (result == initial_result)
  804. memcpy (memory, initial_result, length);
  805. result = memory;
  806. grow = false;
  807. out2ptr = result + length;
  808. out2size = allocated - extra_alloc - length;
  809. if (cd2 != (iconv_t)(-1))
  810. res = iconv (cd2,
  811. (ICONV_CONST char **) &inptr,
  812. &insize,
  813. &out2ptr, &out2size);
  814. else
  815. {
  816. /* TO_CODESET is UTF-8. */
  817. if (!(out2size >= insize))
  818. abort ();
  819. memcpy (out2ptr, inptr, insize);
  820. out2ptr += insize;
  821. out2size -= insize;
  822. inptr += insize;
  823. insize = 0;
  824. res = 0;
  825. }
  826. length = out2ptr - result;
  827. }
  828. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  829. /* Irix iconv() inserts a NUL byte if it cannot convert.
  830. NetBSD iconv() inserts a question mark if it cannot
  831. convert.
  832. Only GNU libiconv and GNU libc are known to prefer
  833. to fail rather than doing a lossy conversion. */
  834. if (res != (size_t)(-1) && res > 0)
  835. {
  836. errno = EILSEQ;
  837. res = (size_t)(-1);
  838. }
  839. # endif
  840. if (res == (size_t)(-1))
  841. {
  842. /* Failure converting the ASCII replacement. */
  843. if (result != initial_result)
  844. {
  845. int saved_errno = errno;
  846. free (result);
  847. errno = saved_errno;
  848. }
  849. return -1;
  850. }
  851. }
  852. else
  853. {
  854. if (result != initial_result)
  855. {
  856. int saved_errno = errno;
  857. free (result);
  858. errno = saved_errno;
  859. }
  860. return -1;
  861. }
  862. }
  863. if (!(in2size > 0
  864. || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
  865. break;
  866. if (grow)
  867. {
  868. char *memory;
  869. allocated = 2 * allocated;
  870. if (result == initial_result)
  871. memory = (char *) malloc (allocated);
  872. else
  873. memory = (char *) realloc (result, allocated);
  874. if (memory == NULL)
  875. {
  876. if (result != initial_result)
  877. free (result);
  878. errno = ENOMEM;
  879. return -1;
  880. }
  881. if (result == initial_result)
  882. memcpy (memory, initial_result, length);
  883. result = memory;
  884. }
  885. }
  886. /* Move the remaining bytes to the beginning of utf8buf. */
  887. if (in2size > 0)
  888. memmove (utf8buf, in2ptr, in2size);
  889. utf8len = in2size;
  890. }
  /* Act on a step 1 failure only now, after the UTF-8 data gathered so
     far went through step 2 whenever the condition above held.  */
  891. if (res1 == (size_t)(-1))
  892. {
  893. if (errno1 == EINVAL)
  894. in1size = 0;
  895. else if (errno1 == EILSEQ)
  896. {
  897. if (result != initial_result)
  898. free (result);
  899. errno = errno1;
  900. return -1;
  901. }
  902. }
  903. }
  904. # undef utf8bufsize
  905. }
  906. done:
  907. /* Now the final memory allocation. */
  /* If the result still lives in the stack buffer, move it into the
     caller's buffer when that is large enough, else into new heap
     memory.  */
  908. if (result == tmpbuf)
  909. {
  910. size_t memsize = length + extra_alloc;
  911. if (*resultp != NULL && *lengthp >= memsize)
  912. result = *resultp;
  913. else
  914. {
  915. char *memory;
  916. memory = (char *) malloc (memsize > 0 ? memsize : 1);
  917. if (memory != NULL)
  918. result = memory;
  919. else
  920. {
  921. errno = ENOMEM;
  922. return -1;
  923. }
  924. }
  925. memcpy (result, tmpbuf, length);
  926. }
  927. else if (result != *resultp && length + extra_alloc < allocated)
  928. {
  929. /* Shrink the allocated memory if possible. */
  930. size_t memsize = length + extra_alloc;
  931. char *memory;
  932. memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
  933. if (memory != NULL)
  934. result = memory;
  935. }
  936. *resultp = result;
  937. *lengthp = length;
  938. return 0;
  939. # undef tmpbuf
  940. # undef tmpbufsize
  941. }
  942. int
  943. mem_cd_iconveh (const char *src, size_t srclen,
  944. const iconveh_t *cd,
  945. enum iconv_ilseq_handler handler,
  946. size_t *offsets,
  947. char **resultp, size_t *lengthp)
  948. {
  949. return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
  950. handler, 0, offsets, resultp, lengthp);
  951. }
  952. char *
  953. str_cd_iconveh (const char *src,
  954. const iconveh_t *cd,
  955. enum iconv_ilseq_handler handler)
  956. {
  957. /* For most encodings, a trailing NUL byte in the input will be converted
  958. to a trailing NUL byte in the output. But not for UTF-7. So that this
  959. function is usable for UTF-7, we have to exclude the NUL byte from the
  960. conversion and add it by hand afterwards. */
  961. char *result = NULL;
  962. size_t length = 0;
  963. int retval = mem_cd_iconveh_internal (src, strlen (src),
  964. cd->cd, cd->cd1, cd->cd2, handler, 1,
  965. NULL, &result, &length);
  966. if (retval < 0)
  967. {
  968. if (result != NULL)
  969. {
  970. int saved_errno = errno;
  971. free (result);
  972. errno = saved_errno;
  973. }
  974. return NULL;
  975. }
  976. /* Add the terminating NUL byte. */
  977. result[length] = '\0';
  978. return result;
  979. }
  980. #endif
  981. int
  982. mem_iconveh (const char *src, size_t srclen,
  983. const char *from_codeset, const char *to_codeset,
  984. enum iconv_ilseq_handler handler,
  985. size_t *offsets,
  986. char **resultp, size_t *lengthp)
  987. {
  988. if (srclen == 0)
  989. {
  990. /* Nothing to convert. */
  991. *lengthp = 0;
  992. return 0;
  993. }
  994. else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
  995. {
  996. char *result;
  997. if (*resultp != NULL && *lengthp >= srclen)
  998. result = *resultp;
  999. else
  1000. {
  1001. result = (char *) malloc (srclen);
  1002. if (result == NULL)
  1003. {
  1004. errno = ENOMEM;
  1005. return -1;
  1006. }
  1007. }
  1008. memcpy (result, src, srclen);
  1009. *resultp = result;
  1010. *lengthp = srclen;
  1011. return 0;
  1012. }
  1013. else
  1014. {
  1015. #if HAVE_ICONV
  1016. iconveh_t cd;
  1017. char *result;
  1018. size_t length;
  1019. int retval;
  1020. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1021. return -1;
  1022. result = *resultp;
  1023. length = *lengthp;
  1024. retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
  1025. &result, &length);
  1026. if (retval < 0)
  1027. {
  1028. /* Close cd, but preserve the errno from str_cd_iconv. */
  1029. int saved_errno = errno;
  1030. iconveh_close (&cd);
  1031. errno = saved_errno;
  1032. }
  1033. else
  1034. {
  1035. if (iconveh_close (&cd) < 0)
  1036. {
  1037. /* Return -1, but free the allocated memory, and while doing
  1038. that, preserve the errno from iconveh_close. */
  1039. int saved_errno = errno;
  1040. if (result != *resultp && result != NULL)
  1041. free (result);
  1042. errno = saved_errno;
  1043. return -1;
  1044. }
  1045. *resultp = result;
  1046. *lengthp = length;
  1047. }
  1048. return retval;
  1049. #else
  1050. /* This is a different error code than if iconv_open existed but didn't
  1051. support from_codeset and to_codeset, so that the caller can emit
  1052. an error message such as
  1053. "iconv() is not supported. Installing GNU libiconv and
  1054. then reinstalling this package would fix this." */
  1055. errno = ENOSYS;
  1056. return -1;
  1057. #endif
  1058. }
  1059. }
  1060. char *
  1061. str_iconveh (const char *src,
  1062. const char *from_codeset, const char *to_codeset,
  1063. enum iconv_ilseq_handler handler)
  1064. {
  1065. if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
  1066. {
  1067. char *result = strdup (src);
  1068. if (result == NULL)
  1069. errno = ENOMEM;
  1070. return result;
  1071. }
  1072. else
  1073. {
  1074. #if HAVE_ICONV
  1075. iconveh_t cd;
  1076. char *result;
  1077. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1078. return NULL;
  1079. result = str_cd_iconveh (src, &cd, handler);
  1080. if (result == NULL)
  1081. {
  1082. /* Close cd, but preserve the errno from str_cd_iconv. */
  1083. int saved_errno = errno;
  1084. iconveh_close (&cd);
  1085. errno = saved_errno;
  1086. }
  1087. else
  1088. {
  1089. if (iconveh_close (&cd) < 0)
  1090. {
  1091. /* Return NULL, but free the allocated memory, and while doing
  1092. that, preserve the errno from iconveh_close. */
  1093. int saved_errno = errno;
  1094. free (result);
  1095. errno = saved_errno;
  1096. return NULL;
  1097. }
  1098. }
  1099. return result;
  1100. #else
  1101. /* This is a different error code than if iconv_open existed but didn't
  1102. support from_codeset and to_codeset, so that the caller can emit
  1103. an error message such as
  1104. "iconv() is not supported. Installing GNU libiconv and
  1105. then reinstalling this package would fix this." */
  1106. errno = ENOSYS;
  1107. return NULL;
  1108. #endif
  1109. }
  1110. }