striconveh.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237
/* Character set conversion with error handling.
   Copyright (C) 2001-2022 Free Software Foundation, Inc.
   Written by Bruno Haible and Simon Josefsson.

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  14. #include <config.h>
  15. /* Specification. */
  16. #include "striconveh.h"
  17. #include <errno.h>
  18. #include <stdbool.h>
  19. #include <stdlib.h>
  20. #include <string.h>
  21. #if HAVE_ICONV
  22. # include <iconv.h>
  23. # include "unistr.h"
  24. #endif
  25. #include "c-strcase.h"
  26. #include "c-strcaseeq.h"
  27. #ifndef SIZE_MAX
  28. # define SIZE_MAX ((size_t) -1)
  29. #endif
  30. #if HAVE_ICONV
  31. /* The caller must provide an iconveh_t, not just an iconv_t, because when a
  32. conversion error occurs, we may have to determine the Unicode representation
  33. of the inconvertible character. */
  34. int
  35. iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
  36. {
  37. iconv_t cd;
  38. iconv_t cd1;
  39. iconv_t cd2;
  40. /* Avoid glibc-2.1 bug with EUC-KR. */
  41. # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  42. && !defined _LIBICONV_VERSION
  43. if (c_strcasecmp (from_codeset, "EUC-KR") == 0
  44. || c_strcasecmp (to_codeset, "EUC-KR") == 0)
  45. {
  46. errno = EINVAL;
  47. return -1;
  48. }
  49. # endif
  50. cd = iconv_open (to_codeset, from_codeset);
  51. if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
  52. cd1 = (iconv_t)(-1);
  53. else
  54. {
  55. cd1 = iconv_open ("UTF-8", from_codeset);
  56. if (cd1 == (iconv_t)(-1))
  57. {
  58. int saved_errno = errno;
  59. if (cd != (iconv_t)(-1))
  60. iconv_close (cd);
  61. errno = saved_errno;
  62. return -1;
  63. }
  64. }
  65. if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
  66. # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
  67. && !defined __UCLIBC__) \
  68. || _LIBICONV_VERSION >= 0x0105
  69. || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
  70. # endif
  71. )
  72. cd2 = (iconv_t)(-1);
  73. else
  74. {
  75. cd2 = iconv_open (to_codeset, "UTF-8");
  76. if (cd2 == (iconv_t)(-1))
  77. {
  78. int saved_errno = errno;
  79. if (cd1 != (iconv_t)(-1))
  80. iconv_close (cd1);
  81. if (cd != (iconv_t)(-1))
  82. iconv_close (cd);
  83. errno = saved_errno;
  84. return -1;
  85. }
  86. }
  87. cdp->cd = cd;
  88. cdp->cd1 = cd1;
  89. cdp->cd2 = cd2;
  90. return 0;
  91. }
  92. int
  93. iconveh_close (const iconveh_t *cd)
  94. {
  95. if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
  96. {
  97. /* Return -1, but preserve the errno from iconv_close. */
  98. int saved_errno = errno;
  99. if (cd->cd1 != (iconv_t)(-1))
  100. iconv_close (cd->cd1);
  101. if (cd->cd != (iconv_t)(-1))
  102. iconv_close (cd->cd);
  103. errno = saved_errno;
  104. return -1;
  105. }
  106. if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
  107. {
  108. /* Return -1, but preserve the errno from iconv_close. */
  109. int saved_errno = errno;
  110. if (cd->cd != (iconv_t)(-1))
  111. iconv_close (cd->cd);
  112. errno = saved_errno;
  113. return -1;
  114. }
  115. if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
  116. return -1;
  117. return 0;
  118. }
  119. /* iconv_carefully is like iconv, except that it stops as soon as it encounters
  120. a conversion error, and it returns in *INCREMENTED a boolean telling whether
  121. it has incremented the input pointers past the error location. */
  122. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  123. /* Irix iconv() inserts a NUL byte if it cannot convert.
  124. NetBSD iconv() inserts a question mark if it cannot convert.
  125. Only GNU libiconv and GNU libc are known to prefer to fail rather
  126. than doing a lossy conversion. */
  127. static size_t
  128. iconv_carefully (iconv_t cd,
  129. const char **inbuf, size_t *inbytesleft,
  130. char **outbuf, size_t *outbytesleft,
  131. bool *incremented)
  132. {
  133. const char *inptr = *inbuf;
  134. const char *inptr_end = inptr + *inbytesleft;
  135. char *outptr = *outbuf;
  136. size_t outsize = *outbytesleft;
  137. const char *inptr_before;
  138. size_t res;
  139. do
  140. {
  141. size_t insize;
  142. inptr_before = inptr;
  143. res = (size_t)(-1);
  144. for (insize = 1; inptr + insize <= inptr_end; insize++)
  145. {
  146. res = iconv (cd,
  147. (ICONV_CONST char **) &inptr, &insize,
  148. &outptr, &outsize);
  149. if (!(res == (size_t)(-1) && errno == EINVAL))
  150. break;
  151. /* iconv can eat up a shift sequence but give EINVAL while attempting
  152. to convert the first character. E.g. libiconv does this. */
  153. if (inptr > inptr_before)
  154. {
  155. res = 0;
  156. break;
  157. }
  158. }
  159. if (res == 0)
  160. {
  161. *outbuf = outptr;
  162. *outbytesleft = outsize;
  163. }
  164. }
  165. while (res == 0 && inptr < inptr_end);
  166. *inbuf = inptr;
  167. *inbytesleft = inptr_end - inptr;
  168. if (res != (size_t)(-1) && res > 0)
  169. {
  170. /* iconv() has already incremented INPTR. We cannot go back to a
  171. previous INPTR, otherwise the state inside CD would become invalid,
  172. if FROM_CODESET is a stateful encoding. So, tell the caller that
  173. *INBUF has already been incremented. */
  174. *incremented = (inptr > inptr_before);
  175. errno = EILSEQ;
  176. return (size_t)(-1);
  177. }
  178. else
  179. {
  180. *incremented = false;
  181. return res;
  182. }
  183. }
  184. # else
  185. # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
  186. (*(incremented) = false, \
  187. iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
  188. # endif
  189. /* iconv_carefully_1 is like iconv_carefully, except that it stops after
  190. converting one character or one shift sequence. */
  191. static size_t
  192. iconv_carefully_1 (iconv_t cd,
  193. const char **inbuf, size_t *inbytesleft,
  194. char **outbuf, size_t *outbytesleft,
  195. bool *incremented)
  196. {
  197. const char *inptr_before = *inbuf;
  198. const char *inptr = inptr_before;
  199. const char *inptr_end = inptr_before + *inbytesleft;
  200. char *outptr = *outbuf;
  201. size_t outsize = *outbytesleft;
  202. size_t res = (size_t)(-1);
  203. size_t insize;
  204. for (insize = 1; inptr_before + insize <= inptr_end; insize++)
  205. {
  206. inptr = inptr_before;
  207. res = iconv (cd,
  208. (ICONV_CONST char **) &inptr, &insize,
  209. &outptr, &outsize);
  210. if (!(res == (size_t)(-1) && errno == EINVAL))
  211. break;
  212. /* iconv can eat up a shift sequence but give EINVAL while attempting
  213. to convert the first character. E.g. libiconv does this. */
  214. if (inptr > inptr_before)
  215. {
  216. res = 0;
  217. break;
  218. }
  219. }
  220. *inbuf = inptr;
  221. *inbytesleft = inptr_end - inptr;
  222. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  223. /* Irix iconv() inserts a NUL byte if it cannot convert.
  224. NetBSD iconv() inserts a question mark if it cannot convert.
  225. Only GNU libiconv and GNU libc are known to prefer to fail rather
  226. than doing a lossy conversion. */
  227. if (res != (size_t)(-1) && res > 0)
  228. {
  229. /* iconv() has already incremented INPTR. We cannot go back to a
  230. previous INPTR, otherwise the state inside CD would become invalid,
  231. if FROM_CODESET is a stateful encoding. So, tell the caller that
  232. *INBUF has already been incremented. */
  233. *incremented = (inptr > inptr_before);
  234. errno = EILSEQ;
  235. return (size_t)(-1);
  236. }
  237. # endif
  238. if (res != (size_t)(-1))
  239. {
  240. *outbuf = outptr;
  241. *outbytesleft = outsize;
  242. }
  243. *incremented = false;
  244. return res;
  245. }
  246. /* utf8conv_carefully is like iconv, except that
  247. - it converts from UTF-8 to UTF-8,
  248. - it stops as soon as it encounters a conversion error, and it returns
  249. in *INCREMENTED a boolean telling whether it has incremented the input
  250. pointers past the error location,
  251. - if one_character_only is true, it stops after converting one
  252. character. */
  253. static size_t
  254. utf8conv_carefully (bool one_character_only,
  255. const char **inbuf, size_t *inbytesleft,
  256. char **outbuf, size_t *outbytesleft,
  257. bool *incremented)
  258. {
  259. const char *inptr = *inbuf;
  260. size_t insize = *inbytesleft;
  261. char *outptr = *outbuf;
  262. size_t outsize = *outbytesleft;
  263. size_t res;
  264. res = 0;
  265. do
  266. {
  267. ucs4_t uc;
  268. int n;
  269. int m;
  270. n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
  271. if (n < 0)
  272. {
  273. errno = (n == -2 ? EINVAL : EILSEQ);
  274. n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
  275. inptr += n;
  276. insize -= n;
  277. res = (size_t)(-1);
  278. *incremented = true;
  279. break;
  280. }
  281. if (outsize == 0)
  282. {
  283. errno = E2BIG;
  284. res = (size_t)(-1);
  285. *incremented = false;
  286. break;
  287. }
  288. m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
  289. if (m == -2)
  290. {
  291. errno = E2BIG;
  292. res = (size_t)(-1);
  293. *incremented = false;
  294. break;
  295. }
  296. inptr += n;
  297. insize -= n;
  298. if (m == -1)
  299. {
  300. errno = EILSEQ;
  301. res = (size_t)(-1);
  302. *incremented = true;
  303. break;
  304. }
  305. outptr += m;
  306. outsize -= m;
  307. }
  308. while (!one_character_only && insize > 0);
  309. *inbuf = inptr;
  310. *inbytesleft = insize;
  311. *outbuf = outptr;
  312. *outbytesleft = outsize;
  313. return res;
  314. }
  315. static int
  316. mem_cd_iconveh_internal (const char *src, size_t srclen,
  317. iconv_t cd, iconv_t cd1, iconv_t cd2,
  318. enum iconv_ilseq_handler handler,
  319. size_t extra_alloc,
  320. size_t *offsets,
  321. char **resultp, size_t *lengthp)
  322. {
  323. /* When a conversion error occurs, we cannot start using CD1 and CD2 at
  324. this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
  325. Instead, we have to start afresh from the beginning of SRC. */
  326. /* Use a temporary buffer, so that for small strings, a single malloc()
  327. call will be sufficient. */
  328. # define tmpbufsize 4096
  329. /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
  330. libiconv's UCS-4-INTERNAL encoding. */
  331. union { unsigned int align; char buf[tmpbufsize]; } tmp;
  332. # define tmpbuf tmp.buf
  333. char *initial_result;
  334. char *result;
  335. size_t allocated;
  336. size_t length;
  337. size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
  338. if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
  339. {
  340. initial_result = *resultp;
  341. allocated = *lengthp;
  342. }
  343. else
  344. {
  345. initial_result = tmpbuf;
  346. allocated = sizeof (tmpbuf);
  347. }
  348. result = initial_result;
  349. /* Test whether a direct conversion is possible at all. */
  350. if (cd == (iconv_t)(-1))
  351. goto indirectly;
  352. if (offsets != NULL)
  353. {
  354. size_t i;
  355. for (i = 0; i < srclen; i++)
  356. offsets[i] = (size_t)(-1);
  357. last_length = (size_t)(-1);
  358. }
  359. length = 0;
  360. /* First, try a direct conversion, and see whether a conversion error
  361. occurs at all. */
  362. {
  363. const char *inptr = src;
  364. size_t insize = srclen;
  365. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  366. # if defined _LIBICONV_VERSION \
  367. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  368. || defined __sun)
  369. /* Set to the initial state. */
  370. iconv (cd, NULL, NULL, NULL, NULL);
  371. # endif
  372. while (insize > 0)
  373. {
  374. char *outptr = result + length;
  375. size_t outsize = allocated - extra_alloc - length;
  376. bool incremented;
  377. size_t res;
  378. bool grow;
  379. if (offsets != NULL)
  380. {
  381. if (length != last_length) /* ensure that offset[] be increasing */
  382. {
  383. offsets[inptr - src] = length;
  384. last_length = length;
  385. }
  386. res = iconv_carefully_1 (cd,
  387. &inptr, &insize,
  388. &outptr, &outsize,
  389. &incremented);
  390. }
  391. else
  392. /* Use iconv_carefully instead of iconv here, because:
  393. - If TO_CODESET is UTF-8, we can do the error handling in this
  394. loop, no need for a second loop,
  395. - With iconv() implementations other than GNU libiconv and GNU
  396. libc, if we use iconv() in a big swoop, checking for an E2BIG
  397. return, we lose the number of irreversible conversions. */
  398. res = iconv_carefully (cd,
  399. &inptr, &insize,
  400. &outptr, &outsize,
  401. &incremented);
  402. length = outptr - result;
  403. grow = (length + extra_alloc > allocated / 2);
  404. if (res == (size_t)(-1))
  405. {
  406. if (errno == E2BIG)
  407. grow = true;
  408. else if (errno == EINVAL)
  409. break;
  410. else if (errno == EILSEQ && handler != iconveh_error)
  411. {
  412. if (cd2 == (iconv_t)(-1))
  413. {
  414. /* TO_CODESET is UTF-8. */
  415. /* Error handling can produce up to 1 or 3 bytes of
  416. output. */
  417. size_t extra_need =
  418. (handler == iconveh_replacement_character ? 3 : 1);
  419. if (length + extra_need + extra_alloc > allocated)
  420. {
  421. char *memory;
  422. allocated = 2 * allocated;
  423. if (length + extra_need + extra_alloc > allocated)
  424. allocated = 2 * allocated;
  425. if (length + extra_need + extra_alloc > allocated)
  426. abort ();
  427. if (result == initial_result)
  428. memory = (char *) malloc (allocated);
  429. else
  430. memory = (char *) realloc (result, allocated);
  431. if (memory == NULL)
  432. {
  433. if (result != initial_result)
  434. free (result);
  435. errno = ENOMEM;
  436. return -1;
  437. }
  438. if (result == initial_result)
  439. memcpy (memory, initial_result, length);
  440. result = memory;
  441. grow = false;
  442. }
  443. /* The input is invalid in FROM_CODESET. Eat up one byte
  444. and emit a replacement character or a question mark. */
  445. if (!incremented)
  446. {
  447. if (insize == 0)
  448. abort ();
  449. inptr++;
  450. insize--;
  451. }
  452. if (handler == iconveh_replacement_character)
  453. {
  454. /* U+FFFD in UTF-8 encoding. */
  455. result[length+0] = '\357';
  456. result[length+1] = '\277';
  457. result[length+2] = '\275';
  458. length += 3;
  459. }
  460. else
  461. {
  462. result[length] = '?';
  463. length++;
  464. }
  465. }
  466. else
  467. goto indirectly;
  468. }
  469. else
  470. {
  471. if (result != initial_result)
  472. free (result);
  473. return -1;
  474. }
  475. }
  476. if (insize == 0)
  477. break;
  478. if (grow)
  479. {
  480. char *memory;
  481. allocated = 2 * allocated;
  482. if (result == initial_result)
  483. memory = (char *) malloc (allocated);
  484. else
  485. memory = (char *) realloc (result, allocated);
  486. if (memory == NULL)
  487. {
  488. if (result != initial_result)
  489. free (result);
  490. errno = ENOMEM;
  491. return -1;
  492. }
  493. if (result == initial_result)
  494. memcpy (memory, initial_result, length);
  495. result = memory;
  496. }
  497. }
  498. }
  499. /* Now get the conversion state back to the initial state.
  500. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  501. #if defined _LIBICONV_VERSION \
  502. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  503. || defined __sun)
  504. for (;;)
  505. {
  506. char *outptr = result + length;
  507. size_t outsize = allocated - extra_alloc - length;
  508. size_t res;
  509. res = iconv (cd, NULL, NULL, &outptr, &outsize);
  510. length = outptr - result;
  511. if (res == (size_t)(-1))
  512. {
  513. if (errno == E2BIG)
  514. {
  515. char *memory;
  516. allocated = 2 * allocated;
  517. if (result == initial_result)
  518. memory = (char *) malloc (allocated);
  519. else
  520. memory = (char *) realloc (result, allocated);
  521. if (memory == NULL)
  522. {
  523. if (result != initial_result)
  524. free (result);
  525. errno = ENOMEM;
  526. return -1;
  527. }
  528. if (result == initial_result)
  529. memcpy (memory, initial_result, length);
  530. result = memory;
  531. }
  532. else
  533. {
  534. if (result != initial_result)
  535. free (result);
  536. return -1;
  537. }
  538. }
  539. else
  540. break;
  541. }
  542. #endif
  543. /* The direct conversion succeeded. */
  544. goto done;
  545. indirectly:
  546. /* The direct conversion failed.
  547. Use a conversion through UTF-8. */
  548. if (offsets != NULL)
  549. {
  550. size_t i;
  551. for (i = 0; i < srclen; i++)
  552. offsets[i] = (size_t)(-1);
  553. last_length = (size_t)(-1);
  554. }
  555. length = 0;
  556. {
  557. const bool slowly = (offsets != NULL || handler == iconveh_error);
  558. # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
  559. char utf8buf[utf8bufsize + 3];
  560. size_t utf8len = 0;
  561. const char *in1ptr = src;
  562. size_t in1size = srclen;
  563. bool do_final_flush1 = true;
  564. bool do_final_flush2 = true;
  565. /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
  566. # if defined _LIBICONV_VERSION \
  567. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  568. || defined __sun)
  569. /* Set to the initial state. */
  570. if (cd1 != (iconv_t)(-1))
  571. iconv (cd1, NULL, NULL, NULL, NULL);
  572. if (cd2 != (iconv_t)(-1))
  573. iconv (cd2, NULL, NULL, NULL, NULL);
  574. # endif
  575. while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
  576. {
  577. char *out1ptr = utf8buf + utf8len;
  578. size_t out1size = utf8bufsize - utf8len;
  579. bool incremented1;
  580. size_t res1;
  581. int errno1;
  582. /* Conversion step 1: from FROM_CODESET to UTF-8. */
  583. if (in1size > 0)
  584. {
  585. if (offsets != NULL
  586. && length != last_length) /* ensure that offset[] be increasing */
  587. {
  588. offsets[in1ptr - src] = length;
  589. last_length = length;
  590. }
  591. if (cd1 != (iconv_t)(-1))
  592. {
  593. if (slowly)
  594. res1 = iconv_carefully_1 (cd1,
  595. &in1ptr, &in1size,
  596. &out1ptr, &out1size,
  597. &incremented1);
  598. else
  599. res1 = iconv_carefully (cd1,
  600. &in1ptr, &in1size,
  601. &out1ptr, &out1size,
  602. &incremented1);
  603. }
  604. else
  605. {
  606. /* FROM_CODESET is UTF-8. */
  607. res1 = utf8conv_carefully (slowly,
  608. &in1ptr, &in1size,
  609. &out1ptr, &out1size,
  610. &incremented1);
  611. }
  612. }
  613. else if (do_final_flush1)
  614. {
  615. /* Now get the conversion state of CD1 back to the initial state.
  616. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  617. # if defined _LIBICONV_VERSION \
  618. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  619. || defined __sun)
  620. if (cd1 != (iconv_t)(-1))
  621. res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
  622. else
  623. # endif
  624. res1 = 0;
  625. do_final_flush1 = false;
  626. incremented1 = true;
  627. }
  628. else
  629. {
  630. res1 = 0;
  631. incremented1 = true;
  632. }
  633. if (res1 == (size_t)(-1)
  634. && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
  635. {
  636. if (result != initial_result)
  637. free (result);
  638. return -1;
  639. }
  640. if (res1 == (size_t)(-1)
  641. && errno == EILSEQ && handler != iconveh_error)
  642. {
  643. /* The input is invalid in FROM_CODESET. Eat up one byte and
  644. emit a U+FFFD character or a question mark. Room for this
  645. character was allocated at the end of utf8buf. */
  646. if (!incremented1)
  647. {
  648. if (in1size == 0)
  649. abort ();
  650. in1ptr++;
  651. in1size--;
  652. }
  653. if (handler == iconveh_replacement_character)
  654. {
  655. /* U+FFFD in UTF-8 encoding. */
  656. out1ptr[0] = '\357';
  657. out1ptr[1] = '\277';
  658. out1ptr[2] = '\275';
  659. out1ptr += 3;
  660. }
  661. else
  662. *out1ptr++ = '?';
  663. res1 = 0;
  664. }
  665. errno1 = errno;
  666. utf8len = out1ptr - utf8buf;
  667. if (offsets != NULL
  668. || in1size == 0
  669. || utf8len > utf8bufsize / 2
  670. || (res1 == (size_t)(-1) && errno1 == E2BIG))
  671. {
  672. /* Conversion step 2: from UTF-8 to TO_CODESET. */
  673. const char *in2ptr = utf8buf;
  674. size_t in2size = utf8len;
  675. while (in2size > 0
  676. || (in1size == 0 && !do_final_flush1 && do_final_flush2))
  677. {
  678. char *out2ptr = result + length;
  679. size_t out2size = allocated - extra_alloc - length;
  680. bool incremented2;
  681. size_t res2;
  682. bool grow;
  683. if (in2size > 0)
  684. {
  685. if (cd2 != (iconv_t)(-1))
  686. res2 = iconv_carefully (cd2,
  687. &in2ptr, &in2size,
  688. &out2ptr, &out2size,
  689. &incremented2);
  690. else
  691. /* TO_CODESET is UTF-8. */
  692. res2 = utf8conv_carefully (false,
  693. &in2ptr, &in2size,
  694. &out2ptr, &out2size,
  695. &incremented2);
  696. }
  697. else /* in1size == 0 && !do_final_flush1
  698. && in2size == 0 && do_final_flush2 */
  699. {
  700. /* Now get the conversion state of CD1 back to the initial
  701. state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
  702. # if defined _LIBICONV_VERSION \
  703. || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
  704. || defined __sun)
  705. if (cd2 != (iconv_t)(-1))
  706. res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
  707. else
  708. # endif
  709. res2 = 0;
  710. do_final_flush2 = false;
  711. incremented2 = true;
  712. }
  713. length = out2ptr - result;
  714. grow = (length + extra_alloc > allocated / 2);
  715. if (res2 == (size_t)(-1))
  716. {
  717. if (errno == E2BIG)
  718. grow = true;
  719. else if (errno == EINVAL)
  720. break;
  721. else if (errno == EILSEQ && handler != iconveh_error)
  722. {
  723. /* Error handling can produce up to 10 bytes of UTF-8
  724. output. But TO_CODESET may be UCS-2, UTF-16 or
  725. UCS-4, so use CD2 here as well. */
  726. char scratchbuf[10];
  727. size_t scratchlen;
  728. ucs4_t uc;
  729. const char *inptr;
  730. size_t insize;
  731. size_t res;
  732. if (incremented2)
  733. {
  734. if (u8_prev (&uc, (const uint8_t *) in2ptr,
  735. (const uint8_t *) utf8buf)
  736. == NULL)
  737. abort ();
  738. }
  739. else
  740. {
  741. int n;
  742. if (in2size == 0)
  743. abort ();
  744. n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
  745. in2size);
  746. in2ptr += n;
  747. in2size -= n;
  748. }
  749. if (handler == iconveh_escape_sequence)
  750. {
  751. static char hex[16] = "0123456789ABCDEF";
  752. scratchlen = 0;
  753. scratchbuf[scratchlen++] = '\\';
  754. if (uc < 0x10000)
  755. scratchbuf[scratchlen++] = 'u';
  756. else
  757. {
  758. scratchbuf[scratchlen++] = 'U';
  759. scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
  760. scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
  761. scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
  762. scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
  763. }
  764. scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
  765. scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
  766. scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
  767. scratchbuf[scratchlen++] = hex[uc & 15];
  768. }
  769. else if (handler == iconveh_replacement_character)
  770. {
  771. /* U+FFFD in UTF-8 encoding. */
  772. scratchbuf[0] = '\357';
  773. scratchbuf[1] = '\277';
  774. scratchbuf[2] = '\275';
  775. scratchlen = 3;
  776. }
  777. else
  778. {
  779. scratchbuf[0] = '?';
  780. scratchlen = 1;
  781. }
  782. inptr = scratchbuf;
  783. insize = scratchlen;
  784. if (cd2 != (iconv_t)(-1))
  785. {
  786. char *out2ptr_try = out2ptr;
  787. size_t out2size_try = out2size;
  788. res = iconv (cd2,
  789. (ICONV_CONST char **) &inptr, &insize,
  790. &out2ptr_try, &out2size_try);
  791. if (handler == iconveh_replacement_character
  792. && (res == (size_t)(-1)
  793. ? errno == EILSEQ
  794. /* FreeBSD iconv(), NetBSD iconv(), and
  795. Solaris 11 iconv() insert a '?' if they
  796. cannot convert. This is what we want.
  797. But IRIX iconv() inserts a NUL byte if it
  798. cannot convert.
  799. And musl libc iconv() inserts a '*' if it
  800. cannot convert. */
  801. : (res > 0
  802. && !(out2ptr_try - out2ptr == 1
  803. && *out2ptr == '?'))))
  804. {
  805. /* The iconv() call failed.
  806. U+FFFD can't be converted to TO_CODESET.
  807. Use '?' instead. */
  808. scratchbuf[0] = '?';
  809. scratchlen = 1;
  810. inptr = scratchbuf;
  811. insize = scratchlen;
  812. res = iconv (cd2,
  813. (ICONV_CONST char **) &inptr, &insize,
  814. &out2ptr, &out2size);
  815. }
  816. else
  817. {
  818. /* Accept the results of the iconv() call. */
  819. out2ptr = out2ptr_try;
  820. out2size = out2size_try;
  821. res = 0;
  822. }
  823. }
  824. else
  825. {
  826. /* TO_CODESET is UTF-8. */
  827. if (out2size >= insize)
  828. {
  829. memcpy (out2ptr, inptr, insize);
  830. out2ptr += insize;
  831. out2size -= insize;
  832. inptr += insize;
  833. insize = 0;
  834. res = 0;
  835. }
  836. else
  837. {
  838. errno = E2BIG;
  839. res = (size_t)(-1);
  840. }
  841. }
  842. length = out2ptr - result;
  843. if (res == (size_t)(-1) && errno == E2BIG)
  844. {
  845. char *memory;
  846. allocated = 2 * allocated;
  847. if (length + 1 + extra_alloc > allocated)
  848. abort ();
  849. if (result == initial_result)
  850. memory = (char *) malloc (allocated);
  851. else
  852. memory = (char *) realloc (result, allocated);
  853. if (memory == NULL)
  854. {
  855. if (result != initial_result)
  856. free (result);
  857. errno = ENOMEM;
  858. return -1;
  859. }
  860. if (result == initial_result)
  861. memcpy (memory, initial_result, length);
  862. result = memory;
  863. grow = false;
  864. out2ptr = result + length;
  865. out2size = allocated - extra_alloc - length;
  866. if (cd2 != (iconv_t)(-1))
  867. res = iconv (cd2,
  868. (ICONV_CONST char **) &inptr,
  869. &insize,
  870. &out2ptr, &out2size);
  871. else
  872. {
  873. /* TO_CODESET is UTF-8. */
  874. if (!(out2size >= insize))
  875. abort ();
  876. memcpy (out2ptr, inptr, insize);
  877. out2ptr += insize;
  878. out2size -= insize;
  879. inptr += insize;
  880. insize = 0;
  881. res = 0;
  882. }
  883. length = out2ptr - result;
  884. }
  885. # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
  886. /* IRIX iconv() inserts a NUL byte if it cannot convert.
  887. FreeBSD iconv(), NetBSD iconv(), and Solaris 11
  888. iconv() insert a '?' if they cannot convert.
  889. musl libc iconv() inserts a '*' if it cannot convert.
  890. Only GNU libiconv and GNU libc are known to prefer
  891. to fail rather than doing a lossy conversion. */
  892. if (res != (size_t)(-1) && res > 0)
  893. {
  894. errno = EILSEQ;
  895. res = (size_t)(-1);
  896. }
  897. # endif
  898. if (res == (size_t)(-1))
  899. {
  900. /* Failure converting the ASCII replacement. */
  901. if (result != initial_result)
  902. free (result);
  903. return -1;
  904. }
  905. }
  906. else
  907. {
  908. if (result != initial_result)
  909. free (result);
  910. return -1;
  911. }
  912. }
  913. if (!(in2size > 0
  914. || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
  915. break;
  916. if (grow)
  917. {
  918. char *memory;
  919. allocated = 2 * allocated;
  920. if (result == initial_result)
  921. memory = (char *) malloc (allocated);
  922. else
  923. memory = (char *) realloc (result, allocated);
  924. if (memory == NULL)
  925. {
  926. if (result != initial_result)
  927. free (result);
  928. errno = ENOMEM;
  929. return -1;
  930. }
  931. if (result == initial_result)
  932. memcpy (memory, initial_result, length);
  933. result = memory;
  934. }
  935. }
  936. /* Move the remaining bytes to the beginning of utf8buf. */
  937. if (in2size > 0)
  938. memmove (utf8buf, in2ptr, in2size);
  939. utf8len = in2size;
  940. }
  941. if (res1 == (size_t)(-1))
  942. {
  943. if (errno1 == EINVAL)
  944. in1size = 0;
  945. else if (errno1 == EILSEQ)
  946. {
  947. if (result != initial_result)
  948. free (result);
  949. errno = errno1;
  950. return -1;
  951. }
  952. }
  953. }
  954. # undef utf8bufsize
  955. }
  956. done:
  957. /* Now the final memory allocation. */
  958. if (result == tmpbuf)
  959. {
  960. size_t memsize = length + extra_alloc;
  961. if (*resultp != NULL && *lengthp >= memsize)
  962. result = *resultp;
  963. else
  964. {
  965. char *memory;
  966. memory = (char *) malloc (memsize > 0 ? memsize : 1);
  967. if (memory != NULL)
  968. result = memory;
  969. else
  970. {
  971. errno = ENOMEM;
  972. return -1;
  973. }
  974. }
  975. memcpy (result, tmpbuf, length);
  976. }
  977. else if (result != *resultp && length + extra_alloc < allocated)
  978. {
  979. /* Shrink the allocated memory if possible. */
  980. size_t memsize = length + extra_alloc;
  981. char *memory;
  982. memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
  983. if (memory != NULL)
  984. result = memory;
  985. }
  986. *resultp = result;
  987. *lengthp = length;
  988. return 0;
  989. # undef tmpbuf
  990. # undef tmpbufsize
  991. }
  992. int
  993. mem_cd_iconveh (const char *src, size_t srclen,
  994. const iconveh_t *cd,
  995. enum iconv_ilseq_handler handler,
  996. size_t *offsets,
  997. char **resultp, size_t *lengthp)
  998. {
  999. return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
  1000. handler, 0, offsets, resultp, lengthp);
  1001. }
  1002. char *
  1003. str_cd_iconveh (const char *src,
  1004. const iconveh_t *cd,
  1005. enum iconv_ilseq_handler handler)
  1006. {
  1007. /* For most encodings, a trailing NUL byte in the input will be converted
  1008. to a trailing NUL byte in the output. But not for UTF-7. So that this
  1009. function is usable for UTF-7, we have to exclude the NUL byte from the
  1010. conversion and add it by hand afterwards. */
  1011. char *result = NULL;
  1012. size_t length = 0;
  1013. int retval = mem_cd_iconveh_internal (src, strlen (src),
  1014. cd->cd, cd->cd1, cd->cd2, handler, 1,
  1015. NULL, &result, &length);
  1016. if (retval < 0)
  1017. {
  1018. free (result);
  1019. return NULL;
  1020. }
  1021. /* Add the terminating NUL byte. */
  1022. result[length] = '\0';
  1023. return result;
  1024. }
  1025. #endif
  1026. int
  1027. mem_iconveh (const char *src, size_t srclen,
  1028. const char *from_codeset, const char *to_codeset,
  1029. enum iconv_ilseq_handler handler,
  1030. size_t *offsets,
  1031. char **resultp, size_t *lengthp)
  1032. {
  1033. if (srclen == 0)
  1034. {
  1035. /* Nothing to convert. */
  1036. *lengthp = 0;
  1037. return 0;
  1038. }
  1039. else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
  1040. {
  1041. char *result;
  1042. if (*resultp != NULL && *lengthp >= srclen)
  1043. result = *resultp;
  1044. else
  1045. {
  1046. result = (char *) malloc (srclen);
  1047. if (result == NULL)
  1048. {
  1049. errno = ENOMEM;
  1050. return -1;
  1051. }
  1052. }
  1053. memcpy (result, src, srclen);
  1054. *resultp = result;
  1055. *lengthp = srclen;
  1056. return 0;
  1057. }
  1058. else
  1059. {
  1060. #if HAVE_ICONV
  1061. iconveh_t cd;
  1062. char *result;
  1063. size_t length;
  1064. int retval;
  1065. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1066. return -1;
  1067. result = *resultp;
  1068. length = *lengthp;
  1069. retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
  1070. &result, &length);
  1071. if (retval < 0)
  1072. {
  1073. /* Close cd, but preserve the errno from str_cd_iconv. */
  1074. int saved_errno = errno;
  1075. iconveh_close (&cd);
  1076. errno = saved_errno;
  1077. }
  1078. else
  1079. {
  1080. if (iconveh_close (&cd) < 0)
  1081. {
  1082. if (result != *resultp)
  1083. free (result);
  1084. return -1;
  1085. }
  1086. *resultp = result;
  1087. *lengthp = length;
  1088. }
  1089. return retval;
  1090. #else
  1091. /* This is a different error code than if iconv_open existed but didn't
  1092. support from_codeset and to_codeset, so that the caller can emit
  1093. an error message such as
  1094. "iconv() is not supported. Installing GNU libiconv and
  1095. then reinstalling this package would fix this." */
  1096. errno = ENOSYS;
  1097. return -1;
  1098. #endif
  1099. }
  1100. }
  1101. char *
  1102. str_iconveh (const char *src,
  1103. const char *from_codeset, const char *to_codeset,
  1104. enum iconv_ilseq_handler handler)
  1105. {
  1106. if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
  1107. {
  1108. char *result = strdup (src);
  1109. if (result == NULL)
  1110. errno = ENOMEM;
  1111. return result;
  1112. }
  1113. else
  1114. {
  1115. #if HAVE_ICONV
  1116. iconveh_t cd;
  1117. char *result;
  1118. if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
  1119. return NULL;
  1120. result = str_cd_iconveh (src, &cd, handler);
  1121. if (result == NULL)
  1122. {
  1123. /* Close cd, but preserve the errno from str_cd_iconv. */
  1124. int saved_errno = errno;
  1125. iconveh_close (&cd);
  1126. errno = saved_errno;
  1127. }
  1128. else
  1129. {
  1130. if (iconveh_close (&cd) < 0)
  1131. {
  1132. free (result);
  1133. return NULL;
  1134. }
  1135. }
  1136. return result;
  1137. #else
  1138. /* This is a different error code than if iconv_open existed but didn't
  1139. support from_codeset and to_codeset, so that the caller can emit
  1140. an error message such as
  1141. "iconv() is not supported. Installing GNU libiconv and
  1142. then reinstalling this package would fix this." */
  1143. errno = ENOSYS;
  1144. return NULL;
  1145. #endif
  1146. }
  1147. }