1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237 |
- /* Character set conversion with error handling.
- Copyright (C) 2001-2022 Free Software Foundation, Inc.
- Written by Bruno Haible and Simon Josefsson.
- This file is free software: you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as
- published by the Free Software Foundation; either version 2.1 of the
- License, or (at your option) any later version.
- This file is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
- #include <config.h>
- /* Specification. */
- #include "striconveh.h"
- #include <errno.h>
- #include <stdbool.h>
- #include <stdlib.h>
- #include <string.h>
- #if HAVE_ICONV
- # include <iconv.h>
- # include "unistr.h"
- #endif
- #include "c-strcase.h"
- #include "c-strcaseeq.h"
- #ifndef SIZE_MAX
- # define SIZE_MAX ((size_t) -1)
- #endif
- #if HAVE_ICONV
- /* The caller must provide an iconveh_t, not just an iconv_t, because when a
- conversion error occurs, we may have to determine the Unicode representation
- of the inconvertible character. */
- int
- iconveh_open (const char *to_codeset, const char *from_codeset, iconveh_t *cdp)
- {
- iconv_t cd;
- iconv_t cd1;
- iconv_t cd2;
- /* Avoid glibc-2.1 bug with EUC-KR. */
- # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- && !defined _LIBICONV_VERSION
- if (c_strcasecmp (from_codeset, "EUC-KR") == 0
- || c_strcasecmp (to_codeset, "EUC-KR") == 0)
- {
- errno = EINVAL;
- return -1;
- }
- # endif
- cd = iconv_open (to_codeset, from_codeset);
- if (STRCASEEQ (from_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0))
- cd1 = (iconv_t)(-1);
- else
- {
- cd1 = iconv_open ("UTF-8", from_codeset);
- if (cd1 == (iconv_t)(-1))
- {
- int saved_errno = errno;
- if (cd != (iconv_t)(-1))
- iconv_close (cd);
- errno = saved_errno;
- return -1;
- }
- }
- if (STRCASEEQ (to_codeset, "UTF-8", 'U','T','F','-','8',0,0,0,0)
- # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
- && !defined __UCLIBC__) \
- || _LIBICONV_VERSION >= 0x0105
- || c_strcasecmp (to_codeset, "UTF-8//TRANSLIT") == 0
- # endif
- )
- cd2 = (iconv_t)(-1);
- else
- {
- cd2 = iconv_open (to_codeset, "UTF-8");
- if (cd2 == (iconv_t)(-1))
- {
- int saved_errno = errno;
- if (cd1 != (iconv_t)(-1))
- iconv_close (cd1);
- if (cd != (iconv_t)(-1))
- iconv_close (cd);
- errno = saved_errno;
- return -1;
- }
- }
- cdp->cd = cd;
- cdp->cd1 = cd1;
- cdp->cd2 = cd2;
- return 0;
- }
- int
- iconveh_close (const iconveh_t *cd)
- {
- if (cd->cd2 != (iconv_t)(-1) && iconv_close (cd->cd2) < 0)
- {
- /* Return -1, but preserve the errno from iconv_close. */
- int saved_errno = errno;
- if (cd->cd1 != (iconv_t)(-1))
- iconv_close (cd->cd1);
- if (cd->cd != (iconv_t)(-1))
- iconv_close (cd->cd);
- errno = saved_errno;
- return -1;
- }
- if (cd->cd1 != (iconv_t)(-1) && iconv_close (cd->cd1) < 0)
- {
- /* Return -1, but preserve the errno from iconv_close. */
- int saved_errno = errno;
- if (cd->cd != (iconv_t)(-1))
- iconv_close (cd->cd);
- errno = saved_errno;
- return -1;
- }
- if (cd->cd != (iconv_t)(-1) && iconv_close (cd->cd) < 0)
- return -1;
- return 0;
- }
- /* iconv_carefully is like iconv, except that it stops as soon as it encounters
- a conversion error, and it returns in *INCREMENTED a boolean telling whether
- it has incremented the input pointers past the error location. */
- # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. */
- static size_t
- iconv_carefully (iconv_t cd,
- const char **inbuf, size_t *inbytesleft,
- char **outbuf, size_t *outbytesleft,
- bool *incremented)
- {
- const char *inptr = *inbuf;
- const char *inptr_end = inptr + *inbytesleft;
- char *outptr = *outbuf;
- size_t outsize = *outbytesleft;
- const char *inptr_before;
- size_t res;
- do
- {
- size_t insize;
- inptr_before = inptr;
- res = (size_t)(-1);
- for (insize = 1; inptr + insize <= inptr_end; insize++)
- {
- res = iconv (cd,
- (ICONV_CONST char **) &inptr, &insize,
- &outptr, &outsize);
- if (!(res == (size_t)(-1) && errno == EINVAL))
- break;
- /* iconv can eat up a shift sequence but give EINVAL while attempting
- to convert the first character. E.g. libiconv does this. */
- if (inptr > inptr_before)
- {
- res = 0;
- break;
- }
- }
- if (res == 0)
- {
- *outbuf = outptr;
- *outbytesleft = outsize;
- }
- }
- while (res == 0 && inptr < inptr_end);
- *inbuf = inptr;
- *inbytesleft = inptr_end - inptr;
- if (res != (size_t)(-1) && res > 0)
- {
- /* iconv() has already incremented INPTR. We cannot go back to a
- previous INPTR, otherwise the state inside CD would become invalid,
- if FROM_CODESET is a stateful encoding. So, tell the caller that
- *INBUF has already been incremented. */
- *incremented = (inptr > inptr_before);
- errno = EILSEQ;
- return (size_t)(-1);
- }
- else
- {
- *incremented = false;
- return res;
- }
- }
- # else
- # define iconv_carefully(cd, inbuf, inbytesleft, outbuf, outbytesleft, incremented) \
- (*(incremented) = false, \
- iconv (cd, (ICONV_CONST char **) (inbuf), inbytesleft, outbuf, outbytesleft))
- # endif
- /* iconv_carefully_1 is like iconv_carefully, except that it stops after
- converting one character or one shift sequence. */
- static size_t
- iconv_carefully_1 (iconv_t cd,
- const char **inbuf, size_t *inbytesleft,
- char **outbuf, size_t *outbytesleft,
- bool *incremented)
- {
- const char *inptr_before = *inbuf;
- const char *inptr = inptr_before;
- const char *inptr_end = inptr_before + *inbytesleft;
- char *outptr = *outbuf;
- size_t outsize = *outbytesleft;
- size_t res = (size_t)(-1);
- size_t insize;
- for (insize = 1; inptr_before + insize <= inptr_end; insize++)
- {
- inptr = inptr_before;
- res = iconv (cd,
- (ICONV_CONST char **) &inptr, &insize,
- &outptr, &outsize);
- if (!(res == (size_t)(-1) && errno == EINVAL))
- break;
- /* iconv can eat up a shift sequence but give EINVAL while attempting
- to convert the first character. E.g. libiconv does this. */
- if (inptr > inptr_before)
- {
- res = 0;
- break;
- }
- }
- *inbuf = inptr;
- *inbytesleft = inptr_end - inptr;
- # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. */
- if (res != (size_t)(-1) && res > 0)
- {
- /* iconv() has already incremented INPTR. We cannot go back to a
- previous INPTR, otherwise the state inside CD would become invalid,
- if FROM_CODESET is a stateful encoding. So, tell the caller that
- *INBUF has already been incremented. */
- *incremented = (inptr > inptr_before);
- errno = EILSEQ;
- return (size_t)(-1);
- }
- # endif
- if (res != (size_t)(-1))
- {
- *outbuf = outptr;
- *outbytesleft = outsize;
- }
- *incremented = false;
- return res;
- }
- /* utf8conv_carefully is like iconv, except that
- - it converts from UTF-8 to UTF-8,
- - it stops as soon as it encounters a conversion error, and it returns
- in *INCREMENTED a boolean telling whether it has incremented the input
- pointers past the error location,
- - if one_character_only is true, it stops after converting one
- character. */
- static size_t
- utf8conv_carefully (bool one_character_only,
- const char **inbuf, size_t *inbytesleft,
- char **outbuf, size_t *outbytesleft,
- bool *incremented)
- {
- const char *inptr = *inbuf;
- size_t insize = *inbytesleft;
- char *outptr = *outbuf;
- size_t outsize = *outbytesleft;
- size_t res;
- res = 0;
- do
- {
- ucs4_t uc;
- int n;
- int m;
- n = u8_mbtoucr (&uc, (const uint8_t *) inptr, insize);
- if (n < 0)
- {
- errno = (n == -2 ? EINVAL : EILSEQ);
- n = u8_mbtouc (&uc, (const uint8_t *) inptr, insize);
- inptr += n;
- insize -= n;
- res = (size_t)(-1);
- *incremented = true;
- break;
- }
- if (outsize == 0)
- {
- errno = E2BIG;
- res = (size_t)(-1);
- *incremented = false;
- break;
- }
- m = u8_uctomb ((uint8_t *) outptr, uc, outsize);
- if (m == -2)
- {
- errno = E2BIG;
- res = (size_t)(-1);
- *incremented = false;
- break;
- }
- inptr += n;
- insize -= n;
- if (m == -1)
- {
- errno = EILSEQ;
- res = (size_t)(-1);
- *incremented = true;
- break;
- }
- outptr += m;
- outsize -= m;
- }
- while (!one_character_only && insize > 0);
- *inbuf = inptr;
- *inbytesleft = insize;
- *outbuf = outptr;
- *outbytesleft = outsize;
- return res;
- }
- static int
- mem_cd_iconveh_internal (const char *src, size_t srclen,
- iconv_t cd, iconv_t cd1, iconv_t cd2,
- enum iconv_ilseq_handler handler,
- size_t extra_alloc,
- size_t *offsets,
- char **resultp, size_t *lengthp)
- {
- /* When a conversion error occurs, we cannot start using CD1 and CD2 at
- this point: FROM_CODESET may be a stateful encoding like ISO-2022-KR.
- Instead, we have to start afresh from the beginning of SRC. */
- /* Use a temporary buffer, so that for small strings, a single malloc()
- call will be sufficient. */
- # define tmpbufsize 4096
- /* The alignment is needed when converting e.g. to glibc's WCHAR_T or
- libiconv's UCS-4-INTERNAL encoding. */
- union { unsigned int align; char buf[tmpbufsize]; } tmp;
- # define tmpbuf tmp.buf
- char *initial_result;
- char *result;
- size_t allocated;
- size_t length;
- size_t last_length = (size_t)(-1); /* only needed if offsets != NULL */
- if (*resultp != NULL && *lengthp >= sizeof (tmpbuf))
- {
- initial_result = *resultp;
- allocated = *lengthp;
- }
- else
- {
- initial_result = tmpbuf;
- allocated = sizeof (tmpbuf);
- }
- result = initial_result;
- /* Test whether a direct conversion is possible at all. */
- if (cd == (iconv_t)(-1))
- goto indirectly;
- if (offsets != NULL)
- {
- size_t i;
- for (i = 0; i < srclen; i++)
- offsets[i] = (size_t)(-1);
- last_length = (size_t)(-1);
- }
- length = 0;
- /* First, try a direct conversion, and see whether a conversion error
- occurs at all. */
- {
- const char *inptr = src;
- size_t insize = srclen;
- /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
- # if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- /* Set to the initial state. */
- iconv (cd, NULL, NULL, NULL, NULL);
- # endif
- while (insize > 0)
- {
- char *outptr = result + length;
- size_t outsize = allocated - extra_alloc - length;
- bool incremented;
- size_t res;
- bool grow;
- if (offsets != NULL)
- {
- if (length != last_length) /* ensure that offset[] be increasing */
- {
- offsets[inptr - src] = length;
- last_length = length;
- }
- res = iconv_carefully_1 (cd,
- &inptr, &insize,
- &outptr, &outsize,
- &incremented);
- }
- else
- /* Use iconv_carefully instead of iconv here, because:
- - If TO_CODESET is UTF-8, we can do the error handling in this
- loop, no need for a second loop,
- - With iconv() implementations other than GNU libiconv and GNU
- libc, if we use iconv() in a big swoop, checking for an E2BIG
- return, we lose the number of irreversible conversions. */
- res = iconv_carefully (cd,
- &inptr, &insize,
- &outptr, &outsize,
- &incremented);
- length = outptr - result;
- grow = (length + extra_alloc > allocated / 2);
- if (res == (size_t)(-1))
- {
- if (errno == E2BIG)
- grow = true;
- else if (errno == EINVAL)
- break;
- else if (errno == EILSEQ && handler != iconveh_error)
- {
- if (cd2 == (iconv_t)(-1))
- {
- /* TO_CODESET is UTF-8. */
- /* Error handling can produce up to 1 or 3 bytes of
- output. */
- size_t extra_need =
- (handler == iconveh_replacement_character ? 3 : 1);
- if (length + extra_need + extra_alloc > allocated)
- {
- char *memory;
- allocated = 2 * allocated;
- if (length + extra_need + extra_alloc > allocated)
- allocated = 2 * allocated;
- if (length + extra_need + extra_alloc > allocated)
- abort ();
- if (result == initial_result)
- memory = (char *) malloc (allocated);
- else
- memory = (char *) realloc (result, allocated);
- if (memory == NULL)
- {
- if (result != initial_result)
- free (result);
- errno = ENOMEM;
- return -1;
- }
- if (result == initial_result)
- memcpy (memory, initial_result, length);
- result = memory;
- grow = false;
- }
- /* The input is invalid in FROM_CODESET. Eat up one byte
- and emit a replacement character or a question mark. */
- if (!incremented)
- {
- if (insize == 0)
- abort ();
- inptr++;
- insize--;
- }
- if (handler == iconveh_replacement_character)
- {
- /* U+FFFD in UTF-8 encoding. */
- result[length+0] = '\357';
- result[length+1] = '\277';
- result[length+2] = '\275';
- length += 3;
- }
- else
- {
- result[length] = '?';
- length++;
- }
- }
- else
- goto indirectly;
- }
- else
- {
- if (result != initial_result)
- free (result);
- return -1;
- }
- }
- if (insize == 0)
- break;
- if (grow)
- {
- char *memory;
- allocated = 2 * allocated;
- if (result == initial_result)
- memory = (char *) malloc (allocated);
- else
- memory = (char *) realloc (result, allocated);
- if (memory == NULL)
- {
- if (result != initial_result)
- free (result);
- errno = ENOMEM;
- return -1;
- }
- if (result == initial_result)
- memcpy (memory, initial_result, length);
- result = memory;
- }
- }
- }
- /* Now get the conversion state back to the initial state.
- But avoid glibc-2.1 bug and Solaris 2.7 bug. */
- #if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- for (;;)
- {
- char *outptr = result + length;
- size_t outsize = allocated - extra_alloc - length;
- size_t res;
- res = iconv (cd, NULL, NULL, &outptr, &outsize);
- length = outptr - result;
- if (res == (size_t)(-1))
- {
- if (errno == E2BIG)
- {
- char *memory;
- allocated = 2 * allocated;
- if (result == initial_result)
- memory = (char *) malloc (allocated);
- else
- memory = (char *) realloc (result, allocated);
- if (memory == NULL)
- {
- if (result != initial_result)
- free (result);
- errno = ENOMEM;
- return -1;
- }
- if (result == initial_result)
- memcpy (memory, initial_result, length);
- result = memory;
- }
- else
- {
- if (result != initial_result)
- free (result);
- return -1;
- }
- }
- else
- break;
- }
- #endif
- /* The direct conversion succeeded. */
- goto done;
- indirectly:
- /* The direct conversion failed.
- Use a conversion through UTF-8. */
- if (offsets != NULL)
- {
- size_t i;
- for (i = 0; i < srclen; i++)
- offsets[i] = (size_t)(-1);
- last_length = (size_t)(-1);
- }
- length = 0;
- {
- const bool slowly = (offsets != NULL || handler == iconveh_error);
- # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
- char utf8buf[utf8bufsize + 3];
- size_t utf8len = 0;
- const char *in1ptr = src;
- size_t in1size = srclen;
- bool do_final_flush1 = true;
- bool do_final_flush2 = true;
- /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
- # if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- /* Set to the initial state. */
- if (cd1 != (iconv_t)(-1))
- iconv (cd1, NULL, NULL, NULL, NULL);
- if (cd2 != (iconv_t)(-1))
- iconv (cd2, NULL, NULL, NULL, NULL);
- # endif
- while (in1size > 0 || do_final_flush1 || utf8len > 0 || do_final_flush2)
- {
- char *out1ptr = utf8buf + utf8len;
- size_t out1size = utf8bufsize - utf8len;
- bool incremented1;
- size_t res1;
- int errno1;
- /* Conversion step 1: from FROM_CODESET to UTF-8. */
- if (in1size > 0)
- {
- if (offsets != NULL
- && length != last_length) /* ensure that offset[] be increasing */
- {
- offsets[in1ptr - src] = length;
- last_length = length;
- }
- if (cd1 != (iconv_t)(-1))
- {
- if (slowly)
- res1 = iconv_carefully_1 (cd1,
- &in1ptr, &in1size,
- &out1ptr, &out1size,
- &incremented1);
- else
- res1 = iconv_carefully (cd1,
- &in1ptr, &in1size,
- &out1ptr, &out1size,
- &incremented1);
- }
- else
- {
- /* FROM_CODESET is UTF-8. */
- res1 = utf8conv_carefully (slowly,
- &in1ptr, &in1size,
- &out1ptr, &out1size,
- &incremented1);
- }
- }
- else if (do_final_flush1)
- {
- /* Now get the conversion state of CD1 back to the initial state.
- But avoid glibc-2.1 bug and Solaris 2.7 bug. */
- # if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- if (cd1 != (iconv_t)(-1))
- res1 = iconv (cd1, NULL, NULL, &out1ptr, &out1size);
- else
- # endif
- res1 = 0;
- do_final_flush1 = false;
- incremented1 = true;
- }
- else
- {
- res1 = 0;
- incremented1 = true;
- }
- if (res1 == (size_t)(-1)
- && !(errno == E2BIG || errno == EINVAL || errno == EILSEQ))
- {
- if (result != initial_result)
- free (result);
- return -1;
- }
- if (res1 == (size_t)(-1)
- && errno == EILSEQ && handler != iconveh_error)
- {
- /* The input is invalid in FROM_CODESET. Eat up one byte and
- emit a U+FFFD character or a question mark. Room for this
- character was allocated at the end of utf8buf. */
- if (!incremented1)
- {
- if (in1size == 0)
- abort ();
- in1ptr++;
- in1size--;
- }
- if (handler == iconveh_replacement_character)
- {
- /* U+FFFD in UTF-8 encoding. */
- out1ptr[0] = '\357';
- out1ptr[1] = '\277';
- out1ptr[2] = '\275';
- out1ptr += 3;
- }
- else
- *out1ptr++ = '?';
- res1 = 0;
- }
- errno1 = errno;
- utf8len = out1ptr - utf8buf;
- if (offsets != NULL
- || in1size == 0
- || utf8len > utf8bufsize / 2
- || (res1 == (size_t)(-1) && errno1 == E2BIG))
- {
- /* Conversion step 2: from UTF-8 to TO_CODESET. */
- const char *in2ptr = utf8buf;
- size_t in2size = utf8len;
- while (in2size > 0
- || (in1size == 0 && !do_final_flush1 && do_final_flush2))
- {
- char *out2ptr = result + length;
- size_t out2size = allocated - extra_alloc - length;
- bool incremented2;
- size_t res2;
- bool grow;
- if (in2size > 0)
- {
- if (cd2 != (iconv_t)(-1))
- res2 = iconv_carefully (cd2,
- &in2ptr, &in2size,
- &out2ptr, &out2size,
- &incremented2);
- else
- /* TO_CODESET is UTF-8. */
- res2 = utf8conv_carefully (false,
- &in2ptr, &in2size,
- &out2ptr, &out2size,
- &incremented2);
- }
- else /* in1size == 0 && !do_final_flush1
- && in2size == 0 && do_final_flush2 */
- {
- /* Now get the conversion state of CD1 back to the initial
- state. But avoid glibc-2.1 bug and Solaris 2.7 bug. */
- # if defined _LIBICONV_VERSION \
- || !(((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
- || defined __sun)
- if (cd2 != (iconv_t)(-1))
- res2 = iconv (cd2, NULL, NULL, &out2ptr, &out2size);
- else
- # endif
- res2 = 0;
- do_final_flush2 = false;
- incremented2 = true;
- }
- length = out2ptr - result;
- grow = (length + extra_alloc > allocated / 2);
- if (res2 == (size_t)(-1))
- {
- if (errno == E2BIG)
- grow = true;
- else if (errno == EINVAL)
- break;
- else if (errno == EILSEQ && handler != iconveh_error)
- {
- /* Error handling can produce up to 10 bytes of UTF-8
- output. But TO_CODESET may be UCS-2, UTF-16 or
- UCS-4, so use CD2 here as well. */
- char scratchbuf[10];
- size_t scratchlen;
- ucs4_t uc;
- const char *inptr;
- size_t insize;
- size_t res;
- if (incremented2)
- {
- if (u8_prev (&uc, (const uint8_t *) in2ptr,
- (const uint8_t *) utf8buf)
- == NULL)
- abort ();
- }
- else
- {
- int n;
- if (in2size == 0)
- abort ();
- n = u8_mbtouc_unsafe (&uc, (const uint8_t *) in2ptr,
- in2size);
- in2ptr += n;
- in2size -= n;
- }
- if (handler == iconveh_escape_sequence)
- {
- static char hex[16] = "0123456789ABCDEF";
- scratchlen = 0;
- scratchbuf[scratchlen++] = '\\';
- if (uc < 0x10000)
- scratchbuf[scratchlen++] = 'u';
- else
- {
- scratchbuf[scratchlen++] = 'U';
- scratchbuf[scratchlen++] = hex[(uc>>28) & 15];
- scratchbuf[scratchlen++] = hex[(uc>>24) & 15];
- scratchbuf[scratchlen++] = hex[(uc>>20) & 15];
- scratchbuf[scratchlen++] = hex[(uc>>16) & 15];
- }
- scratchbuf[scratchlen++] = hex[(uc>>12) & 15];
- scratchbuf[scratchlen++] = hex[(uc>>8) & 15];
- scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
- scratchbuf[scratchlen++] = hex[uc & 15];
- }
- else if (handler == iconveh_replacement_character)
- {
- /* U+FFFD in UTF-8 encoding. */
- scratchbuf[0] = '\357';
- scratchbuf[1] = '\277';
- scratchbuf[2] = '\275';
- scratchlen = 3;
- }
- else
- {
- scratchbuf[0] = '?';
- scratchlen = 1;
- }
- inptr = scratchbuf;
- insize = scratchlen;
- if (cd2 != (iconv_t)(-1))
- {
- char *out2ptr_try = out2ptr;
- size_t out2size_try = out2size;
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr, &insize,
- &out2ptr_try, &out2size_try);
- if (handler == iconveh_replacement_character
- && (res == (size_t)(-1)
- ? errno == EILSEQ
- /* FreeBSD iconv(), NetBSD iconv(), and
- Solaris 11 iconv() insert a '?' if they
- cannot convert. This is what we want.
- But IRIX iconv() inserts a NUL byte if it
- cannot convert.
- And musl libc iconv() inserts a '*' if it
- cannot convert. */
- : (res > 0
- && !(out2ptr_try - out2ptr == 1
- && *out2ptr == '?'))))
- {
- /* The iconv() call failed.
- U+FFFD can't be converted to TO_CODESET.
- Use '?' instead. */
- scratchbuf[0] = '?';
- scratchlen = 1;
- inptr = scratchbuf;
- insize = scratchlen;
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr, &insize,
- &out2ptr, &out2size);
- }
- else
- {
- /* Accept the results of the iconv() call. */
- out2ptr = out2ptr_try;
- out2size = out2size_try;
- res = 0;
- }
- }
- else
- {
- /* TO_CODESET is UTF-8. */
- if (out2size >= insize)
- {
- memcpy (out2ptr, inptr, insize);
- out2ptr += insize;
- out2size -= insize;
- inptr += insize;
- insize = 0;
- res = 0;
- }
- else
- {
- errno = E2BIG;
- res = (size_t)(-1);
- }
- }
- length = out2ptr - result;
- if (res == (size_t)(-1) && errno == E2BIG)
- {
- char *memory;
- allocated = 2 * allocated;
- if (length + 1 + extra_alloc > allocated)
- abort ();
- if (result == initial_result)
- memory = (char *) malloc (allocated);
- else
- memory = (char *) realloc (result, allocated);
- if (memory == NULL)
- {
- if (result != initial_result)
- free (result);
- errno = ENOMEM;
- return -1;
- }
- if (result == initial_result)
- memcpy (memory, initial_result, length);
- result = memory;
- grow = false;
- out2ptr = result + length;
- out2size = allocated - extra_alloc - length;
- if (cd2 != (iconv_t)(-1))
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr,
- &insize,
- &out2ptr, &out2size);
- else
- {
- /* TO_CODESET is UTF-8. */
- if (!(out2size >= insize))
- abort ();
- memcpy (out2ptr, inptr, insize);
- out2ptr += insize;
- out2size -= insize;
- inptr += insize;
- insize = 0;
- res = 0;
- }
- length = out2ptr - result;
- }
- # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
- /* IRIX iconv() inserts a NUL byte if it cannot convert.
- FreeBSD iconv(), NetBSD iconv(), and Solaris 11
- iconv() insert a '?' if they cannot convert.
- musl libc iconv() inserts a '*' if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer
- to fail rather than doing a lossy conversion. */
- if (res != (size_t)(-1) && res > 0)
- {
- errno = EILSEQ;
- res = (size_t)(-1);
- }
- # endif
- if (res == (size_t)(-1))
- {
- /* Failure converting the ASCII replacement. */
- if (result != initial_result)
- free (result);
- return -1;
- }
- }
- else
- {
- if (result != initial_result)
- free (result);
- return -1;
- }
- }
- if (!(in2size > 0
- || (in1size == 0 && !do_final_flush1 && do_final_flush2)))
- break;
- if (grow)
- {
- char *memory;
- allocated = 2 * allocated;
- if (result == initial_result)
- memory = (char *) malloc (allocated);
- else
- memory = (char *) realloc (result, allocated);
- if (memory == NULL)
- {
- if (result != initial_result)
- free (result);
- errno = ENOMEM;
- return -1;
- }
- if (result == initial_result)
- memcpy (memory, initial_result, length);
- result = memory;
- }
- }
- /* Move the remaining bytes to the beginning of utf8buf. */
- if (in2size > 0)
- memmove (utf8buf, in2ptr, in2size);
- utf8len = in2size;
- }
- if (res1 == (size_t)(-1))
- {
- if (errno1 == EINVAL)
- in1size = 0;
- else if (errno1 == EILSEQ)
- {
- if (result != initial_result)
- free (result);
- errno = errno1;
- return -1;
- }
- }
- }
- # undef utf8bufsize
- }
- done:
- /* Now the final memory allocation. */
- if (result == tmpbuf)
- {
- size_t memsize = length + extra_alloc;
- if (*resultp != NULL && *lengthp >= memsize)
- result = *resultp;
- else
- {
- char *memory;
- memory = (char *) malloc (memsize > 0 ? memsize : 1);
- if (memory != NULL)
- result = memory;
- else
- {
- errno = ENOMEM;
- return -1;
- }
- }
- memcpy (result, tmpbuf, length);
- }
- else if (result != *resultp && length + extra_alloc < allocated)
- {
- /* Shrink the allocated memory if possible. */
- size_t memsize = length + extra_alloc;
- char *memory;
- memory = (char *) realloc (result, memsize > 0 ? memsize : 1);
- if (memory != NULL)
- result = memory;
- }
- *resultp = result;
- *lengthp = length;
- return 0;
- # undef tmpbuf
- # undef tmpbufsize
- }
- int
- mem_cd_iconveh (const char *src, size_t srclen,
- const iconveh_t *cd,
- enum iconv_ilseq_handler handler,
- size_t *offsets,
- char **resultp, size_t *lengthp)
- {
- return mem_cd_iconveh_internal (src, srclen, cd->cd, cd->cd1, cd->cd2,
- handler, 0, offsets, resultp, lengthp);
- }
- char *
- str_cd_iconveh (const char *src,
- const iconveh_t *cd,
- enum iconv_ilseq_handler handler)
- {
- /* For most encodings, a trailing NUL byte in the input will be converted
- to a trailing NUL byte in the output. But not for UTF-7. So that this
- function is usable for UTF-7, we have to exclude the NUL byte from the
- conversion and add it by hand afterwards. */
- char *result = NULL;
- size_t length = 0;
- int retval = mem_cd_iconveh_internal (src, strlen (src),
- cd->cd, cd->cd1, cd->cd2, handler, 1,
- NULL, &result, &length);
- if (retval < 0)
- {
- free (result);
- return NULL;
- }
- /* Add the terminating NUL byte. */
- result[length] = '\0';
- return result;
- }
- #endif
- int
- mem_iconveh (const char *src, size_t srclen,
- const char *from_codeset, const char *to_codeset,
- enum iconv_ilseq_handler handler,
- size_t *offsets,
- char **resultp, size_t *lengthp)
- {
- if (srclen == 0)
- {
- /* Nothing to convert. */
- *lengthp = 0;
- return 0;
- }
- else if (offsets == NULL && c_strcasecmp (from_codeset, to_codeset) == 0)
- {
- char *result;
- if (*resultp != NULL && *lengthp >= srclen)
- result = *resultp;
- else
- {
- result = (char *) malloc (srclen);
- if (result == NULL)
- {
- errno = ENOMEM;
- return -1;
- }
- }
- memcpy (result, src, srclen);
- *resultp = result;
- *lengthp = srclen;
- return 0;
- }
- else
- {
- #if HAVE_ICONV
- iconveh_t cd;
- char *result;
- size_t length;
- int retval;
- if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
- return -1;
- result = *resultp;
- length = *lengthp;
- retval = mem_cd_iconveh (src, srclen, &cd, handler, offsets,
- &result, &length);
- if (retval < 0)
- {
- /* Close cd, but preserve the errno from str_cd_iconv. */
- int saved_errno = errno;
- iconveh_close (&cd);
- errno = saved_errno;
- }
- else
- {
- if (iconveh_close (&cd) < 0)
- {
- if (result != *resultp)
- free (result);
- return -1;
- }
- *resultp = result;
- *lengthp = length;
- }
- return retval;
- #else
- /* This is a different error code than if iconv_open existed but didn't
- support from_codeset and to_codeset, so that the caller can emit
- an error message such as
- "iconv() is not supported. Installing GNU libiconv and
- then reinstalling this package would fix this." */
- errno = ENOSYS;
- return -1;
- #endif
- }
- }
- char *
- str_iconveh (const char *src,
- const char *from_codeset, const char *to_codeset,
- enum iconv_ilseq_handler handler)
- {
- if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0)
- {
- char *result = strdup (src);
- if (result == NULL)
- errno = ENOMEM;
- return result;
- }
- else
- {
- #if HAVE_ICONV
- iconveh_t cd;
- char *result;
- if (iconveh_open (to_codeset, from_codeset, &cd) < 0)
- return NULL;
- result = str_cd_iconveh (src, &cd, handler);
- if (result == NULL)
- {
- /* Close cd, but preserve the errno from str_cd_iconv. */
- int saved_errno = errno;
- iconveh_close (&cd);
- errno = saved_errno;
- }
- else
- {
- if (iconveh_close (&cd) < 0)
- {
- free (result);
- return NULL;
- }
- }
- return result;
- #else
- /* This is a different error code than if iconv_open existed but didn't
- support from_codeset and to_codeset, so that the caller can emit
- an error message such as
- "iconv() is not supported. Installing GNU libiconv and
- then reinstalling this package would fix this." */
- errno = ENOSYS;
- return NULL;
- #endif
- }
- }
|