12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- ******************************************************************************
- *
- * Copyright (C) 2002-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- ******************************************************************************
- * file name: ucnvbocu.cpp
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2002mar27
- * created by: Markus W. Scherer
- *
- * This is an implementation of the Binary Ordered Compression for Unicode,
- * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
- */
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
- #include "unicode/ucnv.h"
- #include "unicode/ucnv_cb.h"
- #include "unicode/utf16.h"
- #include "putilimp.h"
- #include "ucnv_bld.h"
- #include "ucnv_cnv.h"
- #include "uassert.h"
- /* BOCU-1 constants and macros ---------------------------------------------- */
- /*
- * BOCU-1 encodes the code points of a Unicode string as
- * a sequence of byte-encoded differences (slope detection),
- * preserving lexical order.
- *
- * Optimize the difference-taking for runs of Unicode text within
- * small scripts:
- *
- * Most small scripts are allocated within aligned 128-blocks of Unicode
- * code points. Lexical order is preserved if the "previous code point" state
- * is always moved into the middle of such a block.
- *
- * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
- * areas into the middle of those areas.
- *
- * C0 control codes and space are encoded with their US-ASCII bytes.
- * "prev" is reset for C0 controls but not for space.
- */
- /* initial value for "prev": middle of the ASCII range */
- #define BOCU1_ASCII_PREV 0x40
- /* bounding byte values for differences */
- #define BOCU1_MIN 0x21
- #define BOCU1_MIDDLE 0x90
- #define BOCU1_MAX_LEAD 0xfe
- #define BOCU1_MAX_TRAIL 0xff
- /* byte that only resets "prev"; it is above BOCU1_MAX_LEAD, so it is never a difference lead byte */
- #define BOCU1_RESET 0xff
- /* number of lead bytes (0xfe-0x21+1 = 222) */
- #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
- /* adjust trail byte counts for the use of some C0 control byte values */
- #define BOCU1_TRAIL_CONTROLS_COUNT 20
- /* external byte value minus internal trail value for bytes >=BOCU1_MIN (0x21-20 = 13) */
- #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
- /* number of trail bytes ((0xff-0x21+1)+20 = 243) */
- #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
- /*
- * number of positive and negative single-byte codes
- * (counting 0==BOCU1_MIDDLE among the positive ones)
- */
- #define BOCU1_SINGLE 64
- /*
- * number of lead bytes for positive and negative 2/3/4-byte sequences
- * (64+64 single-byte values plus 2*(43+3+1) multi-byte lead bytes = 222 = BOCU1_COUNT)
- */
- #define BOCU1_LEAD_2 43
- #define BOCU1_LEAD_3 3
- #define BOCU1_LEAD_4 1
- /* The difference value range for single-byters: -64..63. */
- #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
- #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
- /* The difference value range for double-byters: -10513..10512. */
- #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
- #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
- /* The difference value range for 3-byters: -187660..187659. */
- #define BOCU1_REACH_POS_3 \
- (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
- #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
- /* The lead byte start values for positive differences: 0xd0, 0xfb, 0xfe. */
- #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
- #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
- #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
- /* ==BOCU1_MAX_LEAD */
- /*
- * The lead byte start values for negative differences: 0x50, 0x25, 0x22.
- * BOCU1_START_NEG_2 is the lowest single-byte value; two-byte negative leads
- * are BOCU1_START_NEG_3..BOCU1_START_NEG_2-1, and so on downwards.
- */
- #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
- #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
- #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
- /* ==BOCU1_MIN+1 */
- /*
- * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
- * The argument is evaluated several times.
- */
- #define BOCU1_LENGTH_FROM_LEAD(lead) \
- ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
- (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
- (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
- /*
- * The length of a byte sequence, according to its packed form.
- * The top byte of a packed value is the length 1..3 for short sequences;
- * for 4-byte sequences the top byte is the lead byte itself, which is >0x03.
- */
- #define BOCU1_LENGTH_FROM_PACKED(packed) \
- ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
- /*
- * 12 commonly used C0 control codes (and space) are only used to encode
- * themselves directly,
- * which makes BOCU-1 MIME-usable and reasonably safe for
- * ASCII-oriented software.
- *
- * These controls are
- * 0 NUL
- *
- * 7 BEL
- * 8 BS
- *
- * 9 TAB
- * a LF
- * b VT
- * c FF
- * d CR
- *
- * e SO
- * f SI
- *
- * 1a SUB
- * 1b ESC
- *
- * The other 20 C0 controls are also encoded directly (to preserve order)
- * but are also used as trail bytes in difference encoding
- * (for better compression).
- */
- /*
- * Map an internal trail value 0..BOCU1_TRAIL_COUNT-1 to its external byte value:
- * values below BOCU1_TRAIL_CONTROLS_COUNT go through the bocu1TrailToByte[] table,
- * all others are shifted up by BOCU1_TRAIL_BYTE_OFFSET.
- * The argument is evaluated more than once and is used as an array index
- * without a range check.
- */
- #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
- /*
- * Byte value map for control codes,
- * from external byte values 0x00..0x20
- * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
- * External byte values that are illegal as trail bytes are mapped to -1.
- *
- * The table has BOCU1_MIN (0x21) entries, one per byte value 0x00..0x20.
- * The 13 entries that are -1 are the 12 directly encoded C0 controls
- * (00, 07..0f, 1a, 1b) and space (20).
- */
- static const int8_t
- bocu1ByteToTrail[BOCU1_MIN]={
- /* 0 1 2 3 4 5 6 7 */
- -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
- /* 8 9 a b c d e f */
- -1, -1, -1, -1, -1, -1, -1, -1,
- /* 10 11 12 13 14 15 16 17 */
- 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
- /* 18 19 1a 1b 1c 1d 1e 1f */
- 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
- /* 20 */
- -1
- };
- /*
- * Byte value map for control codes,
- * from trail byte values 0..19 (0..0x13) as used in the difference calculation
- * to external byte values 0x00..0x20.
- *
- * This is the inverse of the non-negative entries of bocu1ByteToTrail[];
- * it is read through the BOCU1_TRAIL_TO_BYTE() macro.
- */
- static const int8_t
- bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
- /* 0 1 2 3 4 5 6 7 */
- 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
- /* 8 9 a b c d e f */
- 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
- /* 10 11 12 13 */
- 0x1c, 0x1d, 0x1e, 0x1f
- };
- /**
- * Integer division and modulo with negative numerators
- * yields negative modulo results and quotients that are one more than
- * what we need here.
- * This macro adjusts the results so that the modulo-value m is always >=0
- * (floor division: the quotient is decremented whenever the remainder was negative).
- *
- * For positive n, the if() condition is always false.
- *
- * n and m must be modifiable lvalues; all three arguments are evaluated
- * more than once.
- *
- * @param n Number to be split into quotient and rest.
- * Will be modified to contain the quotient.
- * @param d Divisor.
- * @param m Output variable for the rest (modulo result).
- */
- #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
- (m)=(n)%(d); \
- (n)/=(d); \
- if((m)<0) { \
- --(n); \
- (m)+=(d); \
- } \
- } UPRV_BLOCK_MACRO_END
- /* Faster versions of packDiff() for single-byte-encoded diff values. */
- /** Is a diff value encodable in a single byte? */
- #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
- /** Encode a diff value in a single byte. Only meaningful when DIFF_IS_SINGLE(diff). */
- #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
- /**
- * Is a diff value encodable in two bytes?
- * The range also contains the single-byte range, so test DIFF_IS_SINGLE() first.
- */
- #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
- /* BOCU-1 implementation functions ------------------------------------------ */
- /**
- * "prev" for most scripts: offset BOCU1_ASCII_PREV (0x40), i.e. the middle,
- * of the aligned 128-block (c with its low 7 bits cleared) that contains c.
- */
- #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
- /**
-  * Choose the "previous code point" state that follows code point c,
-  * for the range in which BOCU1_PREV() does not simply use BOCU1_SIMPLE_PREV().
-  *
-  * The lower bound of the Hiragana test and the upper bound of the Hangul test
-  * are not checked here; BOCU1_PREV() only calls this for 0x3040..0xd7a3.
-  *
-  * @param c current code point, 0x3040..0xd7a3
-  * @return "previous code point" state value
-  */
- static inline int32_t
- bocu1Prev(int32_t c) {
-     if(c<=0x309f) {
-         /* Hiragana block 0x3040..0x309f is not 128-aligned: use its own middle */
-         return 0x3070;
-     }
-     if(c>=0x4e00 && c<=0x9fa5) {
-         /*
-          * CJK Unihan: prev sits BOCU1_REACH_NEG_2 above 0x4e00, so that
-          * every code point 0x4e00..0x9fa5 is within two-byte reach of it
-          */
-         return 0x4e00-BOCU1_REACH_NEG_2;
-     }
-     if(c>=0xac00) {
-         /* Korean Hangul: middle of 0xac00..0xd7a3 */
-         return (0xd7a3+0xac00)/2;
-     }
-     /* everything else in between: middle of the 128-block, as for small scripts */
-     return BOCU1_SIMPLE_PREV(c);
- }
- /**
- * Fast version of bocu1Prev() for most scripts.
- * Code points outside 0x3040..0xd7a3 get BOCU1_SIMPLE_PREV() inline;
- * only the range in between calls bocu1Prev().
- * The argument is evaluated more than once.
- */
- #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
- /*
- * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
- * The UConverter fields are used as follows:
- *
- * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
- *
- * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
- * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
- */
- /* BOCU-1-from-Unicode conversion functions --------------------------------- */
- /**
-  * Encode a multi-byte difference and return its bytes packed into one integer.
-  *
-  * Differences with a small absolute value get the shorter encodings, which
-  * compresses runs of characters from the same script.
-  * Each sequence length has its own straight-line code path; only integer
-  * division and modulo by BOCU1_TRAIL_COUNT are used.
-  *
-  * Single-byte differences (DIFF_IS_SINGLE) must not be passed in; callers
-  * encode those with PACK_SINGLE_DIFF().
-  *
-  * @param diff difference value -0x10ffff..0x10ffff, outside the single-byte range
-  * @return
-  *      0x0200yyzz for 2-byte sequence yy zz
-  *      0x03xxyyzz for 3-byte sequence xx yy zz
-  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
-  */
- static int32_t
- packDiff(int32_t diff) {
-     int32_t packed, rest;
-     /* the single-byte case, including diff==BOCU1_REACH_NEG_1==-64, is the caller's job */
-     U_ASSERT(!DIFF_IS_SINGLE(diff));
-     if(diff<BOCU1_REACH_NEG_1) {
-         /* negative differences: NEGDIVMOD keeps every trail value >=0 */
-         if(diff>=BOCU1_REACH_NEG_2) {
-             /* two bytes */
-             diff-=BOCU1_REACH_NEG_1;
-             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, rest);
-             packed=0x02000000|BOCU1_TRAIL_TO_BYTE(rest);
-             packed|=(BOCU1_START_NEG_2+diff)<<8;
-         } else if(diff>=BOCU1_REACH_NEG_3) {
-             /* three bytes */
-             diff-=BOCU1_REACH_NEG_2;
-             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, rest);
-             packed=0x03000000|BOCU1_TRAIL_TO_BYTE(rest);
-             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, rest);
-             packed|=BOCU1_TRAIL_TO_BYTE(rest)<<8;
-             packed|=(BOCU1_START_NEG_3+diff)<<16;
-         } else {
-             /* four bytes */
-             diff-=BOCU1_REACH_NEG_3;
-             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, rest);
-             packed=BOCU1_TRAIL_TO_BYTE(rest);
-             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, rest);
-             packed|=BOCU1_TRAIL_TO_BYTE(rest)<<8;
-             /*
-              * A third NEGDIVMOD would give quotient -1 and
-              * rest=diff+BOCU1_TRAIL_COUNT, so compute the rest directly.
-              */
-             rest=diff+BOCU1_TRAIL_COUNT;
-             packed|=BOCU1_TRAIL_TO_BYTE(rest)<<16;
-             /* the only negative four-byte lead byte is BOCU1_MIN */
-             packed|=BOCU1_MIN<<24;
-         }
-         return packed;
-     }
-     /* from here on: positive differences beyond the single-byte reach */
-     if(diff<=BOCU1_REACH_POS_2) {
-         /* two bytes */
-         diff-=BOCU1_REACH_POS_1+1;
-         rest=diff%BOCU1_TRAIL_COUNT;
-         diff/=BOCU1_TRAIL_COUNT;
-         packed=0x02000000|BOCU1_TRAIL_TO_BYTE(rest);
-         packed|=(BOCU1_START_POS_2+diff)<<8;
-         return packed;
-     }
-     if(diff<=BOCU1_REACH_POS_3) {
-         /* three bytes */
-         diff-=BOCU1_REACH_POS_2+1;
-         rest=diff%BOCU1_TRAIL_COUNT;
-         diff/=BOCU1_TRAIL_COUNT;
-         packed=0x03000000|BOCU1_TRAIL_TO_BYTE(rest);
-         rest=diff%BOCU1_TRAIL_COUNT;
-         diff/=BOCU1_TRAIL_COUNT;
-         packed|=BOCU1_TRAIL_TO_BYTE(rest)<<8;
-         packed|=(BOCU1_START_POS_3+diff)<<16;
-         return packed;
-     }
-     /* four bytes */
-     diff-=BOCU1_REACH_POS_3+1;
-     rest=diff%BOCU1_TRAIL_COUNT;
-     diff/=BOCU1_TRAIL_COUNT;
-     packed=BOCU1_TRAIL_TO_BYTE(rest);
-     rest=diff%BOCU1_TRAIL_COUNT;
-     diff/=BOCU1_TRAIL_COUNT;
-     packed|=BOCU1_TRAIL_TO_BYTE(rest)<<8;
-     /*
-      * Another / and % would give quotient 0 and rest=diff,
-      * so what is left of diff is the third trail value.
-      */
-     packed|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
-     /* the only positive four-byte lead byte is BOCU1_START_POS_4 */
-     packed|=((uint32_t)BOCU1_START_POS_4)<<24;
-     return packed;
- }
- /**
- * Convert UTF-16 code units from pArgs->source to BOCU-1 bytes in pArgs->target,
- * storing one source index in pArgs->offsets for every byte written to the target.
- *
- * Converter state read on entry and written back on exit:
- * cnv->fromUChar32 holds a lead surrogate that ended the previous source buffer (0 if none);
- * cnv->fromUnicodeStatus holds the encoder's "prev" (0 is treated as BOCU1_ASCII_PREV).
- *
- * When an encoded sequence does not fit completely, the bytes that fit go to
- * the target, the remaining ones go to cnv->charErrorBuffer, and
- * U_BUFFER_OVERFLOW_ERROR is set.
- *
- * NOTE(review): pArgs->offsets is written through without a null check;
- * presumably this variant is only selected when offsets are requested - confirm.
- *
- * @param pArgs converter plus source/target/offsets pointers; source, target and offsets are updated on return
- * @param pErrorCode set to U_BUFFER_OVERFLOW_ERROR when the target is too small
- */
- static void U_CALLCONV
- _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv;
- const char16_t *source, *sourceLimit;
- uint8_t *target;
- int32_t targetCapacity;
- int32_t *offsets;
- int32_t prev, c, diff;
- int32_t sourceIndex, nextSourceIndex;
- /* set up the local pointers */
- cnv=pArgs->converter;
- source=pArgs->source;
- sourceLimit=pArgs->sourceLimit;
- target=(uint8_t *)pArgs->target;
- targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
- offsets=pArgs->offsets;
- /* get the converter state from UConverter */
- c=cnv->fromUChar32;
- prev=(int32_t)cnv->fromUnicodeStatus;
- if(prev==0) {
- prev=BOCU1_ASCII_PREV;
- }
- /* sourceIndex=-1 if the current character began in the previous buffer */
- sourceIndex= c==0 ? 0 : -1;
- nextSourceIndex=0;
- /* conversion loop */
- /* c!=0: a lead surrogate is pending from the previous call; resume by looking for its trail */
- /*
- * NOTE(review): with a pending lead surrogate and targetCapacity==0 the jump is skipped,
- * c stays positive, and fromUChar32 is written back as 0 at the end of this function -
- * confirm that the caller never enters in that state.
- */
- if(c!=0 && targetCapacity>0) {
- goto getTrail;
- }
- fastSingle:
- /* fast loop for single-byte differences */
- /* use only one loop counter variable, targetCapacity, not also source */
- diff=(int32_t)(sourceLimit-source);
- if(targetCapacity>diff) {
- targetCapacity=diff;
- }
- /*
- * c<0x3000 keeps this loop below 0x3040, where BOCU1_PREV() starts calling bocu1Prev(),
- * and below the surrogates, so that BOCU1_SIMPLE_PREV() is sufficient here
- */
- while(targetCapacity>0 && (c=*source)<0x3000) {
- if(c<=0x20) {
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(uint8_t)c;
- *offsets++=nextSourceIndex++;
- ++source;
- --targetCapacity;
- } else {
- diff=c-prev;
- if(DIFF_IS_SINGLE(diff)) {
- prev=BOCU1_SIMPLE_PREV(c);
- *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- *offsets++=nextSourceIndex++;
- ++source;
- --targetCapacity;
- } else {
- /* needs more than one byte; source was not advanced, the regular loop re-reads this unit */
- break;
- }
- }
- }
- /* restore real values */
- targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
- sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
- /* regular loop for all cases */
- while(source<sourceLimit) {
- if(targetCapacity>0) {
- c=*source++;
- ++nextSourceIndex;
- if(c<=0x20) {
- /*
- * ISO C0 control & space:
- * Encode directly for MIME compatibility,
- * and reset state except for space, to not disrupt compression.
- */
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(uint8_t)c;
- *offsets++=sourceIndex;
- --targetCapacity;
- sourceIndex=nextSourceIndex;
- continue;
- }
- /*
- * Combine a lead surrogate with a following trail surrogate.
- * If the next unit is not a trail surrogate, c stays the unpaired lead
- * surrogate and is encoded like any other code point below.
- */
- if(U16_IS_LEAD(c)) {
- getTrail:
- if(source<sourceLimit) {
- /* test the following code unit */
- char16_t trail=*source;
- if(U16_IS_TRAIL(trail)) {
- ++source;
- ++nextSourceIndex;
- c=U16_GET_SUPPLEMENTARY(c, trail);
- }
- } else {
- /* no more input */
- c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
- break;
- }
- }
- /*
- * all other Unicode code points c==U+0021..U+10ffff
- * are encoded with the difference c-prev
- *
- * a new prev is computed from c,
- * placed in the middle of a 0x80-block (for most small scripts) or
- * in the middle of the Unihan and Hangul blocks
- * to statistically minimize the following difference
- */
- diff=c-prev;
- prev=BOCU1_PREV(c);
- if(DIFF_IS_SINGLE(diff)) {
- *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- *offsets++=sourceIndex;
- --targetCapacity;
- sourceIndex=nextSourceIndex;
- /* back to the fast loop only where its c<0x3000 precondition holds */
- if(c<0x3000) {
- goto fastSingle;
- }
- } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
- /* optimize 2-byte case */
- /* same arithmetic as the two-byte branches of packDiff(), but writing the bytes directly */
- int32_t m;
- if(diff>=0) {
- diff-=BOCU1_REACH_POS_1+1;
- m=diff%BOCU1_TRAIL_COUNT;
- diff/=BOCU1_TRAIL_COUNT;
- diff+=BOCU1_START_POS_2;
- } else {
- diff-=BOCU1_REACH_NEG_1;
- NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
- diff+=BOCU1_START_NEG_2;
- }
- *target++=(uint8_t)diff;
- *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
- *offsets++=sourceIndex;
- *offsets++=sourceIndex;
- targetCapacity-=2;
- sourceIndex=nextSourceIndex;
- } else {
- int32_t length; /* will be 2..4 */
- /* from here on diff holds the packed byte sequence, not the difference */
- diff=packDiff(diff);
- length=BOCU1_LENGTH_FROM_PACKED(diff);
- /* write the output character bytes from diff and length */
- /* from the first if in the loop we know that targetCapacity>0 */
- if(length<=targetCapacity) {
- switch(length) {
- /* each branch falls through to the next one */
- case 4:
- *target++=(uint8_t)(diff>>24);
- *offsets++=sourceIndex;
- U_FALLTHROUGH;
- case 3:
- *target++=(uint8_t)(diff>>16);
- *offsets++=sourceIndex;
- U_FALLTHROUGH;
- case 2:
- /* the last two bytes of every sequence are written together here */
- *target++=(uint8_t)(diff>>8);
- *offsets++=sourceIndex;
- /* case 1: handled above */
- *target++=(uint8_t)diff;
- *offsets++=sourceIndex;
- U_FALLTHROUGH;
- default:
- /* will never occur */
- break;
- }
- targetCapacity-=length;
- sourceIndex=nextSourceIndex;
- } else {
- uint8_t *charErrorBuffer;
- /*
- * We actually do this backwards here:
- * In order to save an intermediate variable, we output
- * first to the overflow buffer what does not fit into the
- * regular target.
- */
- /* we know that 1<=targetCapacity<length<=4 */
- /* length becomes the number of trailing bytes that overflow, 1..3 */
- length-=targetCapacity;
- charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
- switch(length) {
- /* each branch falls through to the next one */
- case 3:
- *charErrorBuffer++=(uint8_t)(diff>>16);
- U_FALLTHROUGH;
- case 2:
- *charErrorBuffer++=(uint8_t)(diff>>8);
- U_FALLTHROUGH;
- case 1:
- *charErrorBuffer=(uint8_t)diff;
- U_FALLTHROUGH;
- default:
- /* will never occur */
- break;
- }
- cnv->charErrorBufferLength=(int8_t)length;
- /* now output what fits into the regular target */
- /* shift out the bytes just saved so that the leading bytes end up in the low positions */
- diff>>=8*length; /* length was reduced by targetCapacity */
- switch(targetCapacity) {
- /* each branch falls through to the next one */
- case 3:
- *target++=(uint8_t)(diff>>16);
- *offsets++=sourceIndex;
- U_FALLTHROUGH;
- case 2:
- *target++=(uint8_t)(diff>>8);
- *offsets++=sourceIndex;
- U_FALLTHROUGH;
- case 1:
- *target++=(uint8_t)diff;
- *offsets++=sourceIndex;
- U_FALLTHROUGH;
- default:
- /* will never occur */
- break;
- }
- /* target overflow */
- targetCapacity=0;
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- } else {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- /* set the converter state back into UConverter */
- /* c<0 only after the "no more input" break above: keep that lead surrogate for the next call */
- cnv->fromUChar32= c<0 ? -c : 0;
- cnv->fromUnicodeStatus=(uint32_t)prev;
- /* write back the updated pointers */
- pArgs->source=source;
- pArgs->target=(char *)target;
- pArgs->offsets=offsets;
- }
- /*
- * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
- * If a change is made in the original function, then either
- * change this function the same way or
- * re-copy the original function and remove the variables
- * offsets, sourceIndex, and nextSourceIndex.
- *
- * Converts UTF-16 code units from pArgs->source to BOCU-1 bytes in pArgs->target.
- * Converter state read on entry and written back on exit:
- * cnv->fromUChar32 holds a lead surrogate that ended the previous source buffer (0 if none);
- * cnv->fromUnicodeStatus holds the encoder's "prev" (0 is treated as BOCU1_ASCII_PREV).
- * When an encoded sequence does not fit completely, the bytes that fit go to
- * the target, the remaining ones go to cnv->charErrorBuffer, and
- * U_BUFFER_OVERFLOW_ERROR is set in *pErrorCode.
- * On return pArgs->source and pArgs->target are updated.
- */
- static void U_CALLCONV
- _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv;
- const char16_t *source, *sourceLimit;
- uint8_t *target;
- int32_t targetCapacity;
- int32_t prev, c, diff;
- /* set up the local pointers */
- cnv=pArgs->converter;
- source=pArgs->source;
- sourceLimit=pArgs->sourceLimit;
- target=(uint8_t *)pArgs->target;
- targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
- /* get the converter state from UConverter */
- c=cnv->fromUChar32;
- prev=(int32_t)cnv->fromUnicodeStatus;
- if(prev==0) {
- prev=BOCU1_ASCII_PREV;
- }
- /* conversion loop */
- /* c!=0: a lead surrogate is pending from the previous call; resume by looking for its trail */
- /*
- * NOTE(review): with a pending lead surrogate and targetCapacity==0 the jump is skipped,
- * c stays positive, and fromUChar32 is written back as 0 at the end of this function -
- * confirm that the caller never enters in that state.
- */
- if(c!=0 && targetCapacity>0) {
- goto getTrail;
- }
- fastSingle:
- /* fast loop for single-byte differences */
- /* use only one loop counter variable, targetCapacity, not also source */
- diff=(int32_t)(sourceLimit-source);
- if(targetCapacity>diff) {
- targetCapacity=diff;
- }
- /*
- * c<0x3000 keeps this loop below 0x3040, where BOCU1_PREV() starts calling bocu1Prev(),
- * and below the surrogates, so that BOCU1_SIMPLE_PREV() is sufficient here
- */
- while(targetCapacity>0 && (c=*source)<0x3000) {
- if(c<=0x20) {
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(uint8_t)c;
- } else {
- diff=c-prev;
- if(DIFF_IS_SINGLE(diff)) {
- prev=BOCU1_SIMPLE_PREV(c);
- *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- } else {
- /* needs more than one byte; source was not advanced, the regular loop re-reads this unit */
- break;
- }
- }
- ++source;
- --targetCapacity;
- }
- /* restore real values */
- targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
- /* regular loop for all cases */
- while(source<sourceLimit) {
- if(targetCapacity>0) {
- c=*source++;
- if(c<=0x20) {
- /*
- * ISO C0 control & space:
- * Encode directly for MIME compatibility,
- * and reset state except for space, to not disrupt compression.
- */
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(uint8_t)c;
- --targetCapacity;
- continue;
- }
- /*
- * Combine a lead surrogate with a following trail surrogate.
- * If the next unit is not a trail surrogate, c stays the unpaired lead
- * surrogate and is encoded like any other code point below.
- */
- if(U16_IS_LEAD(c)) {
- getTrail:
- if(source<sourceLimit) {
- /* test the following code unit */
- char16_t trail=*source;
- if(U16_IS_TRAIL(trail)) {
- ++source;
- c=U16_GET_SUPPLEMENTARY(c, trail);
- }
- } else {
- /* no more input */
- c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
- break;
- }
- }
- /*
- * all other Unicode code points c==U+0021..U+10ffff
- * are encoded with the difference c-prev
- *
- * a new prev is computed from c,
- * placed in the middle of a 0x80-block (for most small scripts) or
- * in the middle of the Unihan and Hangul blocks
- * to statistically minimize the following difference
- */
- diff=c-prev;
- prev=BOCU1_PREV(c);
- if(DIFF_IS_SINGLE(diff)) {
- *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
- --targetCapacity;
- /* back to the fast loop only where its c<0x3000 precondition holds */
- if(c<0x3000) {
- goto fastSingle;
- }
- } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
- /* optimize 2-byte case */
- /* same arithmetic as the two-byte branches of packDiff(), but writing the bytes directly */
- int32_t m;
- if(diff>=0) {
- diff-=BOCU1_REACH_POS_1+1;
- m=diff%BOCU1_TRAIL_COUNT;
- diff/=BOCU1_TRAIL_COUNT;
- diff+=BOCU1_START_POS_2;
- } else {
- diff-=BOCU1_REACH_NEG_1;
- NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
- diff+=BOCU1_START_NEG_2;
- }
- *target++=(uint8_t)diff;
- *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
- targetCapacity-=2;
- } else {
- int32_t length; /* will be 2..4 */
- /* from here on diff holds the packed byte sequence, not the difference */
- diff=packDiff(diff);
- length=BOCU1_LENGTH_FROM_PACKED(diff);
- /* write the output character bytes from diff and length */
- /* from the first if in the loop we know that targetCapacity>0 */
- if(length<=targetCapacity) {
- /*
- * There is no case 2 label: a two-byte sequence with targetCapacity>=2
- * was taken by the optimized branch above, so only lengths 3 and 4 fit here.
- */
- switch(length) {
- /* each branch falls through to the next one */
- case 4:
- *target++=(uint8_t)(diff>>24);
- U_FALLTHROUGH;
- case 3:
- *target++=(uint8_t)(diff>>16);
- /* case 2: handled above */
- *target++=(uint8_t)(diff>>8);
- /* case 1: handled above */
- *target++=(uint8_t)diff;
- U_FALLTHROUGH;
- default:
- /* will never occur */
- break;
- }
- targetCapacity-=length;
- } else {
- uint8_t *charErrorBuffer;
- /*
- * We actually do this backwards here:
- * In order to save an intermediate variable, we output
- * first to the overflow buffer what does not fit into the
- * regular target.
- */
- /* we know that 1<=targetCapacity<length<=4 */
- /* length becomes the number of trailing bytes that overflow, 1..3 */
- length-=targetCapacity;
- charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
- switch(length) {
- /* each branch falls through to the next one */
- case 3:
- *charErrorBuffer++=(uint8_t)(diff>>16);
- U_FALLTHROUGH;
- case 2:
- *charErrorBuffer++=(uint8_t)(diff>>8);
- U_FALLTHROUGH;
- case 1:
- *charErrorBuffer=(uint8_t)diff;
- U_FALLTHROUGH;
- default:
- /* will never occur */
- break;
- }
- cnv->charErrorBufferLength=(int8_t)length;
- /* now output what fits into the regular target */
- /* shift out the bytes just saved so that the leading bytes end up in the low positions */
- diff>>=8*length; /* length was reduced by targetCapacity */
- switch(targetCapacity) {
- /* each branch falls through to the next one */
- case 3:
- *target++=(uint8_t)(diff>>16);
- U_FALLTHROUGH;
- case 2:
- *target++=(uint8_t)(diff>>8);
- U_FALLTHROUGH;
- case 1:
- *target++=(uint8_t)diff;
- U_FALLTHROUGH;
- default:
- /* will never occur */
- break;
- }
- /* target overflow */
- targetCapacity=0;
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- } else {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- /* set the converter state back into UConverter */
- /* c<0 only after the "no more input" break above: keep that lead surrogate for the next call */
- cnv->fromUChar32= c<0 ? -c : 0;
- cnv->fromUnicodeStatus=(uint32_t)prev;
- /* write back the updated pointers */
- pArgs->source=source;
- pArgs->target=(char *)target;
- }
- /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
- /**
-  * BOCU-1 decoder helper: start decoding a multi-byte sequence from its lead byte.
-  *
-  * The lead byte determines how many trail bytes follow and the base value
-  * of the difference, to which the trail byte contributions are then added.
-  * The ranges are tested from the lowest lead byte value upwards:
-  * four/three/two-byte negative, then two/three/four-byte positive.
-  *
-  * @param b lead byte;
-  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
-  * @return (diff<<2)|count, with count=number of trail bytes still to be read
-  */
- static inline int32_t
- decodeBocu1LeadByte(int32_t b) {
-     int32_t base, trailBytes;
-     if(b<=BOCU1_MIN) {
-         /* negative difference, four bytes */
-         base=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
-         trailBytes=3;
-     } else if(b<BOCU1_START_NEG_3) {
-         /* negative difference, three bytes */
-         base=(b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
-         trailBytes=2;
-     } else if(b<BOCU1_START_NEG_2) {
-         /* negative difference, two bytes */
-         base=(b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
-         trailBytes=1;
-     } else if(b<BOCU1_START_POS_3) {
-         /* positive difference, two bytes */
-         base=(b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
-         trailBytes=1;
-     } else if(b<BOCU1_START_POS_4) {
-         /* positive difference, three bytes */
-         base=(b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
-         trailBytes=2;
-     } else {
-         /* positive difference, four bytes */
-         base=BOCU1_REACH_POS_3+1;
-         trailBytes=3;
-     }
-     /* pack the state for decoding the trail byte(s); shift as unsigned because base may be negative */
-     return ((uint32_t)base<<2)|trailBytes;
- }
- /**
-  * BOCU-1 decoder helper: compute what one trail byte adds to the difference.
-  *
-  * The external byte is first mapped to its internal trail value
-  * (through bocu1ByteToTrail[] for bytes up to 0x20, otherwise by subtracting
-  * BOCU1_TRAIL_BYTE_OFFSET) and then weighted by its position in the sequence.
-  *
-  * @param count number of remaining trail bytes including this one
-  * @param b trail byte
-  * @return new delta for diff including b - <0 indicates an error
-  *
-  * @see decodeBocu1
-  */
- static inline int32_t
- decodeBocu1TrailByte(int32_t count, int32_t b) {
-     int32_t trail;
-     if(b>0x20) {
- #if BOCU1_MAX_TRAIL<0xff
-         if(b>BOCU1_MAX_TRAIL) {
-             return -99;
-         }
- #endif
-         trail=b-BOCU1_TRAIL_BYTE_OFFSET;
-     } else {
-         /*
-          * C0 control or space: the table makes the trail value range contiguous;
-          * an illegal trail byte maps to -1, which makes the result below negative
-          */
-         trail=bocu1ByteToTrail[b];
-     }
-     /* weight the trail value by its position: last, second-to-last, third-to-last */
-     switch(count) {
-     case 1:
-         return trail;
-     case 2:
-         return trail*BOCU1_TRAIL_COUNT;
-     default: /* count==3 */
-         return trail*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
-     }
- }
- static void U_CALLCONV
- _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv;
- const uint8_t *source, *sourceLimit;
- char16_t *target;
- const char16_t *targetLimit;
- int32_t *offsets;
- int32_t prev, count, diff, c;
- int8_t byteIndex;
- uint8_t *bytes;
- int32_t sourceIndex, nextSourceIndex;
- /* set up the local pointers */
- cnv=pArgs->converter;
- source=(const uint8_t *)pArgs->source;
- sourceLimit=(const uint8_t *)pArgs->sourceLimit;
- target=pArgs->target;
- targetLimit=pArgs->targetLimit;
- offsets=pArgs->offsets;
- /* get the converter state from UConverter */
- prev=(int32_t)cnv->toUnicodeStatus;
- if(prev==0) {
- prev=BOCU1_ASCII_PREV;
- }
- diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
- count=diff&3;
- diff>>=2;
- byteIndex=cnv->toULength;
- bytes=cnv->toUBytes;
- /* sourceIndex=-1 if the current character began in the previous buffer */
- sourceIndex=byteIndex==0 ? 0 : -1;
- nextSourceIndex=0;
- /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
- if(count>0 && byteIndex>0 && target<targetLimit) {
- goto getTrail;
- }
- fastSingle:
- /* fast loop for single-byte differences */
- /* use count as the only loop counter variable */
- diff=(int32_t)(sourceLimit-source);
- count=(int32_t)(pArgs->targetLimit-target);
- if(count>diff) {
- count=diff;
- }
- while(count>0) {
- if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
- c=prev+(c-BOCU1_MIDDLE);
- if(c<0x3000) {
- *target++=(char16_t)c;
- *offsets++=nextSourceIndex++;
- prev=BOCU1_SIMPLE_PREV(c);
- } else {
- break;
- }
- } else if(c<=0x20) {
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(char16_t)c;
- *offsets++=nextSourceIndex++;
- } else {
- break;
- }
- ++source;
- --count;
- }
- sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
- /* decode a sequence of single and lead bytes */
- while(source<sourceLimit) {
- if(target>=targetLimit) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- ++nextSourceIndex;
- c=*source++;
- if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
- /* Write a code point directly from a single-byte difference. */
- c=prev+(c-BOCU1_MIDDLE);
- if(c<0x3000) {
- *target++=(char16_t)c;
- *offsets++=sourceIndex;
- prev=BOCU1_SIMPLE_PREV(c);
- sourceIndex=nextSourceIndex;
- goto fastSingle;
- }
- } else if(c<=0x20) {
- /*
- * Direct-encoded C0 control code or space.
- * Reset prev for C0 control codes but not for space.
- */
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(char16_t)c;
- *offsets++=sourceIndex;
- sourceIndex=nextSourceIndex;
- continue;
- } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
- /* Optimize two-byte case. */
- if(c>=BOCU1_MIDDLE) {
- diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
- } else {
- diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
- }
- /* trail byte */
- ++nextSourceIndex;
- c=decodeBocu1TrailByte(1, *source++);
- if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
- bytes[0]=source[-2];
- bytes[1]=source[-1];
- byteIndex=2;
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- break;
- }
- } else if(c==BOCU1_RESET) {
- /* only reset the state, no code point */
- prev=BOCU1_ASCII_PREV;
- sourceIndex=nextSourceIndex;
- continue;
- } else {
- /*
- * For multi-byte difference lead bytes, set the decoder state
- * with the partial difference value from the lead byte and
- * with the number of trail bytes.
- */
- bytes[0]=(uint8_t)c;
- byteIndex=1;
- diff=decodeBocu1LeadByte(c);
- count=diff&3;
- diff>>=2;
- getTrail:
- for(;;) {
- if(source>=sourceLimit) {
- goto endloop;
- }
- ++nextSourceIndex;
- c=bytes[byteIndex++]=*source++;
- /* trail byte in any position */
- c=decodeBocu1TrailByte(count, c);
- if(c<0) {
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- goto endloop;
- }
- diff+=c;
- if(--count==0) {
- /* final trail byte, deliver a code point */
- byteIndex=0;
- c=prev+diff;
- if((uint32_t)c>0x10ffff) {
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- goto endloop;
- }
- break;
- }
- }
- }
- /* calculate the next prev and output c */
- prev=BOCU1_PREV(c);
- if(c<=0xffff) {
- *target++=(char16_t)c;
- *offsets++=sourceIndex;
- } else {
- /* output surrogate pair */
- *target++=U16_LEAD(c);
- if(target<targetLimit) {
- *target++=U16_TRAIL(c);
- *offsets++=sourceIndex;
- *offsets++=sourceIndex;
- } else {
- /* target overflow */
- *offsets++=sourceIndex;
- cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
- cnv->UCharErrorBufferLength=1;
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- sourceIndex=nextSourceIndex;
- }
- endloop:
- if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
- /* set the converter state in UConverter to deal with the next character */
- cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
- cnv->mode=0;
- } else {
- /* set the converter state back into UConverter */
- cnv->toUnicodeStatus=(uint32_t)prev;
- cnv->mode=(int32_t)((uint32_t)diff<<2)|count;
- }
- cnv->toULength=byteIndex;
- /* write back the updated pointers */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- pArgs->offsets=offsets;
- return;
- }
- /*
- * Decodes BOCU-1 bytes from pArgs->source into UTF-16 code units at
- * pArgs->target; no offsets are recorded.
- *
- * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
- * If a change is made in the original function, then either
- * change this function the same way or
- * re-copy the original function and remove the variables
- * offsets, sourceIndex, and nextSourceIndex.
- *
- * pArgs: converter, source/sourceLimit and target/targetLimit are read;
- * source and target are written back, advanced past what was consumed
- * and produced.
- * pErrorCode: set to U_BUFFER_OVERFLOW_ERROR when the target is full while
- * source bytes remain or when only the lead surrogate of a pair fits, and to
- * U_ILLEGAL_CHAR_FOUND when a trail byte is rejected or the decoded value
- * lies outside 0..0x10ffff. It is not inspected on entry.
- *
- * State kept in the UConverter between calls:
- * toUnicodeStatus - prev, the base value the next difference is added to
- * (0 is read as BOCU1_ASCII_PREV)
- * mode - (diff<<2)|count: partial difference and number of
- * trail bytes still expected
- * toUBytes/toULength - bytes stored for the current multi-byte sequence
- * and how many of them there are
- */
- static void U_CALLCONV
- _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv;
- const uint8_t *source, *sourceLimit;
- char16_t *target;
- const char16_t *targetLimit;
- int32_t prev, count, diff, c;
- int8_t byteIndex;
- uint8_t *bytes;
- /* set up the local pointers */
- cnv=pArgs->converter;
- source=(const uint8_t *)pArgs->source;
- sourceLimit=(const uint8_t *)pArgs->sourceLimit;
- target=pArgs->target;
- targetLimit=pArgs->targetLimit;
- /* get the converter state from UConverter */
- prev=(int32_t)cnv->toUnicodeStatus;
- if(prev==0) {
- prev=BOCU1_ASCII_PREV; /* a stored status of 0 stands for the initial state */
- }
- diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
- count=diff&3; /* low two bits of mode: trail bytes still expected */
- diff>>=2; /* remaining bits: partial difference accumulated so far */
- byteIndex=cnv->toULength; /* number of bytes already stored in bytes[] */
- bytes=cnv->toUBytes;
- /* conversion "loop" similar to _SCSUToUnicodeWithOffsets()
- * A sequence left incomplete by the previous call (trail bytes pending and
- * at least one byte stored) is resumed in the middle of the trail-byte loop.
- * NOTE(review): when such a sequence is pending but target is already full,
- * control falls through to fastSingle, which overwrites diff and count, so
- * the mode saved at endloop no longer describes the pending sequence -
- * confirm that callers never enter with a full target. */
- if(count>0 && byteIndex>0 && target<targetLimit) {
- goto getTrail;
- }
- fastSingle:
- /* fast loop for single-byte differences */
- /* use count as the only loop counter variable:
- * it is the smaller of the remaining source bytes and the remaining target
- * room, so neither limit has to be tested inside the loop */
- diff=(int32_t)(sourceLimit-source); /* diff is only a scratch value here: source bytes left */
- count=(int32_t)(pArgs->targetLimit-target); /* code units of room left in target */
- if(count>diff) {
- count=diff;
- }
- while(count>0) {
- if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { /* single-byte difference */
- c=prev+(c-BOCU1_MIDDLE);
- if(c<0x3000) {
- *target++=(char16_t)c;
- prev=BOCU1_SIMPLE_PREV(c);
- } else {
- break; /* values from 0x3000 up are left to the main loop; the byte is not consumed here */
- }
- } else if(c<=0x20) { /* C0 control code or space, stored as itself */
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV; /* controls reset prev, space does not */
- }
- *target++=(char16_t)c;
- } else {
- break; /* any other byte is handled by the main loop; it is not consumed here */
- }
- ++source;
- --count;
- }
- /* decode a sequence of single and lead bytes
- * (general loop: checks both limits on every iteration and handles
- * multi-byte sequences, the reset byte and supplementary code points) */
- while(source<sourceLimit) {
- if(target>=targetLimit) {
- /* target is full */
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- c=*source++;
- if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
- /* Write a code point directly from a single-byte difference. */
- c=prev+(c-BOCU1_MIDDLE);
- if(c<0x3000) {
- *target++=(char16_t)c;
- prev=BOCU1_SIMPLE_PREV(c);
- goto fastSingle; /* go back to the fast loop with fresh limits */
- } /* otherwise fall through to the common output code after the if-chain */
- } else if(c<=0x20) {
- /*
- * Direct-encoded C0 control code or space.
- * Reset prev for C0 control codes but not for space.
- */
- if(c!=0x20) {
- prev=BOCU1_ASCII_PREV;
- }
- *target++=(char16_t)c;
- continue;
- } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
- /* Optimize two-byte case.
- * Taken only when the trail byte is already available; a two-byte lead
- * at the very end of the input goes through the general branch below. */
- if(c>=BOCU1_MIDDLE) {
- diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
- } else {
- diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
- }
- /* trail byte */
- c=decodeBocu1TrailByte(1, *source++);
- if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { /* rejected trail byte or result out of range */
- bytes[0]=source[-2]; /* store both bytes of the rejected sequence */
- bytes[1]=source[-1];
- byteIndex=2;
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- break;
- }
- } else if(c==BOCU1_RESET) {
- /* only reset the state, no code point */
- prev=BOCU1_ASCII_PREV;
- continue;
- } else {
- /*
- * For multi-byte difference lead bytes, set the decoder state
- * with the partial difference value from the lead byte and
- * with the number of trail bytes.
- * decodeBocu1LeadByte() returns both packed as (diff<<2)|count,
- * the same layout that is stored in cnv->mode.
- */
- bytes[0]=(uint8_t)c;
- byteIndex=1;
- diff=decodeBocu1LeadByte(c);
- count=diff&3;
- diff>>=2;
- getTrail:
- for(;;) {
- if(source>=sourceLimit) { /* input ends mid-sequence: diff, count and byteIndex are saved at endloop */
- goto endloop;
- }
- c=bytes[byteIndex++]=*source++; /* keep a copy of every byte of the sequence */
- /* trail byte in any position */
- c=decodeBocu1TrailByte(count, c);
- if(c<0) {
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- goto endloop;
- }
- diff+=c;
- if(--count==0) {
- /* final trail byte, deliver a code point
- * byteIndex is cleared before the range check, so toULength is
- * stored as 0 even when the value is rejected just below */
- byteIndex=0;
- c=prev+diff;
- if((uint32_t)c>0x10ffff) {
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- goto endloop;
- }
- break;
- }
- }
- }
- /* calculate the next prev and output c
- * (reached for single-byte results from 0x3000 up and for all
- * successfully decoded multi-byte sequences) */
- prev=BOCU1_PREV(c);
- if(c<=0xffff) {
- *target++=(char16_t)c;
- } else {
- /* output surrogate pair; room for the lead unit was checked at the top of the loop */
- *target++=U16_LEAD(c);
- if(target<targetLimit) {
- *target++=U16_TRAIL(c);
- } else {
- /* target overflow: the trail surrogate is stored in the converter's
- * UCharErrorBuffer instead of the target */
- cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
- cnv->UCharErrorBufferLength=1;
- *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
- break;
- }
- }
- }
- endloop:
- if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
- /* set the converter state in UConverter to deal with the next character
- * (decoding restarts from the initial prev with no pending trail bytes) */
- cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
- cnv->mode=0;
- } else {
- /* set the converter state back into UConverter */
- cnv->toUnicodeStatus=(uint32_t)prev;
- cnv->mode=((uint32_t)diff<<2)|count; /* same packing that is unpacked at the top of the function */
- }
- cnv->toULength=byteIndex;
- /* write back the updated pointers */
- pArgs->source=(const char *)source;
- pArgs->target=target;
- return;
- }
- /* miscellaneous ------------------------------------------------------------ */
- static const UConverterImpl _Bocu1Impl={ /* implementation table for the BOCU-1 converter; entries are positional. NOTE(review): slot names are declared in UConverterImpl, outside this chunk - confirm order there */
- UCNV_BOCU1,
- nullptr, /* this and the next four slots are left unset */
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- _Bocu1ToUnicode, /* bytes to UTF-16, no offsets */
- _Bocu1ToUnicodeWithOffsets, /* same conversion with offset handling */
- _Bocu1FromUnicode, /* presumably the reverse direction without offsets - defined outside this chunk */
- _Bocu1FromUnicodeWithOffsets, /* presumably the reverse direction with offsets - defined outside this chunk */
- nullptr, /* this and the next four slots are left unset */
- nullptr,
- nullptr,
- nullptr,
- nullptr,
- ucnv_getCompleteUnicodeSet, /* presumably reports the full Unicode set as convertible - TODO confirm */
- nullptr, /* the last two slots are left unset */
- nullptr
- };
- static const UConverterStaticData _Bocu1StaticData={ /* constant description of the BOCU-1 converter; entries are positional */
- sizeof(UConverterStaticData), /* size of this structure */
- "BOCU-1", /* name string of the converter */
- 1214, /* CCSID for BOCU-1 */
- UCNV_IBM, UCNV_BOCU1, /* presumably platform and converter type - confirm against UConverterStaticData */
- 1, 4, /* one char16_t generates at least 1 byte and at most 4 bytes */
- { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
- false, false, /* NOTE(review): two boolean fields whose names are not visible in this chunk */
- 0,
- 0,
- { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
- };
- const UConverterSharedData _Bocu1Data= /* non-static: the only definition in this section that is visible outside this file */
- UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); /* ties the static data and the implementation table together */
- #endif
|