ucnvbocu.cpp 45 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2002-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: ucnvbocu.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002mar27
  16. * created by: Markus W. Scherer
  17. *
  18. * This is an implementation of the Binary Ordered Compression for Unicode,
  19. * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
  20. */
  21. #include "unicode/utypes.h"
  22. #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  23. #include "unicode/ucnv.h"
  24. #include "unicode/ucnv_cb.h"
  25. #include "unicode/utf16.h"
  26. #include "putilimp.h"
  27. #include "ucnv_bld.h"
  28. #include "ucnv_cnv.h"
  29. #include "uassert.h"
  30. /* BOCU-1 constants and macros ---------------------------------------------- */
  31. /*
  32. * BOCU-1 encodes the code points of a Unicode string as
  33. * a sequence of byte-encoded differences (slope detection),
  34. * preserving lexical order.
  35. *
  36. * Optimize the difference-taking for runs of Unicode text within
  37. * small scripts:
  38. *
  39. * Most small scripts are allocated within aligned 128-blocks of Unicode
  40. * code points. Lexical order is preserved if the "previous code point" state
  41. * is always moved into the middle of such a block.
  42. *
  43. * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
  44. * areas into the middle of those areas.
  45. *
  46. * C0 control codes and space are encoded with their US-ASCII bytes.
  47. * "prev" is reset for C0 controls but not for space.
  48. */
  49. /* initial value for "prev": middle of the ASCII range */
  50. #define BOCU1_ASCII_PREV 0x40
  51. /* bounding byte values for differences */
  52. #define BOCU1_MIN 0x21
  53. #define BOCU1_MIDDLE 0x90
  54. #define BOCU1_MAX_LEAD 0xfe
  55. #define BOCU1_MAX_TRAIL 0xff
  56. #define BOCU1_RESET 0xff
  57. /* number of lead bytes */
  58. #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)
  59. /* adjust trail byte counts for the use of some C0 control byte values */
  60. #define BOCU1_TRAIL_CONTROLS_COUNT 20
  61. #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
  62. /* number of trail bytes */
  63. #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
  64. /*
  65. * number of positive and negative single-byte codes
  66. * (counting 0==BOCU1_MIDDLE among the positive ones)
  67. */
  68. #define BOCU1_SINGLE 64
  69. /* number of lead bytes for positive and negative 2/3/4-byte sequences */
  70. #define BOCU1_LEAD_2 43
  71. #define BOCU1_LEAD_3 3
  72. #define BOCU1_LEAD_4 1
  73. /* The difference value range for single-byters. */
  74. #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
  75. #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)
  76. /* The difference value range for double-byters. */
  77. #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  78. #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
  79. /* The difference value range for 3-byters. */
  80. #define BOCU1_REACH_POS_3 \
  81. (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  82. #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
  83. /* The lead byte start values. */
  84. #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
  85. #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
  86. #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
  87. /* ==BOCU1_MAX_LEAD */
  88. #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
  89. #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
  90. #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
  91. /* ==BOCU1_MIN+1 */
  92. /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
  93. #define BOCU1_LENGTH_FROM_LEAD(lead) \
  94. ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
  95. (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
  96. (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
  97. /* The length of a byte sequence, according to its packed form. */
  98. #define BOCU1_LENGTH_FROM_PACKED(packed) \
  99. ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
  100. /*
  101. * 12 commonly used C0 control codes (and space) are only used to encode
  102. * themselves directly,
  103. * which makes BOCU-1 MIME-usable and reasonably safe for
  104. * ASCII-oriented software.
  105. *
  106. * These controls are
  107. * 0 NUL
  108. *
  109. * 7 BEL
  110. * 8 BS
  111. *
  112. * 9 TAB
  113. * a LF
  114. * b VT
  115. * c FF
  116. * d CR
  117. *
  118. * e SO
  119. * f SI
  120. *
  121. * 1a SUB
  122. * 1b ESC
  123. *
  124. * The other 20 C0 controls are also encoded directly (to preserve order)
  125. * but are also used as trail bytes in difference encoding
  126. * (for better compression).
  127. */
  128. #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
  129. /*
  130. * Byte value map for control codes,
  131. * from external byte values 0x00..0x20
  132. * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
  133. * External byte values that are illegal as trail bytes are mapped to -1.
  134. */
  135. static const int8_t
  136. bocu1ByteToTrail[BOCU1_MIN]={
  137. /* 0 1 2 3 4 5 6 7 */
  138. -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
  139. /* 8 9 a b c d e f */
  140. -1, -1, -1, -1, -1, -1, -1, -1,
  141. /* 10 11 12 13 14 15 16 17 */
  142. 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
  143. /* 18 19 1a 1b 1c 1d 1e 1f */
  144. 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13,
  145. /* 20 */
  146. -1
  147. };
  148. /*
  149. * Byte value map for control codes,
  150. * from trail byte values 0..19 (0..0x13) as used in the difference calculation
  151. * to external byte values 0x00..0x20.
  152. */
  153. static const int8_t
  154. bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
  155. /* 0 1 2 3 4 5 6 7 */
  156. 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
  157. /* 8 9 a b c d e f */
  158. 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
  159. /* 10 11 12 13 */
  160. 0x1c, 0x1d, 0x1e, 0x1f
  161. };
  162. /**
  163. * Integer division and modulo with negative numerators
  164. * yields negative modulo results and quotients that are one more than
  165. * what we need here.
  166. * This macro adjust the results so that the modulo-value m is always >=0.
  167. *
  168. * For positive n, the if() condition is always false.
  169. *
  170. * @param n Number to be split into quotient and rest.
  171. * Will be modified to contain the quotient.
  172. * @param d Divisor.
  173. * @param m Output variable for the rest (modulo result).
  174. */
  175. #define NEGDIVMOD(n, d, m) UPRV_BLOCK_MACRO_BEGIN { \
  176. (m)=(n)%(d); \
  177. (n)/=(d); \
  178. if((m)<0) { \
  179. --(n); \
  180. (m)+=(d); \
  181. } \
  182. } UPRV_BLOCK_MACRO_END
  183. /* Faster versions of packDiff() for single-byte-encoded diff values. */
  184. /** Is a diff value encodable in a single byte? */
  185. #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
  186. /** Encode a diff value in a single byte. */
  187. #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
  188. /** Is a diff value encodable in two bytes? */
  189. #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
  190. /* BOCU-1 implementation functions ------------------------------------------ */
  191. #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
  192. /**
  193. * Compute the next "previous" value for differencing
  194. * from the current code point.
  195. *
  196. * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
  197. * @return "previous code point" state value
  198. */
  199. static inline int32_t
  200. bocu1Prev(int32_t c) {
  201. /* compute new prev */
  202. if(/* 0x3040<=c && */ c<=0x309f) {
  203. /* Hiragana is not 128-aligned */
  204. return 0x3070;
  205. } else if(0x4e00<=c && c<=0x9fa5) {
  206. /* CJK Unihan */
  207. return 0x4e00-BOCU1_REACH_NEG_2;
  208. } else if(0xac00<=c /* && c<=0xd7a3 */) {
  209. /* Korean Hangul */
  210. return (0xd7a3+0xac00)/2;
  211. } else {
  212. /* mostly small scripts */
  213. return BOCU1_SIMPLE_PREV(c);
  214. }
  215. }
  216. /** Fast version of bocu1Prev() for most scripts. */
  217. #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
  218. /*
  219. * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
  220. * The UConverter fields are used as follows:
  221. *
  222. * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
  223. *
  224. * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
  225. * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0)
  226. */
  227. /* BOCU-1-from-Unicode conversion functions --------------------------------- */
  228. /**
  229. * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
  230. * and return a packed integer with them.
  231. *
  232. * The encoding favors small absolute differences with short encodings
  233. * to compress runs of same-script characters.
  234. *
  235. * Optimized version with unrolled loops and fewer floating-point operations
  236. * than the standard packDiff().
  237. *
  238. * @param diff difference value -0x10ffff..0x10ffff
  239. * @return
  240. * 0x010000zz for 1-byte sequence zz
  241. * 0x0200yyzz for 2-byte sequence yy zz
  242. * 0x03xxyyzz for 3-byte sequence xx yy zz
  243. * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
  244. */
  245. static int32_t
  246. packDiff(int32_t diff) {
  247. int32_t result, m;
  248. U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
  249. if(diff>=BOCU1_REACH_NEG_1) {
  250. /* mostly positive differences, and single-byte negative ones */
  251. #if 0 /* single-byte case handled in macros, see below */
  252. if(diff<=BOCU1_REACH_POS_1) {
  253. /* single byte */
  254. return 0x01000000|(BOCU1_MIDDLE+diff);
  255. } else
  256. #endif
  257. if(diff<=BOCU1_REACH_POS_2) {
  258. /* two bytes */
  259. diff-=BOCU1_REACH_POS_1+1;
  260. result=0x02000000;
  261. m=diff%BOCU1_TRAIL_COUNT;
  262. diff/=BOCU1_TRAIL_COUNT;
  263. result|=BOCU1_TRAIL_TO_BYTE(m);
  264. result|=(BOCU1_START_POS_2+diff)<<8;
  265. } else if(diff<=BOCU1_REACH_POS_3) {
  266. /* three bytes */
  267. diff-=BOCU1_REACH_POS_2+1;
  268. result=0x03000000;
  269. m=diff%BOCU1_TRAIL_COUNT;
  270. diff/=BOCU1_TRAIL_COUNT;
  271. result|=BOCU1_TRAIL_TO_BYTE(m);
  272. m=diff%BOCU1_TRAIL_COUNT;
  273. diff/=BOCU1_TRAIL_COUNT;
  274. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  275. result|=(BOCU1_START_POS_3+diff)<<16;
  276. } else {
  277. /* four bytes */
  278. diff-=BOCU1_REACH_POS_3+1;
  279. m=diff%BOCU1_TRAIL_COUNT;
  280. diff/=BOCU1_TRAIL_COUNT;
  281. result=BOCU1_TRAIL_TO_BYTE(m);
  282. m=diff%BOCU1_TRAIL_COUNT;
  283. diff/=BOCU1_TRAIL_COUNT;
  284. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  285. /*
  286. * We know that / and % would deliver quotient 0 and rest=diff.
  287. * Avoid division and modulo for performance.
  288. */
  289. result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
  290. result|=((uint32_t)BOCU1_START_POS_4)<<24;
  291. }
  292. } else {
  293. /* two- to four-byte negative differences */
  294. if(diff>=BOCU1_REACH_NEG_2) {
  295. /* two bytes */
  296. diff-=BOCU1_REACH_NEG_1;
  297. result=0x02000000;
  298. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  299. result|=BOCU1_TRAIL_TO_BYTE(m);
  300. result|=(BOCU1_START_NEG_2+diff)<<8;
  301. } else if(diff>=BOCU1_REACH_NEG_3) {
  302. /* three bytes */
  303. diff-=BOCU1_REACH_NEG_2;
  304. result=0x03000000;
  305. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  306. result|=BOCU1_TRAIL_TO_BYTE(m);
  307. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  308. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  309. result|=(BOCU1_START_NEG_3+diff)<<16;
  310. } else {
  311. /* four bytes */
  312. diff-=BOCU1_REACH_NEG_3;
  313. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  314. result=BOCU1_TRAIL_TO_BYTE(m);
  315. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  316. result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
  317. /*
  318. * We know that NEGDIVMOD would deliver
  319. * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
  320. * Avoid division and modulo for performance.
  321. */
  322. m=diff+BOCU1_TRAIL_COUNT;
  323. result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
  324. result|=BOCU1_MIN<<24;
  325. }
  326. }
  327. return result;
  328. }
  329. static void U_CALLCONV
  330. _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  331. UErrorCode *pErrorCode) {
  332. UConverter *cnv;
  333. const char16_t *source, *sourceLimit;
  334. uint8_t *target;
  335. int32_t targetCapacity;
  336. int32_t *offsets;
  337. int32_t prev, c, diff;
  338. int32_t sourceIndex, nextSourceIndex;
  339. /* set up the local pointers */
  340. cnv=pArgs->converter;
  341. source=pArgs->source;
  342. sourceLimit=pArgs->sourceLimit;
  343. target=(uint8_t *)pArgs->target;
  344. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  345. offsets=pArgs->offsets;
  346. /* get the converter state from UConverter */
  347. c=cnv->fromUChar32;
  348. prev=(int32_t)cnv->fromUnicodeStatus;
  349. if(prev==0) {
  350. prev=BOCU1_ASCII_PREV;
  351. }
  352. /* sourceIndex=-1 if the current character began in the previous buffer */
  353. sourceIndex= c==0 ? 0 : -1;
  354. nextSourceIndex=0;
  355. /* conversion loop */
  356. if(c!=0 && targetCapacity>0) {
  357. goto getTrail;
  358. }
  359. fastSingle:
  360. /* fast loop for single-byte differences */
  361. /* use only one loop counter variable, targetCapacity, not also source */
  362. diff=(int32_t)(sourceLimit-source);
  363. if(targetCapacity>diff) {
  364. targetCapacity=diff;
  365. }
  366. while(targetCapacity>0 && (c=*source)<0x3000) {
  367. if(c<=0x20) {
  368. if(c!=0x20) {
  369. prev=BOCU1_ASCII_PREV;
  370. }
  371. *target++=(uint8_t)c;
  372. *offsets++=nextSourceIndex++;
  373. ++source;
  374. --targetCapacity;
  375. } else {
  376. diff=c-prev;
  377. if(DIFF_IS_SINGLE(diff)) {
  378. prev=BOCU1_SIMPLE_PREV(c);
  379. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  380. *offsets++=nextSourceIndex++;
  381. ++source;
  382. --targetCapacity;
  383. } else {
  384. break;
  385. }
  386. }
  387. }
  388. /* restore real values */
  389. targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
  390. sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
  391. /* regular loop for all cases */
  392. while(source<sourceLimit) {
  393. if(targetCapacity>0) {
  394. c=*source++;
  395. ++nextSourceIndex;
  396. if(c<=0x20) {
  397. /*
  398. * ISO C0 control & space:
  399. * Encode directly for MIME compatibility,
  400. * and reset state except for space, to not disrupt compression.
  401. */
  402. if(c!=0x20) {
  403. prev=BOCU1_ASCII_PREV;
  404. }
  405. *target++=(uint8_t)c;
  406. *offsets++=sourceIndex;
  407. --targetCapacity;
  408. sourceIndex=nextSourceIndex;
  409. continue;
  410. }
  411. if(U16_IS_LEAD(c)) {
  412. getTrail:
  413. if(source<sourceLimit) {
  414. /* test the following code unit */
  415. char16_t trail=*source;
  416. if(U16_IS_TRAIL(trail)) {
  417. ++source;
  418. ++nextSourceIndex;
  419. c=U16_GET_SUPPLEMENTARY(c, trail);
  420. }
  421. } else {
  422. /* no more input */
  423. c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
  424. break;
  425. }
  426. }
  427. /*
  428. * all other Unicode code points c==U+0021..U+10ffff
  429. * are encoded with the difference c-prev
  430. *
  431. * a new prev is computed from c,
  432. * placed in the middle of a 0x80-block (for most small scripts) or
  433. * in the middle of the Unihan and Hangul blocks
  434. * to statistically minimize the following difference
  435. */
  436. diff=c-prev;
  437. prev=BOCU1_PREV(c);
  438. if(DIFF_IS_SINGLE(diff)) {
  439. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  440. *offsets++=sourceIndex;
  441. --targetCapacity;
  442. sourceIndex=nextSourceIndex;
  443. if(c<0x3000) {
  444. goto fastSingle;
  445. }
  446. } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
  447. /* optimize 2-byte case */
  448. int32_t m;
  449. if(diff>=0) {
  450. diff-=BOCU1_REACH_POS_1+1;
  451. m=diff%BOCU1_TRAIL_COUNT;
  452. diff/=BOCU1_TRAIL_COUNT;
  453. diff+=BOCU1_START_POS_2;
  454. } else {
  455. diff-=BOCU1_REACH_NEG_1;
  456. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  457. diff+=BOCU1_START_NEG_2;
  458. }
  459. *target++=(uint8_t)diff;
  460. *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
  461. *offsets++=sourceIndex;
  462. *offsets++=sourceIndex;
  463. targetCapacity-=2;
  464. sourceIndex=nextSourceIndex;
  465. } else {
  466. int32_t length; /* will be 2..4 */
  467. diff=packDiff(diff);
  468. length=BOCU1_LENGTH_FROM_PACKED(diff);
  469. /* write the output character bytes from diff and length */
  470. /* from the first if in the loop we know that targetCapacity>0 */
  471. if(length<=targetCapacity) {
  472. switch(length) {
  473. /* each branch falls through to the next one */
  474. case 4:
  475. *target++=(uint8_t)(diff>>24);
  476. *offsets++=sourceIndex;
  477. U_FALLTHROUGH;
  478. case 3:
  479. *target++=(uint8_t)(diff>>16);
  480. *offsets++=sourceIndex;
  481. U_FALLTHROUGH;
  482. case 2:
  483. *target++=(uint8_t)(diff>>8);
  484. *offsets++=sourceIndex;
  485. /* case 1: handled above */
  486. *target++=(uint8_t)diff;
  487. *offsets++=sourceIndex;
  488. U_FALLTHROUGH;
  489. default:
  490. /* will never occur */
  491. break;
  492. }
  493. targetCapacity-=length;
  494. sourceIndex=nextSourceIndex;
  495. } else {
  496. uint8_t *charErrorBuffer;
  497. /*
  498. * We actually do this backwards here:
  499. * In order to save an intermediate variable, we output
  500. * first to the overflow buffer what does not fit into the
  501. * regular target.
  502. */
  503. /* we know that 1<=targetCapacity<length<=4 */
  504. length-=targetCapacity;
  505. charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
  506. switch(length) {
  507. /* each branch falls through to the next one */
  508. case 3:
  509. *charErrorBuffer++=(uint8_t)(diff>>16);
  510. U_FALLTHROUGH;
  511. case 2:
  512. *charErrorBuffer++=(uint8_t)(diff>>8);
  513. U_FALLTHROUGH;
  514. case 1:
  515. *charErrorBuffer=(uint8_t)diff;
  516. U_FALLTHROUGH;
  517. default:
  518. /* will never occur */
  519. break;
  520. }
  521. cnv->charErrorBufferLength=(int8_t)length;
  522. /* now output what fits into the regular target */
  523. diff>>=8*length; /* length was reduced by targetCapacity */
  524. switch(targetCapacity) {
  525. /* each branch falls through to the next one */
  526. case 3:
  527. *target++=(uint8_t)(diff>>16);
  528. *offsets++=sourceIndex;
  529. U_FALLTHROUGH;
  530. case 2:
  531. *target++=(uint8_t)(diff>>8);
  532. *offsets++=sourceIndex;
  533. U_FALLTHROUGH;
  534. case 1:
  535. *target++=(uint8_t)diff;
  536. *offsets++=sourceIndex;
  537. U_FALLTHROUGH;
  538. default:
  539. /* will never occur */
  540. break;
  541. }
  542. /* target overflow */
  543. targetCapacity=0;
  544. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  545. break;
  546. }
  547. }
  548. } else {
  549. /* target is full */
  550. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  551. break;
  552. }
  553. }
  554. /* set the converter state back into UConverter */
  555. cnv->fromUChar32= c<0 ? -c : 0;
  556. cnv->fromUnicodeStatus=(uint32_t)prev;
  557. /* write back the updated pointers */
  558. pArgs->source=source;
  559. pArgs->target=(char *)target;
  560. pArgs->offsets=offsets;
  561. }
  562. /*
  563. * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
  564. * If a change is made in the original function, then either
  565. * change this function the same way or
  566. * re-copy the original function and remove the variables
  567. * offsets, sourceIndex, and nextSourceIndex.
  568. */
  569. static void U_CALLCONV
  570. _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
  571. UErrorCode *pErrorCode) {
  572. UConverter *cnv;
  573. const char16_t *source, *sourceLimit;
  574. uint8_t *target;
  575. int32_t targetCapacity;
  576. int32_t prev, c, diff;
  577. /* set up the local pointers */
  578. cnv=pArgs->converter;
  579. source=pArgs->source;
  580. sourceLimit=pArgs->sourceLimit;
  581. target=(uint8_t *)pArgs->target;
  582. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  583. /* get the converter state from UConverter */
  584. c=cnv->fromUChar32;
  585. prev=(int32_t)cnv->fromUnicodeStatus;
  586. if(prev==0) {
  587. prev=BOCU1_ASCII_PREV;
  588. }
  589. /* conversion loop */
  590. if(c!=0 && targetCapacity>0) {
  591. goto getTrail;
  592. }
  593. fastSingle:
  594. /* fast loop for single-byte differences */
  595. /* use only one loop counter variable, targetCapacity, not also source */
  596. diff=(int32_t)(sourceLimit-source);
  597. if(targetCapacity>diff) {
  598. targetCapacity=diff;
  599. }
  600. while(targetCapacity>0 && (c=*source)<0x3000) {
  601. if(c<=0x20) {
  602. if(c!=0x20) {
  603. prev=BOCU1_ASCII_PREV;
  604. }
  605. *target++=(uint8_t)c;
  606. } else {
  607. diff=c-prev;
  608. if(DIFF_IS_SINGLE(diff)) {
  609. prev=BOCU1_SIMPLE_PREV(c);
  610. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  611. } else {
  612. break;
  613. }
  614. }
  615. ++source;
  616. --targetCapacity;
  617. }
  618. /* restore real values */
  619. targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
  620. /* regular loop for all cases */
  621. while(source<sourceLimit) {
  622. if(targetCapacity>0) {
  623. c=*source++;
  624. if(c<=0x20) {
  625. /*
  626. * ISO C0 control & space:
  627. * Encode directly for MIME compatibility,
  628. * and reset state except for space, to not disrupt compression.
  629. */
  630. if(c!=0x20) {
  631. prev=BOCU1_ASCII_PREV;
  632. }
  633. *target++=(uint8_t)c;
  634. --targetCapacity;
  635. continue;
  636. }
  637. if(U16_IS_LEAD(c)) {
  638. getTrail:
  639. if(source<sourceLimit) {
  640. /* test the following code unit */
  641. char16_t trail=*source;
  642. if(U16_IS_TRAIL(trail)) {
  643. ++source;
  644. c=U16_GET_SUPPLEMENTARY(c, trail);
  645. }
  646. } else {
  647. /* no more input */
  648. c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
  649. break;
  650. }
  651. }
  652. /*
  653. * all other Unicode code points c==U+0021..U+10ffff
  654. * are encoded with the difference c-prev
  655. *
  656. * a new prev is computed from c,
  657. * placed in the middle of a 0x80-block (for most small scripts) or
  658. * in the middle of the Unihan and Hangul blocks
  659. * to statistically minimize the following difference
  660. */
  661. diff=c-prev;
  662. prev=BOCU1_PREV(c);
  663. if(DIFF_IS_SINGLE(diff)) {
  664. *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
  665. --targetCapacity;
  666. if(c<0x3000) {
  667. goto fastSingle;
  668. }
  669. } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
  670. /* optimize 2-byte case */
  671. int32_t m;
  672. if(diff>=0) {
  673. diff-=BOCU1_REACH_POS_1+1;
  674. m=diff%BOCU1_TRAIL_COUNT;
  675. diff/=BOCU1_TRAIL_COUNT;
  676. diff+=BOCU1_START_POS_2;
  677. } else {
  678. diff-=BOCU1_REACH_NEG_1;
  679. NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
  680. diff+=BOCU1_START_NEG_2;
  681. }
  682. *target++=(uint8_t)diff;
  683. *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
  684. targetCapacity-=2;
  685. } else {
  686. int32_t length; /* will be 2..4 */
  687. diff=packDiff(diff);
  688. length=BOCU1_LENGTH_FROM_PACKED(diff);
  689. /* write the output character bytes from diff and length */
  690. /* from the first if in the loop we know that targetCapacity>0 */
  691. if(length<=targetCapacity) {
  692. switch(length) {
  693. /* each branch falls through to the next one */
  694. case 4:
  695. *target++=(uint8_t)(diff>>24);
  696. U_FALLTHROUGH;
  697. case 3:
  698. *target++=(uint8_t)(diff>>16);
  699. /* case 2: handled above */
  700. *target++=(uint8_t)(diff>>8);
  701. /* case 1: handled above */
  702. *target++=(uint8_t)diff;
  703. U_FALLTHROUGH;
  704. default:
  705. /* will never occur */
  706. break;
  707. }
  708. targetCapacity-=length;
  709. } else {
  710. uint8_t *charErrorBuffer;
  711. /*
  712. * We actually do this backwards here:
  713. * In order to save an intermediate variable, we output
  714. * first to the overflow buffer what does not fit into the
  715. * regular target.
  716. */
  717. /* we know that 1<=targetCapacity<length<=4 */
  718. length-=targetCapacity;
  719. charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
  720. switch(length) {
  721. /* each branch falls through to the next one */
  722. case 3:
  723. *charErrorBuffer++=(uint8_t)(diff>>16);
  724. U_FALLTHROUGH;
  725. case 2:
  726. *charErrorBuffer++=(uint8_t)(diff>>8);
  727. U_FALLTHROUGH;
  728. case 1:
  729. *charErrorBuffer=(uint8_t)diff;
  730. U_FALLTHROUGH;
  731. default:
  732. /* will never occur */
  733. break;
  734. }
  735. cnv->charErrorBufferLength=(int8_t)length;
  736. /* now output what fits into the regular target */
  737. diff>>=8*length; /* length was reduced by targetCapacity */
  738. switch(targetCapacity) {
  739. /* each branch falls through to the next one */
  740. case 3:
  741. *target++=(uint8_t)(diff>>16);
  742. U_FALLTHROUGH;
  743. case 2:
  744. *target++=(uint8_t)(diff>>8);
  745. U_FALLTHROUGH;
  746. case 1:
  747. *target++=(uint8_t)diff;
  748. U_FALLTHROUGH;
  749. default:
  750. /* will never occur */
  751. break;
  752. }
  753. /* target overflow */
  754. targetCapacity=0;
  755. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  756. break;
  757. }
  758. }
  759. } else {
  760. /* target is full */
  761. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  762. break;
  763. }
  764. }
  765. /* set the converter state back into UConverter */
  766. cnv->fromUChar32= c<0 ? -c : 0;
  767. cnv->fromUnicodeStatus=(uint32_t)prev;
  768. /* write back the updated pointers */
  769. pArgs->source=source;
  770. pArgs->target=(char *)target;
  771. }
  772. /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
  773. /**
  774. * Function for BOCU-1 decoder; handles multi-byte lead bytes.
  775. *
  776. * @param b lead byte;
  777. * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
  778. * @return (diff<<2)|count
  779. */
  780. static inline int32_t
  781. decodeBocu1LeadByte(int32_t b) {
  782. int32_t diff, count;
  783. if(b>=BOCU1_START_NEG_2) {
  784. /* positive difference */
  785. if(b<BOCU1_START_POS_3) {
  786. /* two bytes */
  787. diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  788. count=1;
  789. } else if(b<BOCU1_START_POS_4) {
  790. /* three bytes */
  791. diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
  792. count=2;
  793. } else {
  794. /* four bytes */
  795. diff=BOCU1_REACH_POS_3+1;
  796. count=3;
  797. }
  798. } else {
  799. /* negative difference */
  800. if(b>=BOCU1_START_NEG_3) {
  801. /* two bytes */
  802. diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  803. count=1;
  804. } else if(b>BOCU1_MIN) {
  805. /* three bytes */
  806. diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
  807. count=2;
  808. } else {
  809. /* four bytes */
  810. diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
  811. count=3;
  812. }
  813. }
  814. /* return the state for decoding the trail byte(s) */
  815. return ((uint32_t)diff<<2)|count;
  816. }
  817. /**
  818. * Function for BOCU-1 decoder; handles multi-byte trail bytes.
  819. *
  820. * @param count number of remaining trail bytes including this one
  821. * @param b trail byte
  822. * @return new delta for diff including b - <0 indicates an error
  823. *
  824. * @see decodeBocu1
  825. */
  826. static inline int32_t
  827. decodeBocu1TrailByte(int32_t count, int32_t b) {
  828. if(b<=0x20) {
  829. /* skip some C0 controls and make the trail byte range contiguous */
  830. b=bocu1ByteToTrail[b];
  831. /* b<0 for an illegal trail byte value will result in return<0 below */
  832. #if BOCU1_MAX_TRAIL<0xff
  833. } else if(b>BOCU1_MAX_TRAIL) {
  834. return -99;
  835. #endif
  836. } else {
  837. b-=BOCU1_TRAIL_BYTE_OFFSET;
  838. }
  839. /* add trail byte into difference and decrement count */
  840. if(count==1) {
  841. return b;
  842. } else if(count==2) {
  843. return b*BOCU1_TRAIL_COUNT;
  844. } else /* count==3 */ {
  845. return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
  846. }
  847. }
  848. static void U_CALLCONV
  849. _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  850. UErrorCode *pErrorCode) {
  851. UConverter *cnv;
  852. const uint8_t *source, *sourceLimit;
  853. char16_t *target;
  854. const char16_t *targetLimit;
  855. int32_t *offsets;
  856. int32_t prev, count, diff, c;
  857. int8_t byteIndex;
  858. uint8_t *bytes;
  859. int32_t sourceIndex, nextSourceIndex;
  860. /* set up the local pointers */
  861. cnv=pArgs->converter;
  862. source=(const uint8_t *)pArgs->source;
  863. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  864. target=pArgs->target;
  865. targetLimit=pArgs->targetLimit;
  866. offsets=pArgs->offsets;
  867. /* get the converter state from UConverter */
  868. prev=(int32_t)cnv->toUnicodeStatus;
  869. if(prev==0) {
  870. prev=BOCU1_ASCII_PREV;
  871. }
  872. diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
  873. count=diff&3;
  874. diff>>=2;
  875. byteIndex=cnv->toULength;
  876. bytes=cnv->toUBytes;
  877. /* sourceIndex=-1 if the current character began in the previous buffer */
  878. sourceIndex=byteIndex==0 ? 0 : -1;
  879. nextSourceIndex=0;
  880. /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
  881. if(count>0 && byteIndex>0 && target<targetLimit) {
  882. goto getTrail;
  883. }
  884. fastSingle:
  885. /* fast loop for single-byte differences */
  886. /* use count as the only loop counter variable */
  887. diff=(int32_t)(sourceLimit-source);
  888. count=(int32_t)(pArgs->targetLimit-target);
  889. if(count>diff) {
  890. count=diff;
  891. }
  892. while(count>0) {
  893. if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
  894. c=prev+(c-BOCU1_MIDDLE);
  895. if(c<0x3000) {
  896. *target++=(char16_t)c;
  897. *offsets++=nextSourceIndex++;
  898. prev=BOCU1_SIMPLE_PREV(c);
  899. } else {
  900. break;
  901. }
  902. } else if(c<=0x20) {
  903. if(c!=0x20) {
  904. prev=BOCU1_ASCII_PREV;
  905. }
  906. *target++=(char16_t)c;
  907. *offsets++=nextSourceIndex++;
  908. } else {
  909. break;
  910. }
  911. ++source;
  912. --count;
  913. }
  914. sourceIndex=nextSourceIndex; /* wrong if offsets==nullptr but does not matter */
  915. /* decode a sequence of single and lead bytes */
  916. while(source<sourceLimit) {
  917. if(target>=targetLimit) {
  918. /* target is full */
  919. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  920. break;
  921. }
  922. ++nextSourceIndex;
  923. c=*source++;
  924. if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  925. /* Write a code point directly from a single-byte difference. */
  926. c=prev+(c-BOCU1_MIDDLE);
  927. if(c<0x3000) {
  928. *target++=(char16_t)c;
  929. *offsets++=sourceIndex;
  930. prev=BOCU1_SIMPLE_PREV(c);
  931. sourceIndex=nextSourceIndex;
  932. goto fastSingle;
  933. }
  934. } else if(c<=0x20) {
  935. /*
  936. * Direct-encoded C0 control code or space.
  937. * Reset prev for C0 control codes but not for space.
  938. */
  939. if(c!=0x20) {
  940. prev=BOCU1_ASCII_PREV;
  941. }
  942. *target++=(char16_t)c;
  943. *offsets++=sourceIndex;
  944. sourceIndex=nextSourceIndex;
  945. continue;
  946. } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  947. /* Optimize two-byte case. */
  948. if(c>=BOCU1_MIDDLE) {
  949. diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  950. } else {
  951. diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  952. }
  953. /* trail byte */
  954. ++nextSourceIndex;
  955. c=decodeBocu1TrailByte(1, *source++);
  956. if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  957. bytes[0]=source[-2];
  958. bytes[1]=source[-1];
  959. byteIndex=2;
  960. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  961. break;
  962. }
  963. } else if(c==BOCU1_RESET) {
  964. /* only reset the state, no code point */
  965. prev=BOCU1_ASCII_PREV;
  966. sourceIndex=nextSourceIndex;
  967. continue;
  968. } else {
  969. /*
  970. * For multi-byte difference lead bytes, set the decoder state
  971. * with the partial difference value from the lead byte and
  972. * with the number of trail bytes.
  973. */
  974. bytes[0]=(uint8_t)c;
  975. byteIndex=1;
  976. diff=decodeBocu1LeadByte(c);
  977. count=diff&3;
  978. diff>>=2;
  979. getTrail:
  980. for(;;) {
  981. if(source>=sourceLimit) {
  982. goto endloop;
  983. }
  984. ++nextSourceIndex;
  985. c=bytes[byteIndex++]=*source++;
  986. /* trail byte in any position */
  987. c=decodeBocu1TrailByte(count, c);
  988. if(c<0) {
  989. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  990. goto endloop;
  991. }
  992. diff+=c;
  993. if(--count==0) {
  994. /* final trail byte, deliver a code point */
  995. byteIndex=0;
  996. c=prev+diff;
  997. if((uint32_t)c>0x10ffff) {
  998. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  999. goto endloop;
  1000. }
  1001. break;
  1002. }
  1003. }
  1004. }
  1005. /* calculate the next prev and output c */
  1006. prev=BOCU1_PREV(c);
  1007. if(c<=0xffff) {
  1008. *target++=(char16_t)c;
  1009. *offsets++=sourceIndex;
  1010. } else {
  1011. /* output surrogate pair */
  1012. *target++=U16_LEAD(c);
  1013. if(target<targetLimit) {
  1014. *target++=U16_TRAIL(c);
  1015. *offsets++=sourceIndex;
  1016. *offsets++=sourceIndex;
  1017. } else {
  1018. /* target overflow */
  1019. *offsets++=sourceIndex;
  1020. cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1021. cnv->UCharErrorBufferLength=1;
  1022. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1023. break;
  1024. }
  1025. }
  1026. sourceIndex=nextSourceIndex;
  1027. }
  1028. endloop:
  1029. if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1030. /* set the converter state in UConverter to deal with the next character */
  1031. cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1032. cnv->mode=0;
  1033. } else {
  1034. /* set the converter state back into UConverter */
  1035. cnv->toUnicodeStatus=(uint32_t)prev;
  1036. cnv->mode=(int32_t)((uint32_t)diff<<2)|count;
  1037. }
  1038. cnv->toULength=byteIndex;
  1039. /* write back the updated pointers */
  1040. pArgs->source=(const char *)source;
  1041. pArgs->target=target;
  1042. pArgs->offsets=offsets;
  1043. return;
  1044. }
  1045. /*
  1046. * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
  1047. * If a change is made in the original function, then either
  1048. * change this function the same way or
  1049. * re-copy the original function and remove the variables
  1050. * offsets, sourceIndex, and nextSourceIndex.
  1051. */
  1052. static void U_CALLCONV
  1053. _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
  1054. UErrorCode *pErrorCode) {
  1055. UConverter *cnv;
  1056. const uint8_t *source, *sourceLimit;
  1057. char16_t *target;
  1058. const char16_t *targetLimit;
  1059. int32_t prev, count, diff, c;
  1060. int8_t byteIndex;
  1061. uint8_t *bytes;
  1062. /* set up the local pointers */
  1063. cnv=pArgs->converter;
  1064. source=(const uint8_t *)pArgs->source;
  1065. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1066. target=pArgs->target;
  1067. targetLimit=pArgs->targetLimit;
  1068. /* get the converter state from UConverter */
  1069. prev=(int32_t)cnv->toUnicodeStatus;
  1070. if(prev==0) {
  1071. prev=BOCU1_ASCII_PREV;
  1072. }
  1073. diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
  1074. count=diff&3;
  1075. diff>>=2;
  1076. byteIndex=cnv->toULength;
  1077. bytes=cnv->toUBytes;
  1078. /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
  1079. if(count>0 && byteIndex>0 && target<targetLimit) {
  1080. goto getTrail;
  1081. }
  1082. fastSingle:
  1083. /* fast loop for single-byte differences */
  1084. /* use count as the only loop counter variable */
  1085. diff=(int32_t)(sourceLimit-source);
  1086. count=(int32_t)(pArgs->targetLimit-target);
  1087. if(count>diff) {
  1088. count=diff;
  1089. }
  1090. while(count>0) {
  1091. if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
  1092. c=prev+(c-BOCU1_MIDDLE);
  1093. if(c<0x3000) {
  1094. *target++=(char16_t)c;
  1095. prev=BOCU1_SIMPLE_PREV(c);
  1096. } else {
  1097. break;
  1098. }
  1099. } else if(c<=0x20) {
  1100. if(c!=0x20) {
  1101. prev=BOCU1_ASCII_PREV;
  1102. }
  1103. *target++=(char16_t)c;
  1104. } else {
  1105. break;
  1106. }
  1107. ++source;
  1108. --count;
  1109. }
  1110. /* decode a sequence of single and lead bytes */
  1111. while(source<sourceLimit) {
  1112. if(target>=targetLimit) {
  1113. /* target is full */
  1114. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1115. break;
  1116. }
  1117. c=*source++;
  1118. if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
  1119. /* Write a code point directly from a single-byte difference. */
  1120. c=prev+(c-BOCU1_MIDDLE);
  1121. if(c<0x3000) {
  1122. *target++=(char16_t)c;
  1123. prev=BOCU1_SIMPLE_PREV(c);
  1124. goto fastSingle;
  1125. }
  1126. } else if(c<=0x20) {
  1127. /*
  1128. * Direct-encoded C0 control code or space.
  1129. * Reset prev for C0 control codes but not for space.
  1130. */
  1131. if(c!=0x20) {
  1132. prev=BOCU1_ASCII_PREV;
  1133. }
  1134. *target++=(char16_t)c;
  1135. continue;
  1136. } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
  1137. /* Optimize two-byte case. */
  1138. if(c>=BOCU1_MIDDLE) {
  1139. diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
  1140. } else {
  1141. diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
  1142. }
  1143. /* trail byte */
  1144. c=decodeBocu1TrailByte(1, *source++);
  1145. if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
  1146. bytes[0]=source[-2];
  1147. bytes[1]=source[-1];
  1148. byteIndex=2;
  1149. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1150. break;
  1151. }
  1152. } else if(c==BOCU1_RESET) {
  1153. /* only reset the state, no code point */
  1154. prev=BOCU1_ASCII_PREV;
  1155. continue;
  1156. } else {
  1157. /*
  1158. * For multi-byte difference lead bytes, set the decoder state
  1159. * with the partial difference value from the lead byte and
  1160. * with the number of trail bytes.
  1161. */
  1162. bytes[0]=(uint8_t)c;
  1163. byteIndex=1;
  1164. diff=decodeBocu1LeadByte(c);
  1165. count=diff&3;
  1166. diff>>=2;
  1167. getTrail:
  1168. for(;;) {
  1169. if(source>=sourceLimit) {
  1170. goto endloop;
  1171. }
  1172. c=bytes[byteIndex++]=*source++;
  1173. /* trail byte in any position */
  1174. c=decodeBocu1TrailByte(count, c);
  1175. if(c<0) {
  1176. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1177. goto endloop;
  1178. }
  1179. diff+=c;
  1180. if(--count==0) {
  1181. /* final trail byte, deliver a code point */
  1182. byteIndex=0;
  1183. c=prev+diff;
  1184. if((uint32_t)c>0x10ffff) {
  1185. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1186. goto endloop;
  1187. }
  1188. break;
  1189. }
  1190. }
  1191. }
  1192. /* calculate the next prev and output c */
  1193. prev=BOCU1_PREV(c);
  1194. if(c<=0xffff) {
  1195. *target++=(char16_t)c;
  1196. } else {
  1197. /* output surrogate pair */
  1198. *target++=U16_LEAD(c);
  1199. if(target<targetLimit) {
  1200. *target++=U16_TRAIL(c);
  1201. } else {
  1202. /* target overflow */
  1203. cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
  1204. cnv->UCharErrorBufferLength=1;
  1205. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1206. break;
  1207. }
  1208. }
  1209. }
  1210. endloop:
  1211. if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
  1212. /* set the converter state in UConverter to deal with the next character */
  1213. cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
  1214. cnv->mode=0;
  1215. } else {
  1216. /* set the converter state back into UConverter */
  1217. cnv->toUnicodeStatus=(uint32_t)prev;
  1218. cnv->mode=((uint32_t)diff<<2)|count;
  1219. }
  1220. cnv->toULength=byteIndex;
  1221. /* write back the updated pointers */
  1222. pArgs->source=(const char *)source;
  1223. pArgs->target=target;
  1224. return;
  1225. }
  1226. /* miscellaneous ------------------------------------------------------------ */
  1227. static const UConverterImpl _Bocu1Impl={
  1228. UCNV_BOCU1,
  1229. nullptr,
  1230. nullptr,
  1231. nullptr,
  1232. nullptr,
  1233. nullptr,
  1234. _Bocu1ToUnicode,
  1235. _Bocu1ToUnicodeWithOffsets,
  1236. _Bocu1FromUnicode,
  1237. _Bocu1FromUnicodeWithOffsets,
  1238. nullptr,
  1239. nullptr,
  1240. nullptr,
  1241. nullptr,
  1242. nullptr,
  1243. ucnv_getCompleteUnicodeSet,
  1244. nullptr,
  1245. nullptr
  1246. };
  1247. static const UConverterStaticData _Bocu1StaticData={
  1248. sizeof(UConverterStaticData),
  1249. "BOCU-1",
  1250. 1214, /* CCSID for BOCU-1 */
  1251. UCNV_IBM, UCNV_BOCU1,
  1252. 1, 4, /* one char16_t generates at least 1 byte and at most 4 bytes */
  1253. { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
  1254. false, false,
  1255. 0,
  1256. 0,
  1257. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1258. };
  1259. const UConverterSharedData _Bocu1Data=
  1260. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
  1261. #endif