ucnvscsu.cpp 74 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2000-2016, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: ucnvscsu.c
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2000nov18
  16. * created by: Markus W. Scherer
  17. *
  18. * This is an implementation of the Standard Compression Scheme for Unicode
  19. * as defined in https://www.unicode.org/reports/tr6/ .
  20. * Reserved commands and window settings are treated as illegal sequences and
  21. * will result in callback calls.
  22. */
  23. #include "unicode/utypes.h"
  24. #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  25. #include "unicode/ucnv.h"
  26. #include "unicode/ucnv_cb.h"
  27. #include "unicode/utf16.h"
  28. #include "ucnv_bld.h"
  29. #include "ucnv_cnv.h"
  30. #include "cmemory.h"
  31. /* SCSU definitions --------------------------------------------------------- */
  /*
   * SCSU tag (command) byte values.
   * The S* tags are the ones tested by the single-byte-mode state machine,
   * the U* tags are the ones tested by the Unicode-mode state machine.
   */
  enum {
      /* single-byte mode: quote one character from window pair 0..7 */
      SQ0 = 0x01,
      SQ7 = 0x08,

      /* single-byte mode: miscellaneous tags */
      SDX = 0x0B,   /* define a window as extended */
      Srs = 0x0C,   /* reserved */
      SQU = 0x0E,   /* quote a single Unicode character */
      SCU = 0x0F,   /* change to Unicode mode */

      /* single-byte mode: select window 0..7 */
      SC0 = 0x10,
      SC7 = 0x17,

      /* single-byte mode: define and select window 0..7 */
      SD0 = 0x18,
      SD7 = 0x1F,

      /* Unicode mode: select window 0..7 */
      UC0 = 0xE0,
      UC7 = 0xE7,

      /* Unicode mode: define and select window 0..7 */
      UD0 = 0xE8,
      UD7 = 0xEF,

      /* Unicode mode: miscellaneous tags */
      UQU = 0xF0,   /* quote a single Unicode character */
      UDX = 0xF1,   /* define a window as extended */
      Urs = 0xF2    /* reserved */
  };
  /* Thresholds used to interpret the one-byte argument of a "define window" tag. */
  enum {
      /*
       * Unicode code points from 3400 to E000 are not addressable by a
       * dynamic window, since no short-run alphabets are found in these
       * areas. Therefore gapOffset is added to all values from gapThreshold on.
       */
      gapThreshold = 0x68,
      gapOffset = 0xAC00,

      /* values from reservedStart up to (not including) fixedThreshold are reserved */
      reservedStart = 0xA8,

      /* values from fixedThreshold on index the table of predefined fixed offsets */
      fixedThreshold = 0xF9
  };
  /* Start offsets of the 8 static windows (constant; all are in the BMP). */
  static const uint32_t staticOffsets[8] = {
      0x0000,   /* window 0: ASCII for quoted tags */
      0x0080,   /* window 1: Latin-1 Supplement (for access to punctuation) */
      0x0100,   /* window 2: Latin Extended-A */
      0x0300,   /* window 3: Combining Diacritical Marks */
      0x2000,   /* window 4: General Punctuation */
      0x2080,   /* window 5: Currency Symbols */
      0x2100,   /* window 6: Letterlike Symbols and Number Forms */
      0x3000    /* window 7: CJK Symbols and Punctuation */
  };
  /* Start offsets that the 8 dynamic (sliding) windows are given on reset. */
  static const uint32_t initialDynamicOffsets[8] = {
      0x0080,   /* window 0: Latin-1 */
      0x00C0,   /* window 1: Latin Extended A */
      0x0400,   /* window 2: Cyrillic */
      0x0600,   /* window 3: Arabic */
      0x0900,   /* window 4: Devanagari */
      0x3040,   /* window 5: Hiragana */
      0x30A0,   /* window 6: Katakana */
      0xFF00    /* window 7: Fullwidth ASCII */
  };
  /*
   * Predefined window offsets, indexed by (define-window argument - fixedThreshold),
   * i.e. entry 0 belongs to argument byte 0xF9 and the last entry to 0xFF.
   */
  static const uint32_t fixedOffsets[] = {
      0x00C0,   /* 0xF9: Latin-1 Letters + half of Latin Extended A */
      0x0250,   /* 0xFA: IPA extensions */
      0x0370,   /* 0xFB: Greek */
      0x0530,   /* 0xFC: Armenian */
      0x3040,   /* 0xFD: Hiragana */
      0x30A0,   /* 0xFE: Katakana */
      0xFF60    /* 0xFF: Halfwidth Katakana */
  };
  /* States of the toUnicode state machine (stored in SCSUData.toUState). */
  enum {
      readCommand,      /* at a character/tag boundary */
      quotePairOne,     /* expecting the first byte of a quoted 16-bit unit */
      quotePairTwo,     /* expecting the second byte of a 16-bit unit */
      quoteOne,         /* expecting the single byte after an SQn tag */
      definePairOne,    /* expecting the first argument byte of a define-extended tag */
      definePairTwo,    /* expecting the second argument byte of a define-extended tag */
      defineOne         /* expecting the argument byte of a define-window tag */
  };
  107. typedef struct SCSUData {
  108. /* dynamic window offsets, initialize to default values from initialDynamicOffsets */
  109. uint32_t toUDynamicOffsets[8];
  110. uint32_t fromUDynamicOffsets[8];
  111. /* state machine state - toUnicode */
  112. UBool toUIsSingleByteMode;
  113. uint8_t toUState;
  114. int8_t toUQuoteWindow, toUDynamicWindow;
  115. uint8_t toUByteOne;
  116. uint8_t toUPadding[3];
  117. /* state machine state - fromUnicode */
  118. UBool fromUIsSingleByteMode;
  119. int8_t fromUDynamicWindow;
  120. /*
  121. * windowUse[] keeps track of the use of the dynamic windows:
  122. * At nextWindowUseIndex there is the least recently used window,
  123. * and the following windows (in a wrapping manner) are more and more
  124. * recently used.
  125. * At nextWindowUseIndex-1 there is the most recently used window.
  126. */
  127. uint8_t locale;
  128. int8_t nextWindowUseIndex;
  129. int8_t windowUse[8];
  130. } SCSUData;
  /*
   * Initial contents of SCSUData.windowUse[], copied in on reset together
   * with nextWindowUseIndex=0: one ordering for the generic case and one
   * for the Japanese ("ja") locale.
   */
  static const int8_t initialWindowUse[8] = {
      7, 0, 3, 2, 4, 5, 6, 1
  };
  static const int8_t initialWindowUse_ja[8] = {
      3, 2, 4, 1, 0, 7, 5, 6
  };
  /* Values for SCSUData.locale. */
  enum {
      lGeneric,   /* any locale other than Japanese */
      l_ja        /* locale "ja" or "ja_..." */
  };
  136. /* SCSU setup functions ----------------------------------------------------- */
  137. U_CDECL_BEGIN
  138. static void U_CALLCONV
  139. _SCSUReset(UConverter *cnv, UConverterResetChoice choice) {
  140. SCSUData *scsu=(SCSUData *)cnv->extraInfo;
  141. if(choice<=UCNV_RESET_TO_UNICODE) {
  142. /* reset toUnicode */
  143. uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32);
  144. scsu->toUIsSingleByteMode=true;
  145. scsu->toUState=readCommand;
  146. scsu->toUQuoteWindow=scsu->toUDynamicWindow=0;
  147. scsu->toUByteOne=0;
  148. cnv->toULength=0;
  149. }
  150. if(choice!=UCNV_RESET_TO_UNICODE) {
  151. /* reset fromUnicode */
  152. uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32);
  153. scsu->fromUIsSingleByteMode=true;
  154. scsu->fromUDynamicWindow=0;
  155. scsu->nextWindowUseIndex=0;
  156. switch(scsu->locale) {
  157. case l_ja:
  158. uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8);
  159. break;
  160. default:
  161. uprv_memcpy(scsu->windowUse, initialWindowUse, 8);
  162. break;
  163. }
  164. cnv->fromUChar32=0;
  165. }
  166. }
  167. static void U_CALLCONV
  168. _SCSUOpen(UConverter *cnv,
  169. UConverterLoadArgs *pArgs,
  170. UErrorCode *pErrorCode) {
  171. const char *locale=pArgs->locale;
  172. if(pArgs->onlyTestIsLoadable) {
  173. return;
  174. }
  175. cnv->extraInfo=uprv_malloc(sizeof(SCSUData));
  176. if(cnv->extraInfo!=nullptr) {
  177. if(locale!=nullptr && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) {
  178. ((SCSUData *)cnv->extraInfo)->locale=l_ja;
  179. } else {
  180. ((SCSUData *)cnv->extraInfo)->locale=lGeneric;
  181. }
  182. _SCSUReset(cnv, UCNV_RESET_BOTH);
  183. } else {
  184. *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
  185. }
  186. /* Set the substitution character U+fffd as a Unicode string. */
  187. cnv->subUChars[0]=0xfffd;
  188. cnv->subCharLen=-1;
  189. }
  190. static void U_CALLCONV
  191. _SCSUClose(UConverter *cnv) {
  192. if(cnv->extraInfo!=nullptr) {
  193. if(!cnv->isExtraLocal) {
  194. uprv_free(cnv->extraInfo);
  195. }
  196. cnv->extraInfo=nullptr;
  197. }
  198. }
  199. /* SCSU-to-Unicode conversion functions ------------------------------------- */
  200. static void U_CALLCONV
  201. _SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  202. UErrorCode *pErrorCode) {
  203. UConverter *cnv;
  204. SCSUData *scsu;
  205. const uint8_t *source, *sourceLimit;
  206. char16_t *target;
  207. const char16_t *targetLimit;
  208. int32_t *offsets;
  209. UBool isSingleByteMode;
  210. uint8_t state, byteOne;
  211. int8_t quoteWindow, dynamicWindow;
  212. int32_t sourceIndex, nextSourceIndex;
  213. uint8_t b;
  214. /* set up the local pointers */
  215. cnv=pArgs->converter;
  216. scsu=(SCSUData *)cnv->extraInfo;
  217. source=(const uint8_t *)pArgs->source;
  218. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  219. target=pArgs->target;
  220. targetLimit=pArgs->targetLimit;
  221. offsets=pArgs->offsets;
  222. /* get the state machine state */
  223. isSingleByteMode=scsu->toUIsSingleByteMode;
  224. state=scsu->toUState;
  225. quoteWindow=scsu->toUQuoteWindow;
  226. dynamicWindow=scsu->toUDynamicWindow;
  227. byteOne=scsu->toUByteOne;
  228. /* sourceIndex=-1 if the current character began in the previous buffer */
  229. sourceIndex=state==readCommand ? 0 : -1;
  230. nextSourceIndex=0;
  231. /*
  232. * conversion "loop"
  233. *
  234. * For performance, this is not a normal C loop.
  235. * Instead, there are two code blocks for the two SCSU modes.
  236. * The function branches to either one, and a change of the mode is done with a goto to
  237. * the other branch.
  238. *
  239. * Each branch has two conventional loops:
  240. * - a fast-path loop for the most common codes in the mode
  241. * - a loop for all other codes in the mode
  242. * When the fast-path runs into a code that it cannot handle, its loop ends and it
  243. * runs into the following loop to handle the other codes.
  244. * The end of the input or output buffer is also handled by the slower loop.
  245. * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
  246. *
  247. * The callback handling is done by returning with an error code.
  248. * The conversion framework actually calls the callback function.
  249. */
  250. if(isSingleByteMode) {
  251. /* fast path for single-byte mode */
  252. if(state==readCommand) {
  253. fastSingle:
  254. while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
  255. ++source;
  256. ++nextSourceIndex;
  257. if(b<=0x7f) {
  258. /* write US-ASCII graphic character or DEL */
  259. *target++=(char16_t)b;
  260. if(offsets!=nullptr) {
  261. *offsets++=sourceIndex;
  262. }
  263. } else {
  264. /* write from dynamic window */
  265. uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
  266. if(c<=0xffff) {
  267. *target++=(char16_t)c;
  268. if(offsets!=nullptr) {
  269. *offsets++=sourceIndex;
  270. }
  271. } else {
  272. /* output surrogate pair */
  273. *target++=(char16_t)(0xd7c0+(c>>10));
  274. if(target<targetLimit) {
  275. *target++=(char16_t)(0xdc00|(c&0x3ff));
  276. if(offsets!=nullptr) {
  277. *offsets++=sourceIndex;
  278. *offsets++=sourceIndex;
  279. }
  280. } else {
  281. /* target overflow */
  282. if(offsets!=nullptr) {
  283. *offsets++=sourceIndex;
  284. }
  285. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  286. cnv->UCharErrorBufferLength=1;
  287. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  288. goto endloop;
  289. }
  290. }
  291. }
  292. sourceIndex=nextSourceIndex;
  293. }
  294. }
  295. /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
  296. singleByteMode:
  297. while(source<sourceLimit) {
  298. if(target>=targetLimit) {
  299. /* target is full */
  300. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  301. break;
  302. }
  303. b=*source++;
  304. ++nextSourceIndex;
  305. switch(state) {
  306. case readCommand:
  307. /* redundant conditions are commented out */
  308. /* here: b<0x20 because otherwise we would be in fastSingle */
  309. if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  310. /* CR/LF/TAB/NUL */
  311. *target++=(char16_t)b;
  312. if(offsets!=nullptr) {
  313. *offsets++=sourceIndex;
  314. }
  315. sourceIndex=nextSourceIndex;
  316. goto fastSingle;
  317. } else if(SC0<=b) {
  318. if(b<=SC7) {
  319. dynamicWindow=(int8_t)(b-SC0);
  320. sourceIndex=nextSourceIndex;
  321. goto fastSingle;
  322. } else /* if(SD0<=b && b<=SD7) */ {
  323. dynamicWindow=(int8_t)(b-SD0);
  324. state=defineOne;
  325. }
  326. } else if(/* SQ0<=b && */ b<=SQ7) {
  327. quoteWindow=(int8_t)(b-SQ0);
  328. state=quoteOne;
  329. } else if(b==SDX) {
  330. state=definePairOne;
  331. } else if(b==SQU) {
  332. state=quotePairOne;
  333. } else if(b==SCU) {
  334. sourceIndex=nextSourceIndex;
  335. isSingleByteMode=false;
  336. goto fastUnicode;
  337. } else /* Srs */ {
  338. /* callback(illegal) */
  339. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  340. cnv->toUBytes[0]=b;
  341. cnv->toULength=1;
  342. goto endloop;
  343. }
  344. /* store the first byte of a multibyte sequence in toUBytes[] */
  345. cnv->toUBytes[0]=b;
  346. cnv->toULength=1;
  347. break;
  348. case quotePairOne:
  349. byteOne=b;
  350. cnv->toUBytes[1]=b;
  351. cnv->toULength=2;
  352. state=quotePairTwo;
  353. break;
  354. case quotePairTwo:
  355. *target++=(char16_t)((byteOne<<8)|b);
  356. if(offsets!=nullptr) {
  357. *offsets++=sourceIndex;
  358. }
  359. sourceIndex=nextSourceIndex;
  360. state=readCommand;
  361. goto fastSingle;
  362. case quoteOne:
  363. if(b<0x80) {
  364. /* all static offsets are in the BMP */
  365. *target++=(char16_t)(staticOffsets[quoteWindow]+b);
  366. if(offsets!=nullptr) {
  367. *offsets++=sourceIndex;
  368. }
  369. } else {
  370. /* write from dynamic window */
  371. uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
  372. if(c<=0xffff) {
  373. *target++=(char16_t)c;
  374. if(offsets!=nullptr) {
  375. *offsets++=sourceIndex;
  376. }
  377. } else {
  378. /* output surrogate pair */
  379. *target++=(char16_t)(0xd7c0+(c>>10));
  380. if(target<targetLimit) {
  381. *target++=(char16_t)(0xdc00|(c&0x3ff));
  382. if(offsets!=nullptr) {
  383. *offsets++=sourceIndex;
  384. *offsets++=sourceIndex;
  385. }
  386. } else {
  387. /* target overflow */
  388. if(offsets!=nullptr) {
  389. *offsets++=sourceIndex;
  390. }
  391. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  392. cnv->UCharErrorBufferLength=1;
  393. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  394. goto endloop;
  395. }
  396. }
  397. }
  398. sourceIndex=nextSourceIndex;
  399. state=readCommand;
  400. goto fastSingle;
  401. case definePairOne:
  402. dynamicWindow=(int8_t)((b>>5)&7);
  403. byteOne=(uint8_t)(b&0x1f);
  404. cnv->toUBytes[1]=b;
  405. cnv->toULength=2;
  406. state=definePairTwo;
  407. break;
  408. case definePairTwo:
  409. scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
  410. sourceIndex=nextSourceIndex;
  411. state=readCommand;
  412. goto fastSingle;
  413. case defineOne:
  414. if(b==0) {
  415. /* callback(illegal): Reserved window offset value 0 */
  416. cnv->toUBytes[1]=b;
  417. cnv->toULength=2;
  418. goto endloop;
  419. } else if(b<gapThreshold) {
  420. scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
  421. } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
  422. scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
  423. } else if(b>=fixedThreshold) {
  424. scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
  425. } else {
  426. /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
  427. cnv->toUBytes[1]=b;
  428. cnv->toULength=2;
  429. goto endloop;
  430. }
  431. sourceIndex=nextSourceIndex;
  432. state=readCommand;
  433. goto fastSingle;
  434. }
  435. }
  436. } else {
  437. /* fast path for Unicode mode */
  438. if(state==readCommand) {
  439. fastUnicode:
  440. while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
  441. *target++=(char16_t)((b<<8)|source[1]);
  442. if(offsets!=nullptr) {
  443. *offsets++=sourceIndex;
  444. }
  445. sourceIndex=nextSourceIndex;
  446. nextSourceIndex+=2;
  447. source+=2;
  448. }
  449. }
  450. /* normal state machine for Unicode mode */
  451. /* unicodeByteMode: */
  452. while(source<sourceLimit) {
  453. if(target>=targetLimit) {
  454. /* target is full */
  455. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  456. break;
  457. }
  458. b=*source++;
  459. ++nextSourceIndex;
  460. switch(state) {
  461. case readCommand:
  462. if((uint8_t)(b-UC0)>(Urs-UC0)) {
  463. byteOne=b;
  464. cnv->toUBytes[0]=b;
  465. cnv->toULength=1;
  466. state=quotePairTwo;
  467. } else if(/* UC0<=b && */ b<=UC7) {
  468. dynamicWindow=(int8_t)(b-UC0);
  469. sourceIndex=nextSourceIndex;
  470. isSingleByteMode=true;
  471. goto fastSingle;
  472. } else if(/* UD0<=b && */ b<=UD7) {
  473. dynamicWindow=(int8_t)(b-UD0);
  474. isSingleByteMode=true;
  475. cnv->toUBytes[0]=b;
  476. cnv->toULength=1;
  477. state=defineOne;
  478. goto singleByteMode;
  479. } else if(b==UDX) {
  480. isSingleByteMode=true;
  481. cnv->toUBytes[0]=b;
  482. cnv->toULength=1;
  483. state=definePairOne;
  484. goto singleByteMode;
  485. } else if(b==UQU) {
  486. cnv->toUBytes[0]=b;
  487. cnv->toULength=1;
  488. state=quotePairOne;
  489. } else /* Urs */ {
  490. /* callback(illegal) */
  491. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  492. cnv->toUBytes[0]=b;
  493. cnv->toULength=1;
  494. goto endloop;
  495. }
  496. break;
  497. case quotePairOne:
  498. byteOne=b;
  499. cnv->toUBytes[1]=b;
  500. cnv->toULength=2;
  501. state=quotePairTwo;
  502. break;
  503. case quotePairTwo:
  504. *target++=(char16_t)((byteOne<<8)|b);
  505. if(offsets!=nullptr) {
  506. *offsets++=sourceIndex;
  507. }
  508. sourceIndex=nextSourceIndex;
  509. state=readCommand;
  510. goto fastUnicode;
  511. }
  512. }
  513. }
  514. endloop:
  515. /* set the converter state back into UConverter */
  516. if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
  517. /* reset to deal with the next character */
  518. state=readCommand;
  519. } else if(state==readCommand) {
  520. /* not in a multi-byte sequence, reset toULength */
  521. cnv->toULength=0;
  522. }
  523. scsu->toUIsSingleByteMode=isSingleByteMode;
  524. scsu->toUState=state;
  525. scsu->toUQuoteWindow=quoteWindow;
  526. scsu->toUDynamicWindow=dynamicWindow;
  527. scsu->toUByteOne=byteOne;
  528. /* write back the updated pointers */
  529. pArgs->source=(const char *)source;
  530. pArgs->target=target;
  531. pArgs->offsets=offsets;
  532. return;
  533. }
  534. /*
  535. * Identical to _SCSUToUnicodeWithOffsets but without offset handling.
  536. * If a change is made in the original function, then either
  537. * change this function the same way or
  538. * re-copy the original function and remove the variables
  539. * offsets, sourceIndex, and nextSourceIndex.
  540. */
  541. static void U_CALLCONV
  542. _SCSUToUnicode(UConverterToUnicodeArgs *pArgs,
  543. UErrorCode *pErrorCode) {
  544. UConverter *cnv;
  545. SCSUData *scsu;
  546. const uint8_t *source, *sourceLimit;
  547. char16_t *target;
  548. const char16_t *targetLimit;
  549. UBool isSingleByteMode;
  550. uint8_t state, byteOne;
  551. int8_t quoteWindow, dynamicWindow;
  552. uint8_t b;
  553. /* set up the local pointers */
  554. cnv=pArgs->converter;
  555. scsu=(SCSUData *)cnv->extraInfo;
  556. source=(const uint8_t *)pArgs->source;
  557. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  558. target=pArgs->target;
  559. targetLimit=pArgs->targetLimit;
  560. /* get the state machine state */
  561. isSingleByteMode=scsu->toUIsSingleByteMode;
  562. state=scsu->toUState;
  563. quoteWindow=scsu->toUQuoteWindow;
  564. dynamicWindow=scsu->toUDynamicWindow;
  565. byteOne=scsu->toUByteOne;
  566. /*
  567. * conversion "loop"
  568. *
  569. * For performance, this is not a normal C loop.
  570. * Instead, there are two code blocks for the two SCSU modes.
  571. * The function branches to either one, and a change of the mode is done with a goto to
  572. * the other branch.
  573. *
  574. * Each branch has two conventional loops:
  575. * - a fast-path loop for the most common codes in the mode
  576. * - a loop for all other codes in the mode
  577. * When the fast-path runs into a code that it cannot handle, its loop ends and it
  578. * runs into the following loop to handle the other codes.
  579. * The end of the input or output buffer is also handled by the slower loop.
  580. * The slow loop jumps (goto) to the fast-path loop again as soon as possible.
  581. *
  582. * The callback handling is done by returning with an error code.
  583. * The conversion framework actually calls the callback function.
  584. */
  585. if(isSingleByteMode) {
  586. /* fast path for single-byte mode */
  587. if(state==readCommand) {
  588. fastSingle:
  589. while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) {
  590. ++source;
  591. if(b<=0x7f) {
  592. /* write US-ASCII graphic character or DEL */
  593. *target++=(char16_t)b;
  594. } else {
  595. /* write from dynamic window */
  596. uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f);
  597. if(c<=0xffff) {
  598. *target++=(char16_t)c;
  599. } else {
  600. /* output surrogate pair */
  601. *target++=(char16_t)(0xd7c0+(c>>10));
  602. if(target<targetLimit) {
  603. *target++=(char16_t)(0xdc00|(c&0x3ff));
  604. } else {
  605. /* target overflow */
  606. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  607. cnv->UCharErrorBufferLength=1;
  608. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  609. goto endloop;
  610. }
  611. }
  612. }
  613. }
  614. }
  615. /* normal state machine for single-byte mode, minus handling for what fastSingle covers */
  616. singleByteMode:
  617. while(source<sourceLimit) {
  618. if(target>=targetLimit) {
  619. /* target is full */
  620. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  621. break;
  622. }
  623. b=*source++;
  624. switch(state) {
  625. case readCommand:
  626. /* redundant conditions are commented out */
  627. /* here: b<0x20 because otherwise we would be in fastSingle */
  628. if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  629. /* CR/LF/TAB/NUL */
  630. *target++=(char16_t)b;
  631. goto fastSingle;
  632. } else if(SC0<=b) {
  633. if(b<=SC7) {
  634. dynamicWindow=(int8_t)(b-SC0);
  635. goto fastSingle;
  636. } else /* if(SD0<=b && b<=SD7) */ {
  637. dynamicWindow=(int8_t)(b-SD0);
  638. state=defineOne;
  639. }
  640. } else if(/* SQ0<=b && */ b<=SQ7) {
  641. quoteWindow=(int8_t)(b-SQ0);
  642. state=quoteOne;
  643. } else if(b==SDX) {
  644. state=definePairOne;
  645. } else if(b==SQU) {
  646. state=quotePairOne;
  647. } else if(b==SCU) {
  648. isSingleByteMode=false;
  649. goto fastUnicode;
  650. } else /* Srs */ {
  651. /* callback(illegal) */
  652. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  653. cnv->toUBytes[0]=b;
  654. cnv->toULength=1;
  655. goto endloop;
  656. }
  657. /* store the first byte of a multibyte sequence in toUBytes[] */
  658. cnv->toUBytes[0]=b;
  659. cnv->toULength=1;
  660. break;
  661. case quotePairOne:
  662. byteOne=b;
  663. cnv->toUBytes[1]=b;
  664. cnv->toULength=2;
  665. state=quotePairTwo;
  666. break;
  667. case quotePairTwo:
  668. *target++=(char16_t)((byteOne<<8)|b);
  669. state=readCommand;
  670. goto fastSingle;
  671. case quoteOne:
  672. if(b<0x80) {
  673. /* all static offsets are in the BMP */
  674. *target++=(char16_t)(staticOffsets[quoteWindow]+b);
  675. } else {
  676. /* write from dynamic window */
  677. uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f);
  678. if(c<=0xffff) {
  679. *target++=(char16_t)c;
  680. } else {
  681. /* output surrogate pair */
  682. *target++=(char16_t)(0xd7c0+(c>>10));
  683. if(target<targetLimit) {
  684. *target++=(char16_t)(0xdc00|(c&0x3ff));
  685. } else {
  686. /* target overflow */
  687. cnv->UCharErrorBuffer[0]=(char16_t)(0xdc00|(c&0x3ff));
  688. cnv->UCharErrorBufferLength=1;
  689. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  690. goto endloop;
  691. }
  692. }
  693. }
  694. state=readCommand;
  695. goto fastSingle;
  696. case definePairOne:
  697. dynamicWindow=(int8_t)((b>>5)&7);
  698. byteOne=(uint8_t)(b&0x1f);
  699. cnv->toUBytes[1]=b;
  700. cnv->toULength=2;
  701. state=definePairTwo;
  702. break;
  703. case definePairTwo:
  704. scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL);
  705. state=readCommand;
  706. goto fastSingle;
  707. case defineOne:
  708. if(b==0) {
  709. /* callback(illegal): Reserved window offset value 0 */
  710. cnv->toUBytes[1]=b;
  711. cnv->toULength=2;
  712. goto endloop;
  713. } else if(b<gapThreshold) {
  714. scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL;
  715. } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) {
  716. scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset;
  717. } else if(b>=fixedThreshold) {
  718. scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold];
  719. } else {
  720. /* callback(illegal): Reserved window offset value 0xa8..0xf8 */
  721. cnv->toUBytes[1]=b;
  722. cnv->toULength=2;
  723. goto endloop;
  724. }
  725. state=readCommand;
  726. goto fastSingle;
  727. }
  728. }
  729. } else {
  730. /* fast path for Unicode mode */
  731. if(state==readCommand) {
  732. fastUnicode:
  733. while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) {
  734. *target++=(char16_t)((b<<8)|source[1]);
  735. source+=2;
  736. }
  737. }
  738. /* normal state machine for Unicode mode */
  739. /* unicodeByteMode: */
  740. while(source<sourceLimit) {
  741. if(target>=targetLimit) {
  742. /* target is full */
  743. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  744. break;
  745. }
  746. b=*source++;
  747. switch(state) {
  748. case readCommand:
  749. if((uint8_t)(b-UC0)>(Urs-UC0)) {
  750. byteOne=b;
  751. cnv->toUBytes[0]=b;
  752. cnv->toULength=1;
  753. state=quotePairTwo;
  754. } else if(/* UC0<=b && */ b<=UC7) {
  755. dynamicWindow=(int8_t)(b-UC0);
  756. isSingleByteMode=true;
  757. goto fastSingle;
  758. } else if(/* UD0<=b && */ b<=UD7) {
  759. dynamicWindow=(int8_t)(b-UD0);
  760. isSingleByteMode=true;
  761. cnv->toUBytes[0]=b;
  762. cnv->toULength=1;
  763. state=defineOne;
  764. goto singleByteMode;
  765. } else if(b==UDX) {
  766. isSingleByteMode=true;
  767. cnv->toUBytes[0]=b;
  768. cnv->toULength=1;
  769. state=definePairOne;
  770. goto singleByteMode;
  771. } else if(b==UQU) {
  772. cnv->toUBytes[0]=b;
  773. cnv->toULength=1;
  774. state=quotePairOne;
  775. } else /* Urs */ {
  776. /* callback(illegal) */
  777. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  778. cnv->toUBytes[0]=b;
  779. cnv->toULength=1;
  780. goto endloop;
  781. }
  782. break;
  783. case quotePairOne:
  784. byteOne=b;
  785. cnv->toUBytes[1]=b;
  786. cnv->toULength=2;
  787. state=quotePairTwo;
  788. break;
  789. case quotePairTwo:
  790. *target++=(char16_t)((byteOne<<8)|b);
  791. state=readCommand;
  792. goto fastUnicode;
  793. }
  794. }
  795. }
  796. endloop:
  797. /* set the converter state back into UConverter */
  798. if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
  799. /* reset to deal with the next character */
  800. state=readCommand;
  801. } else if(state==readCommand) {
  802. /* not in a multi-byte sequence, reset toULength */
  803. cnv->toULength=0;
  804. }
  805. scsu->toUIsSingleByteMode=isSingleByteMode;
  806. scsu->toUState=state;
  807. scsu->toUQuoteWindow=quoteWindow;
  808. scsu->toUDynamicWindow=dynamicWindow;
  809. scsu->toUByteOne=byteOne;
  810. /* write back the updated pointers */
  811. pArgs->source=(const char *)source;
  812. pArgs->target=target;
  813. return;
  814. }
  815. U_CDECL_END
  816. /* SCSU-from-Unicode conversion functions ----------------------------------- */
  817. /*
  818. * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve
  819. * reasonable results. The lookahead is minimal.
  820. * Many cases are simple:
  821. * A character fits directly into the current mode, a dynamic or static window,
  822. * or is not compressible. These cases are tested first.
  823. * Real compression heuristics are applied to the rest, in code branches for
  824. * single/Unicode mode and BMP/supplementary code points.
  825. * The heuristics used here are extremely simple.
  826. */
  /*
   * Look up which of the eight windows covers a code point.
   *
   * offsets: the start offsets of eight windows, each spanning 0x80 code points
   * c:       the code point to locate
   *
   * Returns the index (0..7) of the first window whose range
   * [offsets[n], offsets[n]+0x7f] contains c, or -1 if no window does.
   */
  static int8_t
  getWindow(const uint32_t offsets[8], uint32_t c) {
      int8_t w;

      for(w=0; w<8; ++w) {
          /* unsigned wrap-around turns "c below the window start" into a huge distance */
          const uint32_t distance=c-offsets[w];
          if(distance<0x80) {
              return w;
          }
      }
      return -1;
  }
  838. /* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */
  839. static UBool
  840. isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) {
  841. return (UBool)(c<=offset+0x7f &&
  842. (c>=offset || (c<=0x7f &&
  843. (c>=0x20 || (1UL<<c)&0x2601))));
  844. /* binary 0010 0110 0000 0001,
  845. check for b==0xd || b==0xa || b==9 || b==0 */
  846. }
  847. /*
  848. * getNextDynamicWindow returns the next dynamic window to be redefined
  849. */
  850. static int8_t
  851. getNextDynamicWindow(SCSUData *scsu) {
  852. int8_t window=scsu->windowUse[scsu->nextWindowUseIndex];
  853. if(++scsu->nextWindowUseIndex==8) {
  854. scsu->nextWindowUseIndex=0;
  855. }
  856. return window;
  857. }
  858. /*
  859. * useDynamicWindow() adjusts
  860. * windowUse[] and nextWindowUseIndex for the algorithm to choose
  861. * the next dynamic window to be defined;
  862. * a subclass may override it and provide its own algorithm.
  863. */
  864. static void
  865. useDynamicWindow(SCSUData *scsu, int8_t window) {
  866. /*
  867. * move the existing window, which just became the most recently used one,
  868. * up in windowUse[] to nextWindowUseIndex-1
  869. */
  870. /* first, find the index of the window - backwards to favor the more recently used windows */
  871. int i, j;
  872. i=scsu->nextWindowUseIndex;
  873. do {
  874. if(--i<0) {
  875. i=7;
  876. }
  877. } while(scsu->windowUse[i]!=window);
  878. /* now copy each windowUse[i+1] to [i] */
  879. j=i+1;
  880. if(j==8) {
  881. j=0;
  882. }
  883. while(j!=scsu->nextWindowUseIndex) {
  884. scsu->windowUse[i]=scsu->windowUse[j];
  885. i=j;
  886. if(++j==8) { j=0; }
  887. }
  888. /* finally, set the window into the most recently used index */
  889. scsu->windowUse[i]=window;
  890. }
  891. /*
  892. * calculate the offset and the code for a dynamic window that contains the character
  893. * takes fixed offsets into account
  894. * the offset of the window is stored in the offset variable,
  895. * the code is returned
  896. *
  897. * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code
  898. */
  899. static int
  900. getDynamicOffset(uint32_t c, uint32_t *pOffset) {
  901. int i;
  902. for(i=0; i<7; ++i) {
  903. if((uint32_t)(c-fixedOffsets[i])<=0x7f) {
  904. *pOffset=fixedOffsets[i];
  905. return 0xf9+i;
  906. }
  907. }
  908. if(c<0x80) {
  909. /* No dynamic window for US-ASCII. */
  910. return -1;
  911. } else if(c<0x3400 ||
  912. (uint32_t)(c-0x10000)<(0x14000-0x10000) ||
  913. (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000)
  914. ) {
  915. /* This character is in a code range for a "small", i.e., reasonably windowable, script. */
  916. *pOffset=c&0x7fffff80;
  917. return (int)(c>>7);
  918. } else if(0xe000<=c && c!=0xfeff && c<0xfff0) {
  919. /* For these characters we need to take the gapOffset into account. */
  920. *pOffset=c&0x7fffff80;
  921. return (int)((c-gapOffset)>>7);
  922. } else {
  923. return -1;
  924. }
  925. }
  926. U_CDECL_BEGIN
  927. /*
  928. * Idea for compression:
  929. * - save SCSUData and other state before really starting work
  930. * - at endloop, see if compression could be better with just unicode mode
  931. * - don't do this if a callback has been called
  932. * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning
  933. * - different buffer handling!
  934. *
  935. * Drawback or need for corrective handling:
  936. * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and
  937. * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible
  938. * not only for compression but also for HTML/XML documents with following charset/encoding announcers.
  939. *
  940. * How to achieve both?
  941. * - Only replace the result after an SDX or SCU?
  942. */
  /*
   * Convert UTF-16 code units from pArgs->source into SCSU bytes at pArgs->target.
   * When pArgs->offsets is not nullptr, one source index is stored there for
   * every byte written to the target.
   *
   * State read at entry and written back at endloop:
   *   scsu->fromUIsSingleByteMode, scsu->fromUDynamicWindow, cnv->fromUChar32.
   * scsu->fromUDynamicOffsets[] and the window-use bookkeeping are updated
   * whenever a window is selected or defined.
   *
   * Error reporting through *pErrorCode:
   *   U_BUFFER_OVERFLOW_ERROR  the target is full; bytes of the current
   *                            character that did not fit are stored in
   *                            cnv->charErrorBuffer
   *   U_ILLEGAL_CHAR_FOUND     unmatched lead or trail surrogate; the code unit
   *                            is left in cnv->fromUChar32
   *
   * On return, pArgs->source, pArgs->target and pArgs->offsets are advanced.
   */
  943. static void U_CALLCONV
  944. _SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  945. UErrorCode *pErrorCode) {
  946. UConverter *cnv;
  947. SCSUData *scsu;
  948. const char16_t *source, *sourceLimit;
  949. uint8_t *target;
  950. int32_t targetCapacity;
  951. int32_t *offsets;
  952. UBool isSingleByteMode;
  953. uint8_t dynamicWindow;
  954. uint32_t currentOffset;
  955. uint32_t c, delta;
  956. int32_t sourceIndex, nextSourceIndex;
  957. int32_t length;
  958. /* variables for compression heuristics */
  959. uint32_t offset;
  960. char16_t lead, trail;
  961. int code;
  962. int8_t window;
  963. /* get the converter and its SCSU-specific data */
  964. cnv=pArgs->converter;
  965. scsu=(SCSUData *)cnv->extraInfo;
  966. /* set up the local pointers */
  967. source=pArgs->source;
  968. sourceLimit=pArgs->sourceLimit;
  969. target=(uint8_t *)pArgs->target;
  970. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  971. offsets=pArgs->offsets;
  972. /* get the state machine state */
  973. isSingleByteMode=scsu->fromUIsSingleByteMode;
  974. dynamicWindow=scsu->fromUDynamicWindow;
  975. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  976. c=cnv->fromUChar32;
  977. /* sourceIndex=-1 if the current character began in the previous buffer */
  978. sourceIndex= c==0 ? 0 : -1;
  979. nextSourceIndex=0;
  980. /* similar conversion "loop" as in toUnicode */
  /* outputBytes jumps back here so that a change of isSingleByteMode selects the other branch */
  981. loop:
  982. if(isSingleByteMode) {
  /* a non-zero c at this point is handled as a pending lead surrogate: continue with its trail */
  983. if(c!=0 && targetCapacity>0) {
  984. goto getTrailSingle;
  985. }
  986. /* state machine for single-byte mode */
  987. /* singleByteMode: */
  988. while(source<sourceLimit) {
  989. if(targetCapacity<=0) {
  990. /* target is full */
  991. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  992. break;
  993. }
  994. c=*source++;
  995. ++nextSourceIndex;
  996. if((c-0x20)<=0x5f) {
  997. /* pass US-ASCII graphic character through */
  998. *target++=(uint8_t)c;
  999. if(offsets!=nullptr) {
  1000. *offsets++=sourceIndex;
  1001. }
  1002. --targetCapacity;
  1003. } else if(c<0x20) {
  1004. if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  1005. /* CR/LF/TAB/NUL */
  1006. *target++=(uint8_t)c;
  1007. if(offsets!=nullptr) {
  1008. *offsets++=sourceIndex;
  1009. }
  1010. --targetCapacity;
  1011. } else {
  1012. /* quote C0 control character */
  1013. c|=SQ0<<8;
  1014. length=2;
  1015. goto outputBytes;
  1016. }
  1017. } else if((delta=c-currentOffset)<=0x7f) {
  1018. /* use the current dynamic window */
  1019. *target++=(uint8_t)(delta|0x80);
  1020. if(offsets!=nullptr) {
  1021. *offsets++=sourceIndex;
  1022. }
  1023. --targetCapacity;
  1024. } else if(U16_IS_SURROGATE(c)) {
  1025. if(U16_IS_SURROGATE_LEAD(c)) {
  1026. getTrailSingle:
  1027. lead=(char16_t)c;
  1028. if(source<sourceLimit) {
  1029. /* test the following code unit */
  1030. trail=*source;
  1031. if(U16_IS_TRAIL(trail)) {
  1032. ++source;
  1033. ++nextSourceIndex;
  1034. c=U16_GET_SUPPLEMENTARY(c, trail);
  1035. /* convert this surrogate code point */
  1036. /* exit this condition tree */
  1037. } else {
  1038. /* this is an unmatched lead code unit (1st surrogate) */
  1039. /* callback(illegal) */
  1040. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1041. goto endloop;
  1042. }
  1043. } else {
  1044. /* no more input */
  /* c still holds the lead surrogate and is stored in cnv->fromUChar32 at endloop */
  1045. break;
  1046. }
  1047. } else {
  1048. /* this is an unmatched trail code unit (2nd surrogate) */
  1049. /* callback(illegal) */
  1050. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1051. goto endloop;
  1052. }
  1053. /* compress supplementary character U+10000..U+10ffff */
  1054. if((delta=c-currentOffset)<=0x7f) {
  1055. /* use the current dynamic window */
  1056. *target++=(uint8_t)(delta|0x80);
  1057. if(offsets!=nullptr) {
  1058. *offsets++=sourceIndex;
  1059. }
  1060. --targetCapacity;
  1061. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1062. /* there is a dynamic window that contains this character, change to it */
  1063. dynamicWindow=window;
  1064. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1065. useDynamicWindow(scsu, dynamicWindow);
  1066. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1067. length=2;
  1068. goto outputBytes;
  1069. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1070. /* might check if there are more characters in this window to come */
  1071. /* define an extended window with this character */
  /* getDynamicOffset() documents that 0x200 is subtracted to get the true SDX/UDX code */
  1072. code-=0x200;
  1073. dynamicWindow=getNextDynamicWindow(scsu);
  1074. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1075. useDynamicWindow(scsu, dynamicWindow);
  /* 4 bytes: SDX, then window number (bits 23..21) combined with the code (from bit 8 up), then the character byte */
  1076. c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1077. length=4;
  1078. goto outputBytes;
  1079. } else {
  1080. /* change to Unicode mode and output this (lead, trail) pair */
  1081. isSingleByteMode=false;
  /* SCU is written directly; outputBytes then handles the 4 surrogate bytes even if targetCapacity is now 0 */
  1082. *target++=(uint8_t)SCU;
  1083. if(offsets!=nullptr) {
  1084. *offsets++=sourceIndex;
  1085. }
  1086. --targetCapacity;
  1087. c=((uint32_t)lead<<16)|trail;
  1088. length=4;
  1089. goto outputBytes;
  1090. }
  1091. } else if(c<0xa0) {
  1092. /* quote C1 control character */
  1093. c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
  1094. length=2;
  1095. goto outputBytes;
  1096. } else if(c==0xfeff || c>=0xfff0) {
  1097. /* quote signature character=byte order mark and specials */
  1098. c|=SQU<<16;
  1099. length=3;
  1100. goto outputBytes;
  1101. } else {
  1102. /* compress all other BMP characters */
  1103. if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1104. /* there is a window defined that contains this character - switch to it or quote from it? */
  /* switch if this is the last code unit of the buffer or the next one also fits the window (or is direct-encoded) */
  1105. if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
  1106. /* change to dynamic window */
  1107. dynamicWindow=window;
  1108. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1109. useDynamicWindow(scsu, dynamicWindow);
  1110. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1111. length=2;
  1112. goto outputBytes;
  1113. } else {
  1114. /* quote from dynamic window */
  1115. c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
  1116. length=2;
  1117. goto outputBytes;
  1118. }
  1119. } else if((window=getWindow(staticOffsets, c))>=0) {
  1120. /* quote from static window */
  1121. c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
  1122. length=2;
  1123. goto outputBytes;
  1124. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1125. /* define a dynamic window with this character */
  1126. dynamicWindow=getNextDynamicWindow(scsu);
  1127. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1128. useDynamicWindow(scsu, dynamicWindow);
  1129. c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1130. length=3;
  1131. goto outputBytes;
  1132. } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
  1133. (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1134. ) {
  1135. /*
  1136. * this character is not compressible (a BMP ideograph or similar);
  1137. * switch to Unicode mode if this is the last character in the block
  1138. * or there is at least one more ideograph following immediately
  1139. */
  1140. isSingleByteMode=false;
  1141. c|=SCU<<16;
  1142. length=3;
  1143. goto outputBytes;
  1144. } else {
  1145. /* quote Unicode */
  1146. c|=SQU<<16;
  1147. length=3;
  1148. goto outputBytes;
  1149. }
  1150. }
  /* only the branches that wrote their byte directly (without goto outputBytes) arrive here */
  1151. /* normal end of conversion: prepare for a new character */
  1152. c=0;
  1153. sourceIndex=nextSourceIndex;
  1154. }
  1155. } else {
  /* a non-zero c at this point is handled as a pending lead surrogate: continue with its trail */
  1156. if(c!=0 && targetCapacity>0) {
  1157. goto getTrailUnicode;
  1158. }
  1159. /* state machine for Unicode mode */
  1160. /* unicodeByteMode: */
  1161. while(source<sourceLimit) {
  1162. if(targetCapacity<=0) {
  1163. /* target is full */
  1164. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1165. break;
  1166. }
  1167. c=*source++;
  1168. ++nextSourceIndex;
  1169. if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
  1170. /* not compressible, write character directly */
  1171. if(targetCapacity>=2) {
  1172. *target++=(uint8_t)(c>>8);
  1173. *target++=(uint8_t)c;
  1174. if(offsets!=nullptr) {
  1175. *offsets++=sourceIndex;
  1176. *offsets++=sourceIndex;
  1177. }
  1178. targetCapacity-=2;
  1179. } else {
  /* only one byte of room: let outputBytes split the character across target and overflow buffer */
  1180. length=2;
  1181. goto outputBytes;
  1182. }
  1183. } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
  1184. /* compress BMP character if the following one is not an uncompressible ideograph */
  1185. if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
  1186. if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
  1187. /* ASCII digit or letter */
  1188. isSingleByteMode=true;
  /* NOTE(review): the trailing "|c" is redundant with "|="; the result is the UCn byte followed by the ASCII byte */
  1189. c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
  1190. length=2;
  1191. goto outputBytes;
  1192. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1193. /* there is a dynamic window that contains this character, change to it */
  1194. isSingleByteMode=true;
  1195. dynamicWindow=window;
  1196. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1197. useDynamicWindow(scsu, dynamicWindow);
  1198. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1199. length=2;
  1200. goto outputBytes;
  1201. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1202. /* define a dynamic window with this character */
  1203. isSingleByteMode=true;
  1204. dynamicWindow=getNextDynamicWindow(scsu);
  1205. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1206. useDynamicWindow(scsu, dynamicWindow);
  1207. c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1208. length=3;
  1209. goto outputBytes;
  1210. }
  1211. }
  1212. /* don't know how to compress this character, just write it directly */
  1213. length=2;
  1214. goto outputBytes;
  1215. } else if(c<0xe000) {
  1216. /* c is a surrogate */
  1217. if(U16_IS_SURROGATE_LEAD(c)) {
  1218. getTrailUnicode:
  1219. lead=(char16_t)c;
  1220. if(source<sourceLimit) {
  1221. /* test the following code unit */
  1222. trail=*source;
  1223. if(U16_IS_TRAIL(trail)) {
  1224. ++source;
  1225. ++nextSourceIndex;
  1226. c=U16_GET_SUPPLEMENTARY(c, trail);
  1227. /* convert this surrogate code point */
  1228. /* exit this condition tree */
  1229. } else {
  1230. /* this is an unmatched lead code unit (1st surrogate) */
  1231. /* callback(illegal) */
  1232. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1233. goto endloop;
  1234. }
  1235. } else {
  1236. /* no more input */
  /* c still holds the lead surrogate and is stored in cnv->fromUChar32 at endloop */
  1237. break;
  1238. }
  1239. } else {
  1240. /* this is an unmatched trail code unit (2nd surrogate) */
  1241. /* callback(illegal) */
  1242. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1243. goto endloop;
  1244. }
  1245. /* compress supplementary character */
  1246. if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
  1247. !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1248. ) {
  1249. /*
  1250. * there is a dynamic window that contains this character and
  1251. * the following character is not uncompressible,
  1252. * change to the window
  1253. */
  1254. isSingleByteMode=true;
  1255. dynamicWindow=window;
  1256. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1257. useDynamicWindow(scsu, dynamicWindow);
  1258. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1259. length=2;
  1260. goto outputBytes;
  1261. } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
  1262. (code=getDynamicOffset(c, &offset))>=0
  1263. ) {
  1264. /* two supplementary characters in (probably) the same window - define an extended one */
  1265. isSingleByteMode=true;
  /* getDynamicOffset() documents that 0x200 is subtracted to get the true SDX/UDX code */
  1266. code-=0x200;
  1267. dynamicWindow=getNextDynamicWindow(scsu);
  1268. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1269. useDynamicWindow(scsu, dynamicWindow);
  1270. c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1271. length=4;
  1272. goto outputBytes;
  1273. } else {
  1274. /* don't know how to compress this character, just write it directly */
  1275. c=((uint32_t)lead<<16)|trail;
  1276. length=4;
  1277. goto outputBytes;
  1278. }
  1279. } else /* 0xe000<=c<0xf300 */ {
  1280. /* quote to avoid SCSU tags */
  1281. c|=UQU<<16;
  1282. length=3;
  1283. goto outputBytes;
  1284. }
  /* only the branch that wrote its two bytes directly (without goto outputBytes) arrives here */
  1285. /* normal end of conversion: prepare for a new character */
  1286. c=0;
  1287. sourceIndex=nextSourceIndex;
  1288. }
  1289. }
  1290. endloop:
  1291. /* set the converter state back into UConverter */
  1292. scsu->fromUIsSingleByteMode=isSingleByteMode;
  1293. scsu->fromUDynamicWindow=dynamicWindow;
  1294. cnv->fromUChar32=c;
  1295. /* write back the updated pointers */
  1296. pArgs->source=source;
  1297. pArgs->target=(char *)target;
  1298. pArgs->offsets=offsets;
  1299. return;
  /* shared output code, placed after the return and reached only via goto; c holds the bytes right-aligned, most significant byte first */
  1300. outputBytes:
  1301. /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
  1302. /* from the first if in the loop we know that targetCapacity>0 */
  1303. if(length<=targetCapacity) {
  1304. if(offsets==nullptr) {
  1305. switch(length) {
  1306. /* each branch falls through to the next one */
  1307. case 4:
  1308. *target++=(uint8_t)(c>>24);
  1309. U_FALLTHROUGH;
  1310. case 3:
  1311. *target++=(uint8_t)(c>>16);
  1312. U_FALLTHROUGH;
  1313. case 2:
  1314. *target++=(uint8_t)(c>>8);
  1315. U_FALLTHROUGH;
  1316. case 1:
  1317. *target++=(uint8_t)c;
  1318. U_FALLTHROUGH;
  1319. default:
  1320. /* will never occur */
  1321. break;
  1322. }
  1323. } else {
  1324. switch(length) {
  1325. /* each branch falls through to the next one */
  1326. case 4:
  1327. *target++=(uint8_t)(c>>24);
  1328. *offsets++=sourceIndex;
  1329. U_FALLTHROUGH;
  1330. case 3:
  1331. *target++=(uint8_t)(c>>16);
  1332. *offsets++=sourceIndex;
  1333. U_FALLTHROUGH;
  1334. case 2:
  1335. *target++=(uint8_t)(c>>8);
  1336. *offsets++=sourceIndex;
  1337. U_FALLTHROUGH;
  1338. case 1:
  1339. *target++=(uint8_t)c;
  1340. *offsets++=sourceIndex;
  1341. U_FALLTHROUGH;
  1342. default:
  1343. /* will never occur */
  1344. break;
  1345. }
  1346. }
  1347. targetCapacity-=length;
  1348. /* normal end of conversion: prepare for a new character */
  1349. c=0;
  1350. sourceIndex=nextSourceIndex;
  1351. goto loop;
  1352. } else {
  1353. uint8_t *p;
  1354. /*
  1355. * We actually do this backwards here:
  1356. * In order to save an intermediate variable, we output
  1357. * first to the overflow buffer what does not fit into the
  1358. * regular target.
  1359. */
  1360. /* we know that 0<=targetCapacity<length<=4 */
  1361. /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
  /* from here on, length is the number of trailing bytes that go to the overflow buffer */
  1362. length-=targetCapacity;
  1363. p=(uint8_t *)cnv->charErrorBuffer;
  1364. switch(length) {
  1365. /* each branch falls through to the next one */
  1366. case 4:
  1367. *p++=(uint8_t)(c>>24);
  1368. U_FALLTHROUGH;
  1369. case 3:
  1370. *p++=(uint8_t)(c>>16);
  1371. U_FALLTHROUGH;
  1372. case 2:
  1373. *p++=(uint8_t)(c>>8);
  1374. U_FALLTHROUGH;
  1375. case 1:
  1376. *p=(uint8_t)c;
  1377. U_FALLTHROUGH;
  1378. default:
  1379. /* will never occur */
  1380. break;
  1381. }
  1382. cnv->charErrorBufferLength=(int8_t)length;
  1383. /* now output what fits into the regular target */
  1384. c>>=8*length; /* length was reduced by targetCapacity */
  1385. switch(targetCapacity) {
  1386. /* each branch falls through to the next one */
  1387. case 3:
  1388. *target++=(uint8_t)(c>>16);
  1389. if(offsets!=nullptr) {
  1390. *offsets++=sourceIndex;
  1391. }
  1392. U_FALLTHROUGH;
  1393. case 2:
  1394. *target++=(uint8_t)(c>>8);
  1395. if(offsets!=nullptr) {
  1396. *offsets++=sourceIndex;
  1397. }
  1398. U_FALLTHROUGH;
  1399. case 1:
  1400. *target++=(uint8_t)c;
  1401. if(offsets!=nullptr) {
  1402. *offsets++=sourceIndex;
  1403. }
  1404. U_FALLTHROUGH;
  1405. default:
  1406. break;
  1407. }
  1408. /* target overflow */
  1409. targetCapacity=0;
  1410. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  /* the character has been fully consumed (target plus overflow buffer), so nothing is pending */
  1411. c=0;
  1412. goto endloop;
  1413. }
  1414. }
  1415. /*
  1416. * Identical to _SCSUFromUnicodeWithOffsets but without offset handling.
  1417. * If a change is made in the original function, then either
  1418. * change this function the same way or
  1419. * re-copy the original function and remove the variables
  1420. * offsets, sourceIndex, and nextSourceIndex.
  1421. */
  1422. static void U_CALLCONV
  1423. _SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs,
  1424. UErrorCode *pErrorCode) {
  1425. UConverter *cnv;
  1426. SCSUData *scsu;
  1427. const char16_t *source, *sourceLimit;
  1428. uint8_t *target;
  1429. int32_t targetCapacity;
  1430. UBool isSingleByteMode;
  1431. uint8_t dynamicWindow;
  1432. uint32_t currentOffset;
  1433. uint32_t c, delta;
  1434. int32_t length;
  1435. /* variables for compression heuristics */
  1436. uint32_t offset;
  1437. char16_t lead, trail;
  1438. int code;
  1439. int8_t window;
  1440. /* set up the local pointers */
  1441. cnv=pArgs->converter;
  1442. scsu=(SCSUData *)cnv->extraInfo;
  1443. /* set up the local pointers */
  1444. source=pArgs->source;
  1445. sourceLimit=pArgs->sourceLimit;
  1446. target=(uint8_t *)pArgs->target;
  1447. targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
  1448. /* get the state machine state */
  1449. isSingleByteMode=scsu->fromUIsSingleByteMode;
  1450. dynamicWindow=scsu->fromUDynamicWindow;
  1451. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1452. c=cnv->fromUChar32;
  1453. /* similar conversion "loop" as in toUnicode */
  1454. loop:
  1455. if(isSingleByteMode) {
  1456. if(c!=0 && targetCapacity>0) {
  1457. goto getTrailSingle;
  1458. }
  1459. /* state machine for single-byte mode */
  1460. /* singleByteMode: */
  1461. while(source<sourceLimit) {
  1462. if(targetCapacity<=0) {
  1463. /* target is full */
  1464. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1465. break;
  1466. }
  1467. c=*source++;
  1468. if((c-0x20)<=0x5f) {
  1469. /* pass US-ASCII graphic character through */
  1470. *target++=(uint8_t)c;
  1471. --targetCapacity;
  1472. } else if(c<0x20) {
  1473. if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) {
  1474. /* CR/LF/TAB/NUL */
  1475. *target++=(uint8_t)c;
  1476. --targetCapacity;
  1477. } else {
  1478. /* quote C0 control character */
  1479. c|=SQ0<<8;
  1480. length=2;
  1481. goto outputBytes;
  1482. }
  1483. } else if((delta=c-currentOffset)<=0x7f) {
  1484. /* use the current dynamic window */
  1485. *target++=(uint8_t)(delta|0x80);
  1486. --targetCapacity;
  1487. } else if(U16_IS_SURROGATE(c)) {
  1488. if(U16_IS_SURROGATE_LEAD(c)) {
  1489. getTrailSingle:
  1490. lead=(char16_t)c;
  1491. if(source<sourceLimit) {
  1492. /* test the following code unit */
  1493. trail=*source;
  1494. if(U16_IS_TRAIL(trail)) {
  1495. ++source;
  1496. c=U16_GET_SUPPLEMENTARY(c, trail);
  1497. /* convert this surrogate code point */
  1498. /* exit this condition tree */
  1499. } else {
  1500. /* this is an unmatched lead code unit (1st surrogate) */
  1501. /* callback(illegal) */
  1502. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1503. goto endloop;
  1504. }
  1505. } else {
  1506. /* no more input */
  1507. break;
  1508. }
  1509. } else {
  1510. /* this is an unmatched trail code unit (2nd surrogate) */
  1511. /* callback(illegal) */
  1512. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1513. goto endloop;
  1514. }
  1515. /* compress supplementary character U+10000..U+10ffff */
  1516. if((delta=c-currentOffset)<=0x7f) {
  1517. /* use the current dynamic window */
  1518. *target++=(uint8_t)(delta|0x80);
  1519. --targetCapacity;
  1520. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1521. /* there is a dynamic window that contains this character, change to it */
  1522. dynamicWindow=window;
  1523. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1524. useDynamicWindow(scsu, dynamicWindow);
  1525. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1526. length=2;
  1527. goto outputBytes;
  1528. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1529. /* might check if there are more characters in this window to come */
  1530. /* define an extended window with this character */
  1531. code-=0x200;
  1532. dynamicWindow=getNextDynamicWindow(scsu);
  1533. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1534. useDynamicWindow(scsu, dynamicWindow);
  1535. c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1536. length=4;
  1537. goto outputBytes;
  1538. } else {
  1539. /* change to Unicode mode and output this (lead, trail) pair */
  1540. isSingleByteMode=false;
  1541. *target++=(uint8_t)SCU;
  1542. --targetCapacity;
  1543. c=((uint32_t)lead<<16)|trail;
  1544. length=4;
  1545. goto outputBytes;
  1546. }
  1547. } else if(c<0xa0) {
  1548. /* quote C1 control character */
  1549. c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */
  1550. length=2;
  1551. goto outputBytes;
  1552. } else if(c==0xfeff || c>=0xfff0) {
  1553. /* quote signature character=byte order mark and specials */
  1554. c|=SQU<<16;
  1555. length=3;
  1556. goto outputBytes;
  1557. } else {
  1558. /* compress all other BMP characters */
  1559. if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1560. /* there is a window defined that contains this character - switch to it or quote from it? */
  1561. if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) {
  1562. /* change to dynamic window */
  1563. dynamicWindow=window;
  1564. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1565. useDynamicWindow(scsu, dynamicWindow);
  1566. c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1567. length=2;
  1568. goto outputBytes;
  1569. } else {
  1570. /* quote from dynamic window */
  1571. c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80;
  1572. length=2;
  1573. goto outputBytes;
  1574. }
  1575. } else if((window=getWindow(staticOffsets, c))>=0) {
  1576. /* quote from static window */
  1577. c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]);
  1578. length=2;
  1579. goto outputBytes;
  1580. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1581. /* define a dynamic window with this character */
  1582. dynamicWindow=getNextDynamicWindow(scsu);
  1583. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1584. useDynamicWindow(scsu, dynamicWindow);
  1585. c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1586. length=3;
  1587. goto outputBytes;
  1588. } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) &&
  1589. (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1590. ) {
  1591. /*
  1592. * this character is not compressible (a BMP ideograph or similar);
  1593. * switch to Unicode mode if this is the last character in the block
  1594. * or there is at least one more ideograph following immediately
  1595. */
  1596. isSingleByteMode=false;
  1597. c|=SCU<<16;
  1598. length=3;
  1599. goto outputBytes;
  1600. } else {
  1601. /* quote Unicode */
  1602. c|=SQU<<16;
  1603. length=3;
  1604. goto outputBytes;
  1605. }
  1606. }
  1607. /* normal end of conversion: prepare for a new character */
  1608. c=0;
  1609. }
  1610. } else {
  1611. if(c!=0 && targetCapacity>0) {
  1612. goto getTrailUnicode;
  1613. }
  1614. /* state machine for Unicode mode */
  1615. /* unicodeByteMode: */
  1616. while(source<sourceLimit) {
  1617. if(targetCapacity<=0) {
  1618. /* target is full */
  1619. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1620. break;
  1621. }
  1622. c=*source++;
  1623. if((uint32_t)(c-0x3400)<(0xd800-0x3400)) {
  1624. /* not compressible, write character directly */
  1625. if(targetCapacity>=2) {
  1626. *target++=(uint8_t)(c>>8);
  1627. *target++=(uint8_t)c;
  1628. targetCapacity-=2;
  1629. } else {
  1630. length=2;
  1631. goto outputBytes;
  1632. }
  1633. } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) {
  1634. /* compress BMP character if the following one is not an uncompressible ideograph */
  1635. if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) {
  1636. if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) {
  1637. /* ASCII digit or letter */
  1638. isSingleByteMode=true;
  1639. c|=((uint32_t)(UC0+dynamicWindow)<<8)|c;
  1640. length=2;
  1641. goto outputBytes;
  1642. } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) {
  1643. /* there is a dynamic window that contains this character, change to it */
  1644. isSingleByteMode=true;
  1645. dynamicWindow=window;
  1646. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1647. useDynamicWindow(scsu, dynamicWindow);
  1648. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1649. length=2;
  1650. goto outputBytes;
  1651. } else if((code=getDynamicOffset(c, &offset))>=0) {
  1652. /* define a dynamic window with this character */
  1653. isSingleByteMode=true;
  1654. dynamicWindow=getNextDynamicWindow(scsu);
  1655. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1656. useDynamicWindow(scsu, dynamicWindow);
  1657. c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1658. length=3;
  1659. goto outputBytes;
  1660. }
  1661. }
  1662. /* don't know how to compress this character, just write it directly */
  1663. length=2;
  1664. goto outputBytes;
  1665. } else if(c<0xe000) {
  1666. /* c is a surrogate */
  1667. if(U16_IS_SURROGATE_LEAD(c)) {
  1668. getTrailUnicode:
  1669. lead=(char16_t)c;
  1670. if(source<sourceLimit) {
  1671. /* test the following code unit */
  1672. trail=*source;
  1673. if(U16_IS_TRAIL(trail)) {
  1674. ++source;
  1675. c=U16_GET_SUPPLEMENTARY(c, trail);
  1676. /* convert this surrogate code point */
  1677. /* exit this condition tree */
  1678. } else {
  1679. /* this is an unmatched lead code unit (1st surrogate) */
  1680. /* callback(illegal) */
  1681. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1682. goto endloop;
  1683. }
  1684. } else {
  1685. /* no more input */
  1686. break;
  1687. }
  1688. } else {
  1689. /* this is an unmatched trail code unit (2nd surrogate) */
  1690. /* callback(illegal) */
  1691. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1692. goto endloop;
  1693. }
  1694. /* compress supplementary character */
  1695. if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 &&
  1696. !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))
  1697. ) {
  1698. /*
  1699. * there is a dynamic window that contains this character and
  1700. * the following character is not uncompressible,
  1701. * change to the window
  1702. */
  1703. isSingleByteMode=true;
  1704. dynamicWindow=window;
  1705. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow];
  1706. useDynamicWindow(scsu, dynamicWindow);
  1707. c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80;
  1708. length=2;
  1709. goto outputBytes;
  1710. } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */
  1711. (code=getDynamicOffset(c, &offset))>=0
  1712. ) {
  1713. /* two supplementary characters in (probably) the same window - define an extended one */
  1714. isSingleByteMode=true;
  1715. code-=0x200;
  1716. dynamicWindow=getNextDynamicWindow(scsu);
  1717. currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset;
  1718. useDynamicWindow(scsu, dynamicWindow);
  1719. c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80;
  1720. length=4;
  1721. goto outputBytes;
  1722. } else {
  1723. /* don't know how to compress this character, just write it directly */
  1724. c=((uint32_t)lead<<16)|trail;
  1725. length=4;
  1726. goto outputBytes;
  1727. }
  1728. } else /* 0xe000<=c<0xf300 */ {
  1729. /* quote to avoid SCSU tags */
  1730. c|=UQU<<16;
  1731. length=3;
  1732. goto outputBytes;
  1733. }
  1734. /* normal end of conversion: prepare for a new character */
  1735. c=0;
  1736. }
  1737. }
  1738. endloop:
  1739. /* set the converter state back into UConverter */
  1740. scsu->fromUIsSingleByteMode=isSingleByteMode;
  1741. scsu->fromUDynamicWindow=dynamicWindow;
  1742. cnv->fromUChar32=c;
  1743. /* write back the updated pointers */
  1744. pArgs->source=source;
  1745. pArgs->target=(char *)target;
  1746. return;
  1747. outputBytes:
  1748. /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */
  1749. /* from the first if in the loop we know that targetCapacity>0 */
  1750. if(length<=targetCapacity) {
  1751. switch(length) {
  1752. /* each branch falls through to the next one */
  1753. case 4:
  1754. *target++=(uint8_t)(c>>24);
  1755. U_FALLTHROUGH;
  1756. case 3:
  1757. *target++=(uint8_t)(c>>16);
  1758. U_FALLTHROUGH;
  1759. case 2:
  1760. *target++=(uint8_t)(c>>8);
  1761. U_FALLTHROUGH;
  1762. case 1:
  1763. *target++=(uint8_t)c;
  1764. U_FALLTHROUGH;
  1765. default:
  1766. /* will never occur */
  1767. break;
  1768. }
  1769. targetCapacity-=length;
  1770. /* normal end of conversion: prepare for a new character */
  1771. c=0;
  1772. goto loop;
  1773. } else {
  1774. uint8_t *p;
  1775. /*
  1776. * We actually do this backwards here:
  1777. * In order to save an intermediate variable, we output
  1778. * first to the overflow buffer what does not fit into the
  1779. * regular target.
  1780. */
  1781. /* we know that 0<=targetCapacity<length<=4 */
  1782. /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */
  1783. length-=targetCapacity;
  1784. p=(uint8_t *)cnv->charErrorBuffer;
  1785. switch(length) {
  1786. /* each branch falls through to the next one */
  1787. case 4:
  1788. *p++=(uint8_t)(c>>24);
  1789. U_FALLTHROUGH;
  1790. case 3:
  1791. *p++=(uint8_t)(c>>16);
  1792. U_FALLTHROUGH;
  1793. case 2:
  1794. *p++=(uint8_t)(c>>8);
  1795. U_FALLTHROUGH;
  1796. case 1:
  1797. *p=(uint8_t)c;
  1798. U_FALLTHROUGH;
  1799. default:
  1800. /* will never occur */
  1801. break;
  1802. }
  1803. cnv->charErrorBufferLength=(int8_t)length;
  1804. /* now output what fits into the regular target */
  1805. c = (length == 4) ? 0 : c >> 8*length; /* length was reduced by targetCapacity */
  1806. switch(targetCapacity) {
  1807. /* each branch falls through to the next one */
  1808. case 3:
  1809. *target++=(uint8_t)(c>>16);
  1810. U_FALLTHROUGH;
  1811. case 2:
  1812. *target++=(uint8_t)(c>>8);
  1813. U_FALLTHROUGH;
  1814. case 1:
  1815. *target++=(uint8_t)c;
  1816. U_FALLTHROUGH;
  1817. default:
  1818. break;
  1819. }
  1820. /* target overflow */
  1821. targetCapacity=0;
  1822. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1823. c=0;
  1824. goto endloop;
  1825. }
  1826. }
  1827. /* miscellaneous ------------------------------------------------------------ */
  1828. static const char * U_CALLCONV
  1829. _SCSUGetName(const UConverter *cnv) {
  1830. SCSUData *scsu=(SCSUData *)cnv->extraInfo;
  1831. switch(scsu->locale) {
  1832. case l_ja:
  1833. return "SCSU,locale=ja";
  1834. default:
  1835. return "SCSU";
  1836. }
  1837. }
  1838. /* structure for SafeClone calculations */
  1839. struct cloneSCSUStruct
  1840. {
  1841. UConverter cnv;
  1842. SCSUData mydata;
  1843. };
  1844. static UConverter * U_CALLCONV
  1845. _SCSUSafeClone(const UConverter *cnv,
  1846. void *stackBuffer,
  1847. int32_t *pBufferSize,
  1848. UErrorCode *status)
  1849. {
  1850. struct cloneSCSUStruct * localClone;
  1851. int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct);
  1852. if (U_FAILURE(*status)){
  1853. return 0;
  1854. }
  1855. if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
  1856. *pBufferSize = bufferSizeNeeded;
  1857. return 0;
  1858. }
  1859. localClone = (struct cloneSCSUStruct *)stackBuffer;
  1860. /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
  1861. uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData));
  1862. localClone->cnv.extraInfo = &localClone->mydata;
  1863. localClone->cnv.isExtraLocal = true;
  1864. return &localClone->cnv;
  1865. }
  1866. U_CDECL_END
  1867. static const UConverterImpl _SCSUImpl={
  1868. UCNV_SCSU,
  1869. nullptr,
  1870. nullptr,
  1871. _SCSUOpen,
  1872. _SCSUClose,
  1873. _SCSUReset,
  1874. _SCSUToUnicode,
  1875. _SCSUToUnicodeWithOffsets,
  1876. _SCSUFromUnicode,
  1877. _SCSUFromUnicodeWithOffsets,
  1878. nullptr,
  1879. nullptr,
  1880. _SCSUGetName,
  1881. nullptr,
  1882. _SCSUSafeClone,
  1883. ucnv_getCompleteUnicodeSet,
  1884. nullptr,
  1885. nullptr
  1886. };
  1887. static const UConverterStaticData _SCSUStaticData={
  1888. sizeof(UConverterStaticData),
  1889. "SCSU",
  1890. 1212, /* CCSID for SCSU */
  1891. UCNV_IBM, UCNV_SCSU,
  1892. 1, 3, /* one char16_t generates at least 1 byte and at most 3 bytes */
  1893. /*
  1894. * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode
  1895. * substitution string.
  1896. */
  1897. { 0x0e, 0xff, 0xfd, 0 }, 3,
  1898. false, false,
  1899. 0,
  1900. 0,
  1901. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1902. };
  1903. const UConverterSharedData _SCSUData=
  1904. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl);
  1905. #endif