ucnv_u7.cpp 55 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2002-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. * file name: ucnv_u7.c
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2002jul01
  14. * created by: Markus W. Scherer
  15. *
  16. * UTF-7 converter implementation. Used to be in ucnv_utf.c.
  17. */
  18. #include "unicode/utypes.h"
  19. #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
  20. #include "cmemory.h"
  21. #include "unicode/ucnv.h"
  22. #include "ucnv_bld.h"
  23. #include "ucnv_cnv.h"
  24. #include "uassert.h"
  25. /* UTF-7 -------------------------------------------------------------------- */
  26. /*
  27. * UTF-7 is a stateful encoding of Unicode.
  28. * It is defined in RFC 2152. (http://www.ietf.org/rfc/rfc2152.txt)
  29. * It was intended for use in Internet email systems, using in its bytewise
  30. * encoding only a subset of 7-bit US-ASCII.
  31. * UTF-7 is deprecated in favor of UTF-8/16/32 and SCSU, but still
  32. * occasionally used.
  33. *
  34. * When converting Unicode to UTF-7, the RFC allows some US-ASCII characters
  35. * to be encoded either directly or in base64. In particular, the characters
  36. * in set O as defined in the RFC (see below) may be encoded directly but are
  37. * not allowed in, e.g., email headers.
  38. * By default, the ICU UTF-7 converter encodes set O directly.
  39. * By choosing the option "version=1", set O will be escaped instead.
  40. * For example:
  41. * utf7Converter=ucnv_open("UTF-7,version=1");
  42. *
  43. * For details about email headers see RFC 2047.
  44. */
  45. /*
  46. * Tests for US-ASCII characters belonging to character classes
  47. * defined in UTF-7.
  48. *
  49. * Set D (directly encoded characters) consists of the following
  50. * characters: the upper and lower case letters A through Z
  51. * and a through z, the 10 digits 0-9, and the following nine special
  52. * characters (note that "+" and "=" are omitted):
  53. * '(),-./:?
  54. *
  55. * Set O (optional direct characters) consists of the following
  56. * characters (note that "\" and "~" are omitted):
  57. * !"#$%&*;<=>@[]^_`{|}
  58. *
  59. * According to the rules in RFC 2152, the byte values for the following
  60. * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  61. * - all C0 control codes except for CR LF TAB
  62. * - BACKSLASH
  63. * - TILDE
  64. * - DEL
  65. * - all codes beyond US-ASCII, i.e. all >127
  66. */
  /*
   * Character-class tests for single byte values.
   * Each range test subtracts the first value of the range, truncates the
   * difference to uint8_t and compares it with the length of the range,
   * so that one unsigned comparison covers both ends of the range.
   */

  /* set D: a-z, A-Z, 0-9 and the nine specials '(),-./:? */
  #define inSetD(c) ( \
      (uint8_t)((c)-0x61)<26 || /* a-z */ \
      (uint8_t)((c)-0x41)<26 || /* A-Z */ \
      (uint8_t)((c)-0x30)<10 || /* 0-9 */ \
      (uint8_t)((c)-0x27)<3 ||  /* '() */ \
      (uint8_t)((c)-0x2c)<4 ||  /* ,-./ */ \
      (c)==0x3a ||              /* : */ \
      (c)==0x3f                 /* ? */ \
  )

  /* set O: !"#$%&*;<=>@[]^_`{|} */
  #define inSetO(c) ( \
      (uint8_t)((c)-0x21)<6 ||  /* !"#$%& */ \
      (uint8_t)((c)-0x3b)<4 ||  /* ;<=> */ \
      (uint8_t)((c)-0x5d)<4 ||  /* ]^_` */ \
      (uint8_t)((c)-0x7b)<3 ||  /* {|} */ \
      (c)==0x2a ||              /* * */ \
      (c)==0x40 ||              /* @ */ \
      (c)==0x5b                 /* [ */ \
  )

  /* CR, LF or TAB */
  #define isCRLFTAB(c) ((c)==0x0d || (c)==0x0a || (c)==0x09)

  /* SPACE, CR, LF or TAB */
  #define isCRLFSPTAB(c) ((c)==0x20 || (c)==0x0d || (c)==0x0a || (c)==0x09)

  /* byte values with a special role in UTF-7 */
  #define PLUS 0x2b
  #define MINUS 0x2d
  #define BACKSLASH 0x5c
  #define TILDE 0x7e

  /*
   * Legal byte values: 0x20..0x7d (space up to, but not including, tilde)
   * without the backslash, plus CR LF TAB.
   */
  #define isLegalUTF7(c) (((uint8_t)((c)-0x20)<94 && (c)!=BACKSLASH) || isCRLFTAB(c))
  89. /* encode directly sets D and O and CR LF SP TAB */
  90. static const UBool encodeDirectlyMaximum[128]={
  91. /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  92. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
  93. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  94. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
  95. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  96. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  97. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
  98. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  99. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0
  100. };
  101. /* encode directly set D and CR LF SP TAB but not set O */
  102. static const UBool encodeDirectlyRestricted[128]={
  103. /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
  104. 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
  105. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  106. 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
  107. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
  108. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  109. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
  110. 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  111. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
  112. };
  /* Maps a 6-bit value (0..63) to the byte value of its base64 digit. */
  static const uint8_t toBase64[64]={
      /*  0..25: A-Z */
      65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
      75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
      85, 86, 87, 88, 89, 90,

      /* 26..51: a-z */
      97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
      107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
      117, 118, 119, 120, 121, 122,

      /* 52..61: 0-9 */
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57,

      /* 62: + */
      43,
      /* 63: / */
      47
  };
  /*
   * Classifies a US-ASCII byte value (0..127) for the base64 decoder:
   *   0..63  the 6-bit value of a base64 digit
   *   -1     a legal character that is not a base64 digit
   *   -2     the minus sign
   *   -3     an illegal byte value
   */
  static const int8_t fromBase64[128]={
      /* 0x00: C0 controls, legal (-1) only for TAB LF CR */
      -3, -3, -3, -3, -3, -3, -3, -3, -3, -1, -1, -3, -3, -1, -3, -3,
      /* 0x10: C0 controls, all illegal */
      -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,

      /* 0x20: punctuation; '+' is 62, '-' is the special -2, '/' is 63 */
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -2, -1, 63,
      /* 0x30: digits 0-9, then punctuation */
      52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,

      /* 0x40: '@', A-O */
      -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
      /* 0x50: P-Z, then punctuation with an illegal backslash */
      15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -3, -1, -1, -1,

      /* 0x60: '`', a-o */
      -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
      /* 0x70: p-z, then punctuation with illegal tilde and DEL */
      41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -3, -3
  };
  142. /*
  143. * converter status values:
  144. *
  145. * toUnicodeStatus:
  146. * 24 inDirectMode (boolean)
  147. * 23..16 base64Counter (-1..7)
  148. * 15..0 bits (up to 14 bits incoming base64)
  149. *
  150. * fromUnicodeStatus:
  151. * 31..28 version (0: set O direct 1: set O escaped)
  152. * 24 inDirectMode (boolean)
  153. * 23..16 base64Counter (0..2)
  154. * 7..0 bits (6 bits outgoing base64)
  155. *
  156. */
  157. U_CDECL_BEGIN
  158. static void U_CALLCONV
  159. _UTF7Reset(UConverter *cnv, UConverterResetChoice choice) {
  160. if(choice<=UCNV_RESET_TO_UNICODE) {
  161. /* reset toUnicode */
  162. cnv->toUnicodeStatus=0x1000000; /* inDirectMode=true */
  163. cnv->toULength=0;
  164. }
  165. if(choice!=UCNV_RESET_TO_UNICODE) {
  166. /* reset fromUnicode */
  167. cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
  168. }
  169. }
  170. static void U_CALLCONV
  171. _UTF7Open(UConverter *cnv,
  172. UConverterLoadArgs *pArgs,
  173. UErrorCode *pErrorCode) {
  174. (void)pArgs;
  175. if(UCNV_GET_VERSION(cnv)<=1) {
  176. /* TODO(markus): Should just use cnv->options rather than copying the version number. */
  177. cnv->fromUnicodeStatus=UCNV_GET_VERSION(cnv)<<28;
  178. _UTF7Reset(cnv, UCNV_RESET_BOTH);
  179. } else {
  180. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  181. }
  182. }
  183. static void U_CALLCONV
  184. _UTF7ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  185. UErrorCode *pErrorCode) {
  186. UConverter *cnv;
  187. const uint8_t *source, *sourceLimit;
  188. char16_t *target;
  189. const char16_t *targetLimit;
  190. int32_t *offsets;
  191. uint8_t *bytes;
  192. uint8_t byteIndex;
  193. int32_t length, targetCapacity;
  194. /* UTF-7 state */
  195. uint16_t bits;
  196. int8_t base64Counter;
  197. UBool inDirectMode;
  198. int8_t base64Value;
  199. int32_t sourceIndex, nextSourceIndex;
  200. uint8_t b;
  201. /* set up the local pointers */
  202. cnv=pArgs->converter;
  203. source=(const uint8_t *)pArgs->source;
  204. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  205. target=pArgs->target;
  206. targetLimit=pArgs->targetLimit;
  207. offsets=pArgs->offsets;
  208. /* get the state machine state */
  209. {
  210. uint32_t status=cnv->toUnicodeStatus;
  211. inDirectMode=(UBool)((status>>24)&1);
  212. base64Counter=(int8_t)(status>>16);
  213. bits=(uint16_t)status;
  214. }
  215. bytes=cnv->toUBytes;
  216. byteIndex=cnv->toULength;
  217. /* sourceIndex=-1 if the current character began in the previous buffer */
  218. sourceIndex=byteIndex==0 ? 0 : -1;
  219. nextSourceIndex=0;
  220. if(inDirectMode) {
  221. directMode:
  222. /*
  223. * In Direct Mode, most US-ASCII characters are encoded directly, i.e.,
  224. * with their US-ASCII byte values.
  225. * Backslash and Tilde and most control characters are not allowed in UTF-7.
  226. * A plus sign starts Unicode (or "escape") Mode.
  227. *
  228. * In Direct Mode, only the sourceIndex is used.
  229. */
  230. byteIndex=0;
  231. length=(int32_t)(sourceLimit-source);
  232. targetCapacity=(int32_t)(targetLimit-target);
  233. if(length>targetCapacity) {
  234. length=targetCapacity;
  235. }
  236. while(length>0) {
  237. b=*source++;
  238. if(!isLegalUTF7(b)) {
  239. /* illegal */
  240. bytes[0]=b;
  241. byteIndex=1;
  242. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  243. break;
  244. } else if(b!=PLUS) {
  245. /* write directly encoded character */
  246. *target++=b;
  247. if(offsets!=nullptr) {
  248. *offsets++=sourceIndex++;
  249. }
  250. } else /* PLUS */ {
  251. /* switch to Unicode mode */
  252. nextSourceIndex=++sourceIndex;
  253. inDirectMode=false;
  254. byteIndex=0;
  255. bits=0;
  256. base64Counter=-1;
  257. goto unicodeMode;
  258. }
  259. --length;
  260. }
  261. if(source<sourceLimit && target>=targetLimit) {
  262. /* target is full */
  263. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  264. }
  265. } else {
  266. unicodeMode:
  267. /*
  268. * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
  269. * The base64 sequence ends with any character that is not in the base64 alphabet.
  270. * A terminating minus sign is consumed.
  271. *
  272. * In Unicode Mode, the sourceIndex has the index to the start of the current
  273. * base64 bytes, while nextSourceIndex is precisely parallel to source,
  274. * keeping the index to the following byte.
  275. * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
  276. */
  277. while(source<sourceLimit) {
  278. if(target<targetLimit) {
  279. bytes[byteIndex++]=b=*source++;
  280. ++nextSourceIndex;
  281. base64Value = -3; /* initialize as illegal */
  282. if(b>=126 || (base64Value=fromBase64[b])==-3 || base64Value==-1) {
  283. /* either
  284. * base64Value==-1 for any legal character except base64 and minus sign, or
  285. * base64Value==-3 for illegal characters:
  286. * 1. In either case, leave Unicode mode.
  287. * 2.1. If we ended with an incomplete char16_t or none after the +, then
  288. * generate an error for the preceding erroneous sequence and deal with
  289. * the current (possibly illegal) character next time through.
  290. * 2.2. Else the current char comes after a complete char16_t, which was already
  291. * pushed to the output buf, so:
  292. * 2.2.1. If the current char is legal, just save it for processing next time.
  293. * It may be for example, a plus which we need to deal with in direct mode.
  294. * 2.2.2. Else if the current char is illegal, we might as well deal with it here.
  295. */
  296. inDirectMode=true;
  297. if(base64Counter==-1) {
  298. /* illegal: + immediately followed by something other than base64 or minus sign */
  299. /* include the plus sign in the reported sequence, but not the subsequent char */
  300. --source;
  301. bytes[0]=PLUS;
  302. byteIndex=1;
  303. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  304. break;
  305. } else if(bits!=0) {
  306. /* bits are illegally left over, a char16_t is incomplete */
  307. /* don't include current char (legal or illegal) in error seq */
  308. --source;
  309. --byteIndex;
  310. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  311. break;
  312. } else {
  313. /* previous char16_t was complete */
  314. if(base64Value==-3) {
  315. /* current character is illegal, deal with it here */
  316. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  317. break;
  318. } else {
  319. /* un-read the current character in case it is a plus sign */
  320. --source;
  321. sourceIndex=nextSourceIndex-1;
  322. goto directMode;
  323. }
  324. }
  325. } else if(base64Value>=0) {
  326. /* collect base64 bytes into UChars */
  327. switch(base64Counter) {
  328. case -1: /* -1 is immediately after the + */
  329. case 0:
  330. bits=base64Value;
  331. base64Counter=1;
  332. break;
  333. case 1:
  334. case 3:
  335. case 4:
  336. case 6:
  337. bits=(uint16_t)((bits<<6)|base64Value);
  338. ++base64Counter;
  339. break;
  340. case 2:
  341. *target++=(char16_t)((bits<<4)|(base64Value>>2));
  342. if(offsets!=nullptr) {
  343. *offsets++=sourceIndex;
  344. sourceIndex=nextSourceIndex-1;
  345. }
  346. bytes[0]=b; /* keep this byte in case an error occurs */
  347. byteIndex=1;
  348. bits=(uint16_t)(base64Value&3);
  349. base64Counter=3;
  350. break;
  351. case 5:
  352. *target++=(char16_t)((bits<<2)|(base64Value>>4));
  353. if(offsets!=nullptr) {
  354. *offsets++=sourceIndex;
  355. sourceIndex=nextSourceIndex-1;
  356. }
  357. bytes[0]=b; /* keep this byte in case an error occurs */
  358. byteIndex=1;
  359. bits=(uint16_t)(base64Value&15);
  360. base64Counter=6;
  361. break;
  362. case 7:
  363. *target++=(char16_t)((bits<<6)|base64Value);
  364. if(offsets!=nullptr) {
  365. *offsets++=sourceIndex;
  366. sourceIndex=nextSourceIndex;
  367. }
  368. byteIndex=0;
  369. bits=0;
  370. base64Counter=0;
  371. break;
  372. default:
  373. /* will never occur */
  374. break;
  375. }
  376. } else /*base64Value==-2*/ {
  377. /* minus sign terminates the base64 sequence */
  378. inDirectMode=true;
  379. if(base64Counter==-1) {
  380. /* +- i.e. a minus immediately following a plus */
  381. *target++=PLUS;
  382. if(offsets!=nullptr) {
  383. *offsets++=sourceIndex-1;
  384. }
  385. } else {
  386. /* absorb the minus and leave the Unicode Mode */
  387. if(bits!=0) {
  388. /* bits are illegally left over, a char16_t is incomplete */
  389. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  390. break;
  391. }
  392. }
  393. sourceIndex=nextSourceIndex;
  394. goto directMode;
  395. }
  396. } else {
  397. /* target is full */
  398. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  399. break;
  400. }
  401. }
  402. }
  403. if(U_SUCCESS(*pErrorCode) && pArgs->flush && source==sourceLimit && bits==0) {
  404. /*
  405. * if we are in Unicode mode, then the byteIndex might not be 0,
  406. * but that is ok if bits==0
  407. * -> we set byteIndex=0 at the end of the stream to avoid a truncated error
  408. * (not true for IMAP-mailbox-name where we must end in direct mode)
  409. */
  410. byteIndex=0;
  411. }
  412. /* set the converter state back into UConverter */
  413. cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
  414. cnv->toULength=byteIndex;
  415. /* write back the updated pointers */
  416. pArgs->source=(const char *)source;
  417. pArgs->target=target;
  418. pArgs->offsets=offsets;
  419. return;
  420. }
  421. static void U_CALLCONV
  422. _UTF7FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  423. UErrorCode *pErrorCode) {
  424. UConverter *cnv;
  425. const char16_t *source, *sourceLimit;
  426. uint8_t *target, *targetLimit;
  427. int32_t *offsets;
  428. int32_t length, targetCapacity, sourceIndex;
  429. char16_t c;
  430. /* UTF-7 state */
  431. const UBool *encodeDirectly;
  432. uint8_t bits;
  433. int8_t base64Counter;
  434. UBool inDirectMode;
  435. /* set up the local pointers */
  436. cnv=pArgs->converter;
  437. /* set up the local pointers */
  438. source=pArgs->source;
  439. sourceLimit=pArgs->sourceLimit;
  440. target=(uint8_t *)pArgs->target;
  441. targetLimit=(uint8_t *)pArgs->targetLimit;
  442. offsets=pArgs->offsets;
  443. /* get the state machine state */
  444. {
  445. uint32_t status=cnv->fromUnicodeStatus;
  446. encodeDirectly= status<0x10000000 ? encodeDirectlyMaximum : encodeDirectlyRestricted;
  447. inDirectMode=(UBool)((status>>24)&1);
  448. base64Counter=(int8_t)(status>>16);
  449. bits=(uint8_t)status;
  450. U_ASSERT(bits<=UPRV_LENGTHOF(toBase64));
  451. }
  452. /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
  453. sourceIndex=0;
  454. if(inDirectMode) {
  455. directMode:
  456. length=(int32_t)(sourceLimit-source);
  457. targetCapacity=(int32_t)(targetLimit-target);
  458. if(length>targetCapacity) {
  459. length=targetCapacity;
  460. }
  461. while(length>0) {
  462. c=*source++;
  463. /* currently always encode CR LF SP TAB directly */
  464. if(c<=127 && encodeDirectly[c]) {
  465. /* encode directly */
  466. *target++=(uint8_t)c;
  467. if(offsets!=nullptr) {
  468. *offsets++=sourceIndex++;
  469. }
  470. } else if(c==PLUS) {
  471. /* output +- for + */
  472. *target++=PLUS;
  473. if(target<targetLimit) {
  474. *target++=MINUS;
  475. if(offsets!=nullptr) {
  476. *offsets++=sourceIndex;
  477. *offsets++=sourceIndex++;
  478. }
  479. /* realign length and targetCapacity */
  480. goto directMode;
  481. } else {
  482. if(offsets!=nullptr) {
  483. *offsets++=sourceIndex++;
  484. }
  485. cnv->charErrorBuffer[0]=MINUS;
  486. cnv->charErrorBufferLength=1;
  487. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  488. break;
  489. }
  490. } else {
  491. /* un-read this character and switch to Unicode Mode */
  492. --source;
  493. *target++=PLUS;
  494. if(offsets!=nullptr) {
  495. *offsets++=sourceIndex;
  496. }
  497. inDirectMode=false;
  498. base64Counter=0;
  499. goto unicodeMode;
  500. }
  501. --length;
  502. }
  503. if(source<sourceLimit && target>=targetLimit) {
  504. /* target is full */
  505. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  506. }
  507. } else {
  508. unicodeMode:
  509. while(source<sourceLimit) {
  510. if(target<targetLimit) {
  511. c=*source++;
  512. if(c<=127 && encodeDirectly[c]) {
  513. /* encode directly */
  514. inDirectMode=true;
  515. /* trick: back out this character to make this easier */
  516. --source;
  517. /* terminate the base64 sequence */
  518. if(base64Counter!=0) {
  519. /* write remaining bits for the previous character */
  520. *target++=toBase64[bits];
  521. if(offsets!=nullptr) {
  522. *offsets++=sourceIndex-1;
  523. }
  524. }
  525. if(fromBase64[c]!=-1) {
  526. /* need to terminate with a minus */
  527. if(target<targetLimit) {
  528. *target++=MINUS;
  529. if(offsets!=nullptr) {
  530. *offsets++=sourceIndex-1;
  531. }
  532. } else {
  533. cnv->charErrorBuffer[0]=MINUS;
  534. cnv->charErrorBufferLength=1;
  535. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  536. break;
  537. }
  538. }
  539. goto directMode;
  540. } else {
  541. /*
  542. * base64 this character:
  543. * Output 2 or 3 base64 bytes for the remaining bits of the previous character
  544. * and the bits of this character, each implicitly in UTF-16BE.
  545. *
  546. * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
  547. * character to the next. The actual 2 or 4 bits are shifted to the left edge
  548. * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
  549. */
  550. switch(base64Counter) {
  551. case 0:
  552. *target++=toBase64[c>>10];
  553. if(target<targetLimit) {
  554. *target++=toBase64[(c>>4)&0x3f];
  555. if(offsets!=nullptr) {
  556. *offsets++=sourceIndex;
  557. *offsets++=sourceIndex++;
  558. }
  559. } else {
  560. if(offsets!=nullptr) {
  561. *offsets++=sourceIndex++;
  562. }
  563. cnv->charErrorBuffer[0]=toBase64[(c>>4)&0x3f];
  564. cnv->charErrorBufferLength=1;
  565. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  566. }
  567. bits=(uint8_t)((c&15)<<2);
  568. base64Counter=1;
  569. break;
  570. case 1:
  571. *target++=toBase64[bits|(c>>14)];
  572. if(target<targetLimit) {
  573. *target++=toBase64[(c>>8)&0x3f];
  574. if(target<targetLimit) {
  575. *target++=toBase64[(c>>2)&0x3f];
  576. if(offsets!=nullptr) {
  577. *offsets++=sourceIndex;
  578. *offsets++=sourceIndex;
  579. *offsets++=sourceIndex++;
  580. }
  581. } else {
  582. if(offsets!=nullptr) {
  583. *offsets++=sourceIndex;
  584. *offsets++=sourceIndex++;
  585. }
  586. cnv->charErrorBuffer[0]=toBase64[(c>>2)&0x3f];
  587. cnv->charErrorBufferLength=1;
  588. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  589. }
  590. } else {
  591. if(offsets!=nullptr) {
  592. *offsets++=sourceIndex++;
  593. }
  594. cnv->charErrorBuffer[0]=toBase64[(c>>8)&0x3f];
  595. cnv->charErrorBuffer[1]=toBase64[(c>>2)&0x3f];
  596. cnv->charErrorBufferLength=2;
  597. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  598. }
  599. bits=(uint8_t)((c&3)<<4);
  600. base64Counter=2;
  601. break;
  602. case 2:
  603. *target++=toBase64[bits|(c>>12)];
  604. if(target<targetLimit) {
  605. *target++=toBase64[(c>>6)&0x3f];
  606. if(target<targetLimit) {
  607. *target++=toBase64[c&0x3f];
  608. if(offsets!=nullptr) {
  609. *offsets++=sourceIndex;
  610. *offsets++=sourceIndex;
  611. *offsets++=sourceIndex++;
  612. }
  613. } else {
  614. if(offsets!=nullptr) {
  615. *offsets++=sourceIndex;
  616. *offsets++=sourceIndex++;
  617. }
  618. cnv->charErrorBuffer[0]=toBase64[c&0x3f];
  619. cnv->charErrorBufferLength=1;
  620. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  621. }
  622. } else {
  623. if(offsets!=nullptr) {
  624. *offsets++=sourceIndex++;
  625. }
  626. cnv->charErrorBuffer[0]=toBase64[(c>>6)&0x3f];
  627. cnv->charErrorBuffer[1]=toBase64[c&0x3f];
  628. cnv->charErrorBufferLength=2;
  629. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  630. }
  631. bits=0;
  632. base64Counter=0;
  633. break;
  634. default:
  635. /* will never occur */
  636. break;
  637. }
  638. }
  639. } else {
  640. /* target is full */
  641. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  642. break;
  643. }
  644. }
  645. }
  646. if(pArgs->flush && source>=sourceLimit) {
  647. /* flush remaining bits to the target */
  648. if(!inDirectMode) {
  649. if (base64Counter!=0) {
  650. if(target<targetLimit) {
  651. *target++=toBase64[bits];
  652. if(offsets!=nullptr) {
  653. *offsets++=sourceIndex-1;
  654. }
  655. } else {
  656. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=toBase64[bits];
  657. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  658. }
  659. }
  660. /* Add final MINUS to terminate unicodeMode */
  661. if(target<targetLimit) {
  662. *target++=MINUS;
  663. if(offsets!=nullptr) {
  664. *offsets++=sourceIndex-1;
  665. }
  666. } else {
  667. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
  668. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  669. }
  670. }
  671. /* reset the state for the next conversion */
  672. cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
  673. } else {
  674. /* set the converter state back into UConverter */
  675. cnv->fromUnicodeStatus=
  676. (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
  677. ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
  678. }
  679. /* write back the updated pointers */
  680. pArgs->source=source;
  681. pArgs->target=(char *)target;
  682. pArgs->offsets=offsets;
  683. return;
  684. }
  685. static const char * U_CALLCONV
  686. _UTF7GetName(const UConverter *cnv) {
  687. switch(cnv->fromUnicodeStatus>>28) {
  688. case 1:
  689. return "UTF-7,version=1";
  690. default:
  691. return "UTF-7";
  692. }
  693. }
  694. U_CDECL_END
  695. static const UConverterImpl _UTF7Impl={
  696. UCNV_UTF7,
  697. nullptr,
  698. nullptr,
  699. _UTF7Open,
  700. nullptr,
  701. _UTF7Reset,
  702. _UTF7ToUnicodeWithOffsets,
  703. _UTF7ToUnicodeWithOffsets,
  704. _UTF7FromUnicodeWithOffsets,
  705. _UTF7FromUnicodeWithOffsets,
  706. nullptr,
  707. nullptr,
  708. _UTF7GetName,
  709. nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
  710. nullptr,
  711. ucnv_getCompleteUnicodeSet,
  712. nullptr,
  713. nullptr
  714. };
  715. static const UConverterStaticData _UTF7StaticData={
  716. sizeof(UConverterStaticData),
  717. "UTF-7",
  718. 0, /* TODO CCSID for UTF-7 */
  719. UCNV_IBM, UCNV_UTF7,
  720. 1, 4,
  721. { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
  722. false, false,
  723. 0,
  724. 0,
  725. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  726. };
  727. const UConverterSharedData _UTF7Data=
  728. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF7StaticData, &_UTF7Impl);
  729. /* IMAP mailbox name encoding ----------------------------------------------- */
  730. /*
  731. * RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1
  732. * http://www.ietf.org/rfc/rfc2060.txt
  733. *
  734. * 5.1.3. Mailbox International Naming Convention
  735. *
  736. * By convention, international mailbox names are specified using a
  737. * modified version of the UTF-7 encoding described in [UTF-7]. The
  738. * purpose of these modifications is to correct the following problems
  739. * with UTF-7:
  740. *
  741. * 1) UTF-7 uses the "+" character for shifting; this conflicts with
  742. * the common use of "+" in mailbox names, in particular USENET
  743. * newsgroup names.
  744. *
  745. * 2) UTF-7's encoding is BASE64 which uses the "/" character; this
  746. * conflicts with the use of "/" as a popular hierarchy delimiter.
  747. *
  748. * 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
  749. * the use of "\" as a popular hierarchy delimiter.
  750. *
  751. * 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
  752. * the use of "~" in some servers as a home directory indicator.
  753. *
  754. * 5) UTF-7 permits multiple alternate forms to represent the same
  755. * string; in particular, printable US-ASCII characters can be
  756. * represented in encoded form.
  757. *
  758. * In modified UTF-7, printable US-ASCII characters except for "&"
  759. * represent themselves; that is, characters with octet values 0x20-0x25
  760. * and 0x27-0x7e. The character "&" (0x26) is represented by the two-
  761. * octet sequence "&-".
  762. *
  763. * All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
  764. * Unicode 16-bit octets) are represented in modified BASE64, with a
  765. * further modification from [UTF-7] that "," is used instead of "/".
  766. * Modified BASE64 MUST NOT be used to represent any printing US-ASCII
  767. * character which can represent itself.
  768. *
  769. * "&" is used to shift to modified BASE64 and "-" to shift back to US-
  770. * ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that
  771. * is, a name that ends with a Unicode 16-bit octet MUST end with a
  772. * "-").
  773. *
  774. * For example, here is a mailbox name which mixes English, Japanese,
  775. * and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
  776. */
  777. /*
  778. * Tests for US-ASCII characters belonging to character classes
  779. * defined in UTF-7.
  780. *
  781. * Set D (directly encoded characters) consists of the following
  782. * characters: the upper and lower case letters A through Z
  783. * and a through z, the 10 digits 0-9, and the following nine special
  784. * characters (note that "+" and "=" are omitted):
  785. * '(),-./:?
  786. *
  787. * Set O (optional direct characters) consists of the following
  788. * characters (note that "\" and "~" are omitted):
  789. * !"#$%&*;<=>@[]^_`{|}
  790. *
  791. * According to the rules in RFC 2152, the byte values for the following
  792. * US-ASCII characters are not used in UTF-7 and are therefore illegal:
  793. * - all C0 control codes except for CR LF TAB
  794. * - BACKSLASH
  795. * - TILDE
  796. * - DEL
  797. * - all codes beyond US-ASCII, i.e. all >127
  798. */
  /*
   * IMAP mailbox names use '&' (not '+') to start a base64 sequence
   * and ',' (not '/') as the 64th base64 digit.
   */
  #define AMPERSAND 0x26
  #define COMMA 0x2c
  #define SLASH 0x2f

  /* legal byte values: all printable US-ASCII characters 0x20..0x7e (0x20 is SPACE) */
  #define isLegalIMAP(c) (0x20<=(c) && (c)<=0x7e)

  /*
   * direct-encode all of printable ASCII 0x20..0x7e except '&' 0x26
   *
   * Every use of the argument is parenthesized so that an expression argument
   * (for example a conditional or a bitwise OR) is compared as a whole.
   * Note: the argument is evaluated more than once; do not pass expressions
   * with side effects.
   */
  #define inSetDIMAP(c) (isLegalIMAP(c) && (c)!=AMPERSAND)

  /*
   * base64 digit value -> output byte: values below 63 are looked up in
   * toBase64[] (defined outside this block), 63 and above yield ','.
   */
  #define TO_BASE64_IMAP(n) ((n)<63 ? toBase64[n] : COMMA)

  /*
   * input byte -> base64 digit value: ',' is 63, '/' yields -1,
   * every other byte is looked up in fromBase64[] (defined outside this block).
   */
  #define FROM_BASE64_IMAP(c) ((c)==COMMA ? 63 : (c)==SLASH ? -1 : fromBase64[c])
  809. /*
  810. * converter status values:
  811. *
  812. * toUnicodeStatus:
  813. * 24 inDirectMode (boolean)
  814. * 23..16 base64Counter (-1..7)
  815. * 15..0 bits (up to 14 bits incoming base64)
  816. *
  817. * fromUnicodeStatus:
  818. * 24 inDirectMode (boolean)
  819. * 23..16 base64Counter (0..2)
  820. * 7..0 bits (6 bits outgoing base64)
  821. *
  822. * ignore bits 31..25
  823. */
  824. U_CDECL_BEGIN
  /*
   * Converts IMAP mailbox-name bytes (modified UTF-7) to UTF-16 code units.
   *
   * pArgs      - supplies the converter, the source/sourceLimit byte range,
   *              the target/targetLimit char16_t range, the optional offsets
   *              array (may be nullptr) and the flush flag; source, target and
   *              offsets are written back advanced on return.
   * pErrorCode - set to U_ILLEGAL_CHAR_FOUND, U_BUFFER_OVERFLOW_ERROR or
   *              U_TRUNCATED_CHAR_FOUND when the corresponding condition is hit.
   *
   * The state (inDirectMode, base64Counter, bits) is unpacked from
   * cnv->toUnicodeStatus on entry and packed back on exit; the bytes of the
   * current/offending sequence are kept in cnv->toUBytes with their count in
   * cnv->toULength.
   *
   * Two modes alternate via goto:
   *  - direct mode: bytes 0x20..0x7e other than '&' are copied through,
   *    '&' switches to Unicode mode, anything else is illegal;
   *  - Unicode mode: base64 digits are accumulated into char16_t values,
   *    '-' returns to direct mode ("&-" yields a literal '&').
   */
  825. static void U_CALLCONV
  826. _IMAPToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  827. UErrorCode *pErrorCode) {
  828. UConverter *cnv;
  829. const uint8_t *source, *sourceLimit;
  830. char16_t *target;
  831. const char16_t *targetLimit;
  832. int32_t *offsets;
  833. uint8_t *bytes;
  834. uint8_t byteIndex;
  835. int32_t length, targetCapacity;
  836. /* UTF-7 state */
  837. uint16_t bits;
  838. int8_t base64Counter;
  839. UBool inDirectMode;
  840. int8_t base64Value;
  841. int32_t sourceIndex, nextSourceIndex;
  842. char16_t c;
  843. uint8_t b;
  844. /* set up the local pointers */
  845. cnv=pArgs->converter;
  846. source=(const uint8_t *)pArgs->source;
  847. sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  848. target=pArgs->target;
  849. targetLimit=pArgs->targetLimit;
  850. offsets=pArgs->offsets;
  851. /* get the state machine state */
  852. {
  853. uint32_t status=cnv->toUnicodeStatus;
  854. inDirectMode=(UBool)((status>>24)&1);
  855. base64Counter=(int8_t)(status>>16);
  856. bits=(uint16_t)status;
  857. }
  858. bytes=cnv->toUBytes;
  859. byteIndex=cnv->toULength;
  860. /* sourceIndex=-1 if the current character began in the previous buffer */
  861. sourceIndex=byteIndex==0 ? 0 : -1;
  862. nextSourceIndex=0;
  863. if(inDirectMode) {
  864. directMode:
  865. /*
  866. * In Direct Mode, US-ASCII characters are encoded directly, i.e.,
  867. * with their US-ASCII byte values.
  868. * An ampersand starts Unicode (or "escape") Mode.
  869. *
  870. * In Direct Mode, only the sourceIndex is used.
  871. */
  872. byteIndex=0;
  /*
   * Each direct-mode byte produces exactly one char16_t, so iterating
   * min(bytes left, target room) times cannot overrun the target.
   */
  873. length=(int32_t)(sourceLimit-source);
  874. targetCapacity=(int32_t)(targetLimit-target);
  875. if(length>targetCapacity) {
  876. length=targetCapacity;
  877. }
  878. while(length>0) {
  879. b=*source++;
  880. if(!isLegalIMAP(b)) {
  881. /* illegal */
  882. bytes[0]=b;
  883. byteIndex=1;
  884. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  885. break;
  886. } else if(b!=AMPERSAND) {
  887. /* write directly encoded character */
  888. *target++=b;
  889. if(offsets!=nullptr) {
  890. *offsets++=sourceIndex++;
  891. }
  892. } else /* AMPERSAND */ {
  893. /* switch to Unicode mode */
  /* both indexes now point just past the '&'; the "&-" case below reports sourceIndex-1 for it */
  894. nextSourceIndex=++sourceIndex;
  895. inDirectMode=false;
  896. byteIndex=0;
  897. bits=0;
  898. base64Counter=-1;
  899. goto unicodeMode;
  900. }
  901. --length;
  902. }
  /* the clamped loop ended with input left over: the target, not the source, ran out */
  903. if(source<sourceLimit && target>=targetLimit) {
  904. /* target is full */
  905. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  906. }
  907. } else {
  908. unicodeMode:
  909. /*
  910. * In Unicode (or "escape") Mode, UTF-16BE is base64-encoded.
  911. * The base64 sequence ends with any character that is not in the base64 alphabet.
  912. * A terminating minus sign is consumed.
  913. * US-ASCII must not be base64-ed.
  914. *
  915. * In Unicode Mode, the sourceIndex has the index to the start of the current
  916. * base64 bytes, while nextSourceIndex is precisely parallel to source,
  917. * keeping the index to the following byte.
  918. * Note that in 2 out of 3 cases, UChars overlap within a base64 byte.
  919. */
  920. while(source<sourceLimit) {
  921. if(target<targetLimit) {
  /* remember every consumed byte so that an error can report the whole sequence */
  922. bytes[byteIndex++]=b=*source++;
  923. ++nextSourceIndex;
  924. if(b>0x7e) {
  925. /* illegal - test other illegal US-ASCII values by base64Value==-3 */
  926. inDirectMode=true;
  927. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  928. break;
  929. } else if((base64Value=FROM_BASE64_IMAP(b))>=0) {
  930. /* collect base64 bytes into UChars */
  /*
   * base64Counter is the number of digits consumed in the current group of
   * 8 base64 digits (8*6 bits = 3 char16_t); a char16_t is completed on the
   * 3rd, 6th and 8th digit (cases 2, 5 and 7).
   */
  931. switch(base64Counter) {
  932. case -1: /* -1 is immediately after the & */
  933. case 0:
  934. bits=base64Value;
  935. base64Counter=1;
  936. break;
  937. case 1:
  938. case 3:
  939. case 4:
  940. case 6:
  /* not enough bits for a char16_t yet: append 6 more bits */
  941. bits=(uint16_t)((bits<<6)|base64Value);
  942. ++base64Counter;
  943. break;
  944. case 2:
  /* 12 collected bits + top 4 bits of this digit; its low 2 bits start the next char16_t */
  945. c=(char16_t)((bits<<4)|(base64Value>>2));
  /* a base64-encoded value in 0x20..0x7e is rejected (see "US-ASCII must not be base64-ed" above) */
  946. if(isLegalIMAP(c)) {
  947. /* illegal */
  948. inDirectMode=true;
  949. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  950. goto endloop;
  951. }
  952. *target++=c;
  953. if(offsets!=nullptr) {
  954. *offsets++=sourceIndex;
  955. sourceIndex=nextSourceIndex-1;
  956. }
  957. bytes[0]=b; /* keep this byte in case an error occurs */
  958. byteIndex=1;
  959. bits=(uint16_t)(base64Value&3);
  960. base64Counter=3;
  961. break;
  962. case 5:
  /* 14 collected bits + top 2 bits of this digit; its low 4 bits start the next char16_t */
  963. c=(char16_t)((bits<<2)|(base64Value>>4));
  964. if(isLegalIMAP(c)) {
  965. /* illegal */
  966. inDirectMode=true;
  967. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  968. goto endloop;
  969. }
  970. *target++=c;
  971. if(offsets!=nullptr) {
  972. *offsets++=sourceIndex;
  973. sourceIndex=nextSourceIndex-1;
  974. }
  975. bytes[0]=b; /* keep this byte in case an error occurs */
  976. byteIndex=1;
  977. bits=(uint16_t)(base64Value&15);
  978. base64Counter=6;
  979. break;
  980. case 7:
  /* 10 collected bits + all 6 bits of this digit; nothing carries over */
  981. c=(char16_t)((bits<<6)|base64Value);
  982. if(isLegalIMAP(c)) {
  983. /* illegal */
  984. inDirectMode=true;
  985. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  986. goto endloop;
  987. }
  988. *target++=c;
  989. if(offsets!=nullptr) {
  990. *offsets++=sourceIndex;
  991. sourceIndex=nextSourceIndex;
  992. }
  993. byteIndex=0;
  994. bits=0;
  995. base64Counter=0;
  996. break;
  997. default:
  998. /* will never occur */
  999. break;
  1000. }
  1001. } else if(base64Value==-2) {
  1002. /* minus sign terminates the base64 sequence */
  1003. inDirectMode=true;
  1004. if(base64Counter==-1) {
  1005. /* &- i.e. a minus immediately following an ampersand */
  1006. *target++=AMPERSAND;
  1007. if(offsets!=nullptr) {
  1008. *offsets++=sourceIndex-1;
  1009. }
  1010. } else {
  1011. /* absorb the minus and leave the Unicode Mode */
  1012. if(bits!=0 || (base64Counter!=0 && base64Counter!=3 && base64Counter!=6)) {
  1013. /* bits are illegally left over, a char16_t is incomplete */
  1014. /* base64Counter other than 0, 3, 6 means non-minimal zero-padding, also illegal */
  1015. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1016. break;
  1017. }
  1018. }
  1019. sourceIndex=nextSourceIndex;
  1020. goto directMode;
  1021. } else {
  1022. if(base64Counter==-1) {
  1023. /* illegal: & immediately followed by something other than base64 or minus sign */
  1024. /* include the ampersand in the reported sequence */
  1025. --sourceIndex;
  1026. bytes[0]=AMPERSAND;
  1027. bytes[1]=b;
  1028. byteIndex=2;
  1029. }
  1030. /* base64Value==-1 for characters that are illegal only in Unicode mode */
  1031. /* base64Value==-3 for illegal characters */
  1032. /* illegal */
  1033. inDirectMode=true;
  1034. *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1035. break;
  1036. }
  1037. } else {
  1038. /* target is full */
  1039. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1040. break;
  1041. }
  1042. }
  1043. }
  1044. endloop:
  1045. /*
  1046. * the end of the input stream and detection of truncated input
  1047. * are handled by the framework, but here we must check if we are in Unicode
  1048. * mode and byteIndex==0 because we must end in direct mode
  1049. *
  1050. * conditions:
  1051. * successful
  1052. * in Unicode mode and byteIndex==0
  1053. * end of input and no truncated input
  1054. */
  1055. if( U_SUCCESS(*pErrorCode) &&
  1056. !inDirectMode && byteIndex==0 &&
  1057. pArgs->flush && source>=sourceLimit
  1058. ) {
  1059. if(base64Counter==-1) {
  1060. /* & at the very end of the input */
  1061. /* make the ampersand the reported sequence */
  1062. bytes[0]=AMPERSAND;
  1063. byteIndex=1;
  1064. }
  1065. /* else if(base64Counter!=-1) byteIndex remains 0 because there is no particular byte sequence */
  1066. inDirectMode=true; /* avoid looping */
  1067. *pErrorCode=U_TRUNCATED_CHAR_FOUND;
  1068. }
  1069. /* set the converter state back into UConverter */
  /* base64Counter goes through uint8_t first so that -1 occupies only bits 23..16 */
  1070. cnv->toUnicodeStatus=((uint32_t)inDirectMode<<24)|((uint32_t)((uint8_t)base64Counter)<<16)|(uint32_t)bits;
  1071. cnv->toULength=byteIndex;
  1072. /* write back the updated pointers */
  1073. pArgs->source=(const char *)source;
  1074. pArgs->target=target;
  1075. pArgs->offsets=offsets;
  1076. return;
  1077. }
  /*
   * Converts UTF-16 code units to IMAP mailbox-name bytes (modified UTF-7).
   *
   * pArgs      - supplies the converter, the source/sourceLimit char16_t range,
   *              the target/targetLimit byte range, the optional offsets array
   *              (may be nullptr) and the flush flag; source, target and
   *              offsets are written back advanced on return.
   * pErrorCode - set to U_BUFFER_OVERFLOW_ERROR when the target fills up; this
   *              function sets no other error code.
   *
   * Code units 0x20..0x7e other than '&' are written as-is, '&' is written
   * as "&-", and every other code unit is written as base64 digits between
   * a leading '&' and a terminating '-'.
   *
   * Output bytes of an already-started sequence that do not fit into the target
   * are stored in cnv->charErrorBuffer (count in cnv->charErrorBufferLength).
   *
   * The state (inDirectMode, base64Counter, bits) is unpacked from
   * cnv->fromUnicodeStatus on entry and packed back on exit; bits 31..28 of
   * that word are preserved.
   */
  1078. static void U_CALLCONV
  1079. _IMAPFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
  1080. UErrorCode *pErrorCode) {
  1081. UConverter *cnv;
  1082. const char16_t *source, *sourceLimit;
  1083. uint8_t *target, *targetLimit;
  1084. int32_t *offsets;
  1085. int32_t length, targetCapacity, sourceIndex;
  1086. char16_t c;
  1087. uint8_t b;
  1088. /* UTF-7 state */
  1089. uint8_t bits;
  1090. int8_t base64Counter;
  1091. UBool inDirectMode;
  1092. /* get the converter */
  1093. cnv=pArgs->converter;
  1094. /* set up the local pointers */
  1095. source=pArgs->source;
  1096. sourceLimit=pArgs->sourceLimit;
  1097. target=(uint8_t *)pArgs->target;
  1098. targetLimit=(uint8_t *)pArgs->targetLimit;
  1099. offsets=pArgs->offsets;
  1100. /* get the state machine state */
  1101. {
  1102. uint32_t status=cnv->fromUnicodeStatus;
  1103. inDirectMode=(UBool)((status>>24)&1);
  1104. base64Counter=(int8_t)(status>>16);
  1105. bits=(uint8_t)status;
  1106. }
  1107. /* UTF-7 always encodes UTF-16 code units, therefore we need only a simple sourceIndex */
  1108. sourceIndex=0;
  1109. if(inDirectMode) {
  1110. directMode:
  /*
   * A directly encoded code unit needs one target byte, so the loop runs
   * min(units left, target room) times; the branches that write more than
   * one byte re-check the target themselves and then leave this loop.
   */
  1111. length=(int32_t)(sourceLimit-source);
  1112. targetCapacity=(int32_t)(targetLimit-target);
  1113. if(length>targetCapacity) {
  1114. length=targetCapacity;
  1115. }
  1116. while(length>0) {
  1117. c=*source++;
  1118. /* encode 0x20..0x7e except '&' directly */
  1119. if(inSetDIMAP(c)) {
  1120. /* encode directly */
  1121. *target++=(uint8_t)c;
  1122. if(offsets!=nullptr) {
  1123. *offsets++=sourceIndex++;
  1124. }
  1125. } else if(c==AMPERSAND) {
  1126. /* output &- for & */
  1127. *target++=AMPERSAND;
  1128. if(target<targetLimit) {
  1129. *target++=MINUS;
  1130. if(offsets!=nullptr) {
  1131. *offsets++=sourceIndex;
  1132. *offsets++=sourceIndex++;
  1133. }
  1134. /* realign length and targetCapacity */
  1135. goto directMode;
  1136. } else {
  1137. if(offsets!=nullptr) {
  1138. *offsets++=sourceIndex++;
  1139. }
  /* the '&' was written but the '-' does not fit: park it in the error buffer */
  1140. cnv->charErrorBuffer[0]=MINUS;
  1141. cnv->charErrorBufferLength=1;
  1142. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1143. break;
  1144. }
  1145. } else {
  1146. /* un-read this character and switch to Unicode Mode */
  1147. --source;
  1148. *target++=AMPERSAND;
  1149. if(offsets!=nullptr) {
  1150. *offsets++=sourceIndex;
  1151. }
  1152. inDirectMode=false;
  1153. base64Counter=0;
  1154. goto unicodeMode;
  1155. }
  1156. --length;
  1157. }
  1158. if(source<sourceLimit && target>=targetLimit) {
  1159. /* target is full */
  1160. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1161. }
  1162. } else {
  1163. unicodeMode:
  1164. while(source<sourceLimit) {
  1165. if(target<targetLimit) {
  1166. c=*source++;
  1167. if(isLegalIMAP(c)) {
  1168. /* encode directly */
  1169. inDirectMode=true;
  1170. /* trick: back out this character to make this easier */
  1171. --source;
  1172. /* terminate the base64 sequence */
  1173. if(base64Counter!=0) {
  1174. /* write remaining bits for the previous character */
  1175. *target++=TO_BASE64_IMAP(bits);
  1176. if(offsets!=nullptr) {
  1177. *offsets++=sourceIndex-1;
  1178. }
  1179. }
  1180. /* need to terminate with a minus */
  1181. if(target<targetLimit) {
  1182. *target++=MINUS;
  1183. if(offsets!=nullptr) {
  1184. *offsets++=sourceIndex-1;
  1185. }
  1186. } else {
  1187. cnv->charErrorBuffer[0]=MINUS;
  1188. cnv->charErrorBufferLength=1;
  1189. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1190. break;
  1191. }
  1192. goto directMode;
  1193. } else {
  1194. /*
  1195. * base64 this character:
  1196. * Output 2 or 3 base64 bytes for the remaining bits of the previous character
  1197. * and the bits of this character, each implicitly in UTF-16BE.
  1198. *
  1199. * Here, bits is an 8-bit variable because only 6 bits need to be kept from one
  1200. * character to the next. The actual 2 or 4 bits are shifted to the left edge
  1201. * of the 6-bits field 5..0 to make the termination of the base64 sequence easier.
  1202. */
  /*
   * base64Counter is the position of this code unit within a group of three
   * (3 char16_t = 8 base64 digits). In each case the first digit is written
   * unconditionally (the target was checked at the top of the loop); digits
   * that no longer fit go to cnv->charErrorBuffer, and the loop then stops
   * at the next target check.
   */
  1203. switch(base64Counter) {
  1204. case 0:
  /* digits from bits 15..10 and 9..4; bits 3..0 are kept for the next digit */
  1205. b=(uint8_t)(c>>10);
  1206. *target++=TO_BASE64_IMAP(b);
  1207. if(target<targetLimit) {
  1208. b=(uint8_t)((c>>4)&0x3f);
  1209. *target++=TO_BASE64_IMAP(b);
  1210. if(offsets!=nullptr) {
  1211. *offsets++=sourceIndex;
  1212. *offsets++=sourceIndex++;
  1213. }
  1214. } else {
  1215. if(offsets!=nullptr) {
  1216. *offsets++=sourceIndex++;
  1217. }
  1218. b=(uint8_t)((c>>4)&0x3f);
  1219. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1220. cnv->charErrorBufferLength=1;
  1221. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1222. }
  1223. bits=(uint8_t)((c&15)<<2);
  1224. base64Counter=1;
  1225. break;
  1226. case 1:
  /* 4 kept bits + bits 15..14, then bits 13..8 and 7..2; bits 1..0 are kept */
  1227. b=(uint8_t)(bits|(c>>14));
  1228. *target++=TO_BASE64_IMAP(b);
  1229. if(target<targetLimit) {
  1230. b=(uint8_t)((c>>8)&0x3f);
  1231. *target++=TO_BASE64_IMAP(b);
  1232. if(target<targetLimit) {
  1233. b=(uint8_t)((c>>2)&0x3f);
  1234. *target++=TO_BASE64_IMAP(b);
  1235. if(offsets!=nullptr) {
  1236. *offsets++=sourceIndex;
  1237. *offsets++=sourceIndex;
  1238. *offsets++=sourceIndex++;
  1239. }
  1240. } else {
  1241. if(offsets!=nullptr) {
  1242. *offsets++=sourceIndex;
  1243. *offsets++=sourceIndex++;
  1244. }
  1245. b=(uint8_t)((c>>2)&0x3f);
  1246. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1247. cnv->charErrorBufferLength=1;
  1248. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1249. }
  1250. } else {
  1251. if(offsets!=nullptr) {
  1252. *offsets++=sourceIndex++;
  1253. }
  1254. b=(uint8_t)((c>>8)&0x3f);
  1255. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1256. b=(uint8_t)((c>>2)&0x3f);
  1257. cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
  1258. cnv->charErrorBufferLength=2;
  1259. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1260. }
  1261. bits=(uint8_t)((c&3)<<4);
  1262. base64Counter=2;
  1263. break;
  1264. case 2:
  /* 2 kept bits + bits 15..12, then bits 11..6 and 5..0; nothing is left over */
  1265. b=(uint8_t)(bits|(c>>12));
  1266. *target++=TO_BASE64_IMAP(b);
  1267. if(target<targetLimit) {
  1268. b=(uint8_t)((c>>6)&0x3f);
  1269. *target++=TO_BASE64_IMAP(b);
  1270. if(target<targetLimit) {
  1271. b=(uint8_t)(c&0x3f);
  1272. *target++=TO_BASE64_IMAP(b);
  1273. if(offsets!=nullptr) {
  1274. *offsets++=sourceIndex;
  1275. *offsets++=sourceIndex;
  1276. *offsets++=sourceIndex++;
  1277. }
  1278. } else {
  1279. if(offsets!=nullptr) {
  1280. *offsets++=sourceIndex;
  1281. *offsets++=sourceIndex++;
  1282. }
  1283. b=(uint8_t)(c&0x3f);
  1284. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1285. cnv->charErrorBufferLength=1;
  1286. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1287. }
  1288. } else {
  1289. if(offsets!=nullptr) {
  1290. *offsets++=sourceIndex++;
  1291. }
  1292. b=(uint8_t)((c>>6)&0x3f);
  1293. cnv->charErrorBuffer[0]=TO_BASE64_IMAP(b);
  1294. b=(uint8_t)(c&0x3f);
  1295. cnv->charErrorBuffer[1]=TO_BASE64_IMAP(b);
  1296. cnv->charErrorBufferLength=2;
  1297. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1298. }
  1299. bits=0;
  1300. base64Counter=0;
  1301. break;
  1302. default:
  1303. /* will never occur */
  1304. break;
  1305. }
  1306. }
  1307. } else {
  1308. /* target is full */
  1309. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1310. break;
  1311. }
  1312. }
  1313. }
  1314. if(pArgs->flush && source>=sourceLimit) {
  1315. /* flush remaining bits to the target */
  1316. if(!inDirectMode) {
  1317. if(base64Counter!=0) {
  1318. if(target<targetLimit) {
  1319. *target++=TO_BASE64_IMAP(bits);
  1320. if(offsets!=nullptr) {
  1321. *offsets++=sourceIndex-1;
  1322. }
  1323. } else {
  /* append after whatever is already stored in the error buffer */
  1324. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=TO_BASE64_IMAP(bits);
  1325. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1326. }
  1327. }
  1328. /* need to terminate with a minus */
  1329. if(target<targetLimit) {
  1330. *target++=MINUS;
  1331. if(offsets!=nullptr) {
  1332. *offsets++=sourceIndex-1;
  1333. }
  1334. } else {
  1335. cnv->charErrorBuffer[cnv->charErrorBufferLength++]=MINUS;
  1336. *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1337. }
  1338. }
  1339. /* reset the state for the next conversion */
  1340. cnv->fromUnicodeStatus=(cnv->fromUnicodeStatus&0xf0000000)|0x1000000; /* keep version, inDirectMode=true */
  1341. } else {
  1342. /* set the converter state back into UConverter */
  1343. cnv->fromUnicodeStatus=
  1344. (cnv->fromUnicodeStatus&0xf0000000)| /* keep version*/
  1345. ((uint32_t)inDirectMode<<24)|((uint32_t)base64Counter<<16)|(uint32_t)bits;
  1346. }
  1347. /* write back the updated pointers */
  1348. pArgs->source=source;
  1349. pArgs->target=(char *)target;
  1350. pArgs->offsets=offsets;
  1351. return;
  1352. }
  1353. U_CDECL_END
  /*
   * Function table for the IMAP mailbox-name converter.
   *
   * It uses _UTF7Open and _UTF7Reset, which are defined outside this section,
   * and installs the same WithOffsets function in both the plain and the
   * with-offsets position for each direction.
   *
   * NOTE(review): the initializer is positional; the slot names in the trailing
   * comments are presumed from the UConverterImpl declaration, which is not
   * visible in this section - confirm against the struct before relying on them.
   */
  1354. static const UConverterImpl _IMAPImpl={
  1355. UCNV_IMAP_MAILBOX, /* presumably: type */
  1356. nullptr, /* presumably: load */
  1357. nullptr, /* presumably: unload */
  1358. _UTF7Open, /* presumably: open */
  1359. nullptr, /* presumably: close */
  1360. _UTF7Reset, /* presumably: reset */
  1361. _IMAPToUnicodeWithOffsets, /* presumably: toUnicode */
  1362. _IMAPToUnicodeWithOffsets, /* presumably: toUnicodeWithOffsets */
  1363. _IMAPFromUnicodeWithOffsets, /* presumably: fromUnicode */
  1364. _IMAPFromUnicodeWithOffsets, /* presumably: fromUnicodeWithOffsets */
  1365. nullptr,
  1366. nullptr,
  1367. nullptr,
  1368. nullptr, /* we don't need writeSub() because we never call a callback at fromUnicode() */
  1369. nullptr,
  1370. ucnv_getCompleteUnicodeSet, /* presumably: getUnicodeSet */
  1371. nullptr,
  1372. nullptr
  1373. };
  /*
   * Static description of the IMAP mailbox-name converter: its name
   * "IMAP-mailbox-name", the UCNV_IBM / UCNV_IMAP_MAILBOX identifiers, the
   * value pair 1, 4 and the byte 0x3f, which the existing comment below says
   * is an unused subchar.
   *
   * NOTE(review): the initializer is positional; the field meanings in the
   * trailing comments are presumed from the UConverterStaticData declaration,
   * which is not visible in this section - confirm against the struct.
   */
  1374. static const UConverterStaticData _IMAPStaticData={
  1375. sizeof(UConverterStaticData),
  1376. "IMAP-mailbox-name",
  1377. 0, /* TODO CCSID for IMAP-mailbox-name */
  1378. UCNV_IBM, UCNV_IMAP_MAILBOX,
  1379. 1, 4, /* presumably: minimum and maximum bytes per character */
  1380. { 0x3f, 0, 0, 0 }, 1, /* the subchar is not used */
  1381. false, false,
  1382. 0,
  1383. 0,
  1384. { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1385. };
  /*
   * Shared-data object for the IMAP mailbox-name converter, built from
   * _IMAPStaticData and _IMAPImpl; unlike those two it is not declared static.
   */
  1386. const UConverterSharedData _IMAPData=
  1387. UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_IMAPStaticData, &_IMAPImpl);
  1388. #endif