uiter.cpp 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2002-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: uiter.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2002jan18
  16. * created by: Markus W. Scherer
  17. */
  18. #include "unicode/utypes.h"
  19. #include "unicode/ustring.h"
  20. #include "unicode/chariter.h"
  21. #include "unicode/rep.h"
  22. #include "unicode/uiter.h"
  23. #include "unicode/utf.h"
  24. #include "unicode/utf8.h"
  25. #include "unicode/utf16.h"
  26. #include "cstring.h"
  27. U_NAMESPACE_USE
  28. #define IS_EVEN(n) (((n)&1)==0)
  29. #define IS_POINTER_EVEN(p) IS_EVEN((size_t)p)
  30. U_CDECL_BEGIN
  31. /* No-Op UCharIterator implementation for illegal input --------------------- */
  32. static int32_t U_CALLCONV
  33. noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
  34. return 0;
  35. }
  36. static int32_t U_CALLCONV
  37. noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
  38. return 0;
  39. }
  40. static UBool U_CALLCONV
  41. noopHasNext(UCharIterator * /*iter*/) {
  42. return false;
  43. }
  44. static UChar32 U_CALLCONV
  45. noopCurrent(UCharIterator * /*iter*/) {
  46. return U_SENTINEL;
  47. }
  48. static uint32_t U_CALLCONV
  49. noopGetState(const UCharIterator * /*iter*/) {
  50. return UITER_NO_STATE;
  51. }
  52. static void U_CALLCONV
  53. noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
  54. *pErrorCode=U_UNSUPPORTED_ERROR;
  55. }
  56. static const UCharIterator noopIterator={
  57. 0, 0, 0, 0, 0, 0,
  58. noopGetIndex,
  59. noopMove,
  60. noopHasNext,
  61. noopHasNext,
  62. noopCurrent,
  63. noopCurrent,
  64. noopCurrent,
  65. nullptr,
  66. noopGetState,
  67. noopSetState
  68. };
  69. /* UCharIterator implementation for simple strings -------------------------- */
  70. /*
  71. * This is an implementation of a code unit (char16_t) iterator
  72. * for char16_t * strings.
  73. *
  74. * The UCharIterator.context field holds a pointer to the string.
  75. */
  76. static int32_t U_CALLCONV
  77. stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  78. switch(origin) {
  79. case UITER_ZERO:
  80. return 0;
  81. case UITER_START:
  82. return iter->start;
  83. case UITER_CURRENT:
  84. return iter->index;
  85. case UITER_LIMIT:
  86. return iter->limit;
  87. case UITER_LENGTH:
  88. return iter->length;
  89. default:
  90. /* not a valid origin */
  91. /* Should never get here! */
  92. return -1;
  93. }
  94. }
  95. static int32_t U_CALLCONV
  96. stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  97. int32_t pos;
  98. switch(origin) {
  99. case UITER_ZERO:
  100. pos=delta;
  101. break;
  102. case UITER_START:
  103. pos=iter->start+delta;
  104. break;
  105. case UITER_CURRENT:
  106. pos=iter->index+delta;
  107. break;
  108. case UITER_LIMIT:
  109. pos=iter->limit+delta;
  110. break;
  111. case UITER_LENGTH:
  112. pos=iter->length+delta;
  113. break;
  114. default:
  115. return -1; /* Error */
  116. }
  117. if(pos<iter->start) {
  118. pos=iter->start;
  119. } else if(pos>iter->limit) {
  120. pos=iter->limit;
  121. }
  122. return iter->index=pos;
  123. }
  124. static UBool U_CALLCONV
  125. stringIteratorHasNext(UCharIterator *iter) {
  126. return iter->index<iter->limit;
  127. }
  128. static UBool U_CALLCONV
  129. stringIteratorHasPrevious(UCharIterator *iter) {
  130. return iter->index>iter->start;
  131. }
  132. static UChar32 U_CALLCONV
  133. stringIteratorCurrent(UCharIterator *iter) {
  134. if(iter->index<iter->limit) {
  135. return ((const char16_t *)(iter->context))[iter->index];
  136. } else {
  137. return U_SENTINEL;
  138. }
  139. }
  140. static UChar32 U_CALLCONV
  141. stringIteratorNext(UCharIterator *iter) {
  142. if(iter->index<iter->limit) {
  143. return ((const char16_t *)(iter->context))[iter->index++];
  144. } else {
  145. return U_SENTINEL;
  146. }
  147. }
  148. static UChar32 U_CALLCONV
  149. stringIteratorPrevious(UCharIterator *iter) {
  150. if(iter->index>iter->start) {
  151. return ((const char16_t *)(iter->context))[--iter->index];
  152. } else {
  153. return U_SENTINEL;
  154. }
  155. }
  156. static uint32_t U_CALLCONV
  157. stringIteratorGetState(const UCharIterator *iter) {
  158. return (uint32_t)iter->index;
  159. }
  160. static void U_CALLCONV
  161. stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  162. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  163. /* do nothing */
  164. } else if(iter==nullptr) {
  165. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  166. } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
  167. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  168. } else {
  169. iter->index=(int32_t)state;
  170. }
  171. }
  172. static const UCharIterator stringIterator={
  173. 0, 0, 0, 0, 0, 0,
  174. stringIteratorGetIndex,
  175. stringIteratorMove,
  176. stringIteratorHasNext,
  177. stringIteratorHasPrevious,
  178. stringIteratorCurrent,
  179. stringIteratorNext,
  180. stringIteratorPrevious,
  181. nullptr,
  182. stringIteratorGetState,
  183. stringIteratorSetState
  184. };
  185. U_CAPI void U_EXPORT2
  186. uiter_setString(UCharIterator *iter, const char16_t *s, int32_t length) {
  187. if(iter!=0) {
  188. if(s!=0 && length>=-1) {
  189. *iter=stringIterator;
  190. iter->context=s;
  191. if(length>=0) {
  192. iter->length=length;
  193. } else {
  194. iter->length=u_strlen(s);
  195. }
  196. iter->limit=iter->length;
  197. } else {
  198. *iter=noopIterator;
  199. }
  200. }
  201. }
  202. /* UCharIterator implementation for UTF-16BE strings ------------------------ */
  203. /*
  204. * This is an implementation of a code unit (char16_t) iterator
  205. * for UTF-16BE strings, i.e., strings in byte-vectors where
  206. * each char16_t is stored as a big-endian pair of bytes.
  207. *
  208. * The UCharIterator.context field holds a pointer to the string.
  209. * Everything works just like with a normal char16_t iterator (uiter_setString),
  210. * except that UChars are assembled from byte pairs.
  211. */
  212. /* internal helper function */
  213. static inline UChar32
  214. utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
  215. const uint8_t *p=(const uint8_t *)iter->context;
  216. return ((char16_t)p[2*index]<<8)|(char16_t)p[2*index+1];
  217. }
  218. static UChar32 U_CALLCONV
  219. utf16BEIteratorCurrent(UCharIterator *iter) {
  220. int32_t index;
  221. if((index=iter->index)<iter->limit) {
  222. return utf16BEIteratorGet(iter, index);
  223. } else {
  224. return U_SENTINEL;
  225. }
  226. }
  227. static UChar32 U_CALLCONV
  228. utf16BEIteratorNext(UCharIterator *iter) {
  229. int32_t index;
  230. if((index=iter->index)<iter->limit) {
  231. iter->index=index+1;
  232. return utf16BEIteratorGet(iter, index);
  233. } else {
  234. return U_SENTINEL;
  235. }
  236. }
  237. static UChar32 U_CALLCONV
  238. utf16BEIteratorPrevious(UCharIterator *iter) {
  239. int32_t index;
  240. if((index=iter->index)>iter->start) {
  241. iter->index=--index;
  242. return utf16BEIteratorGet(iter, index);
  243. } else {
  244. return U_SENTINEL;
  245. }
  246. }
  247. static const UCharIterator utf16BEIterator={
  248. 0, 0, 0, 0, 0, 0,
  249. stringIteratorGetIndex,
  250. stringIteratorMove,
  251. stringIteratorHasNext,
  252. stringIteratorHasPrevious,
  253. utf16BEIteratorCurrent,
  254. utf16BEIteratorNext,
  255. utf16BEIteratorPrevious,
  256. nullptr,
  257. stringIteratorGetState,
  258. stringIteratorSetState
  259. };
  260. /*
  261. * Count the number of UChars in a UTF-16BE string before a terminating char16_t NUL,
  262. * i.e., before a pair of 0 bytes where the first 0 byte is at an even
  263. * offset from s.
  264. */
  265. static int32_t
  266. utf16BE_strlen(const char *s) {
  267. if(IS_POINTER_EVEN(s)) {
  268. /*
  269. * even-aligned, call u_strlen(s)
  270. * we are probably on a little-endian machine, but searching for char16_t NUL
  271. * does not care about endianness
  272. */
  273. return u_strlen((const char16_t *)s);
  274. } else {
  275. /* odd-aligned, search for pair of 0 bytes */
  276. const char *p=s;
  277. while(!(*p==0 && p[1]==0)) {
  278. p+=2;
  279. }
  280. return (int32_t)((p-s)/2);
  281. }
  282. }
  283. U_CAPI void U_EXPORT2
  284. uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
  285. if(iter!=nullptr) {
  286. /* allow only even-length strings (the input length counts bytes) */
  287. if(s!=nullptr && (length==-1 || (length>=0 && IS_EVEN(length)))) {
  288. /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
  289. length>>=1;
  290. if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
  291. /* big-endian machine and 2-aligned UTF-16BE string: use normal char16_t iterator */
  292. uiter_setString(iter, (const char16_t *)s, length);
  293. return;
  294. }
  295. *iter=utf16BEIterator;
  296. iter->context=s;
  297. if(length>=0) {
  298. iter->length=length;
  299. } else {
  300. iter->length=utf16BE_strlen(s);
  301. }
  302. iter->limit=iter->length;
  303. } else {
  304. *iter=noopIterator;
  305. }
  306. }
  307. }
  308. /* UCharIterator wrapper around CharacterIterator --------------------------- */
  309. /*
  310. * This is wrapper code around a C++ CharacterIterator to
  311. * look like a C UCharIterator.
  312. *
  313. * The UCharIterator.context field holds a pointer to the CharacterIterator.
  314. */
  315. static int32_t U_CALLCONV
  316. characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  317. switch(origin) {
  318. case UITER_ZERO:
  319. return 0;
  320. case UITER_START:
  321. return ((CharacterIterator *)(iter->context))->startIndex();
  322. case UITER_CURRENT:
  323. return ((CharacterIterator *)(iter->context))->getIndex();
  324. case UITER_LIMIT:
  325. return ((CharacterIterator *)(iter->context))->endIndex();
  326. case UITER_LENGTH:
  327. return ((CharacterIterator *)(iter->context))->getLength();
  328. default:
  329. /* not a valid origin */
  330. /* Should never get here! */
  331. return -1;
  332. }
  333. }
  334. static int32_t U_CALLCONV
  335. characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  336. switch(origin) {
  337. case UITER_ZERO:
  338. ((CharacterIterator *)(iter->context))->setIndex(delta);
  339. return ((CharacterIterator *)(iter->context))->getIndex();
  340. case UITER_START:
  341. case UITER_CURRENT:
  342. case UITER_LIMIT:
  343. return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
  344. case UITER_LENGTH:
  345. ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
  346. return ((CharacterIterator *)(iter->context))->getIndex();
  347. default:
  348. /* not a valid origin */
  349. /* Should never get here! */
  350. return -1;
  351. }
  352. }
  353. static UBool U_CALLCONV
  354. characterIteratorHasNext(UCharIterator *iter) {
  355. return ((CharacterIterator *)(iter->context))->hasNext();
  356. }
  357. static UBool U_CALLCONV
  358. characterIteratorHasPrevious(UCharIterator *iter) {
  359. return ((CharacterIterator *)(iter->context))->hasPrevious();
  360. }
  361. static UChar32 U_CALLCONV
  362. characterIteratorCurrent(UCharIterator *iter) {
  363. UChar32 c;
  364. c=((CharacterIterator *)(iter->context))->current();
  365. if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
  366. return c;
  367. } else {
  368. return U_SENTINEL;
  369. }
  370. }
  371. static UChar32 U_CALLCONV
  372. characterIteratorNext(UCharIterator *iter) {
  373. if(((CharacterIterator *)(iter->context))->hasNext()) {
  374. return ((CharacterIterator *)(iter->context))->nextPostInc();
  375. } else {
  376. return U_SENTINEL;
  377. }
  378. }
  379. static UChar32 U_CALLCONV
  380. characterIteratorPrevious(UCharIterator *iter) {
  381. if(((CharacterIterator *)(iter->context))->hasPrevious()) {
  382. return ((CharacterIterator *)(iter->context))->previous();
  383. } else {
  384. return U_SENTINEL;
  385. }
  386. }
  387. static uint32_t U_CALLCONV
  388. characterIteratorGetState(const UCharIterator *iter) {
  389. return ((CharacterIterator *)(iter->context))->getIndex();
  390. }
  391. static void U_CALLCONV
  392. characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  393. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  394. /* do nothing */
  395. } else if(iter==nullptr || iter->context==nullptr) {
  396. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  397. } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
  398. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  399. } else {
  400. ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
  401. }
  402. }
  403. static const UCharIterator characterIteratorWrapper={
  404. 0, 0, 0, 0, 0, 0,
  405. characterIteratorGetIndex,
  406. characterIteratorMove,
  407. characterIteratorHasNext,
  408. characterIteratorHasPrevious,
  409. characterIteratorCurrent,
  410. characterIteratorNext,
  411. characterIteratorPrevious,
  412. nullptr,
  413. characterIteratorGetState,
  414. characterIteratorSetState
  415. };
  416. U_CAPI void U_EXPORT2
  417. uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
  418. if(iter!=0) {
  419. if(charIter!=0) {
  420. *iter=characterIteratorWrapper;
  421. iter->context=charIter;
  422. } else {
  423. *iter=noopIterator;
  424. }
  425. }
  426. }
  427. /* UCharIterator wrapper around Replaceable --------------------------------- */
  428. /*
  429. * This is an implementation of a code unit (char16_t) iterator
  430. * based on a Replaceable object.
  431. *
  432. * The UCharIterator.context field holds a pointer to the Replaceable.
  433. * UCharIterator.length and UCharIterator.index hold Replaceable.length()
  434. * and the iteration index.
  435. */
  436. static UChar32 U_CALLCONV
  437. replaceableIteratorCurrent(UCharIterator *iter) {
  438. if(iter->index<iter->limit) {
  439. return ((Replaceable *)(iter->context))->charAt(iter->index);
  440. } else {
  441. return U_SENTINEL;
  442. }
  443. }
  444. static UChar32 U_CALLCONV
  445. replaceableIteratorNext(UCharIterator *iter) {
  446. if(iter->index<iter->limit) {
  447. return ((Replaceable *)(iter->context))->charAt(iter->index++);
  448. } else {
  449. return U_SENTINEL;
  450. }
  451. }
  452. static UChar32 U_CALLCONV
  453. replaceableIteratorPrevious(UCharIterator *iter) {
  454. if(iter->index>iter->start) {
  455. return ((Replaceable *)(iter->context))->charAt(--iter->index);
  456. } else {
  457. return U_SENTINEL;
  458. }
  459. }
  460. static const UCharIterator replaceableIterator={
  461. 0, 0, 0, 0, 0, 0,
  462. stringIteratorGetIndex,
  463. stringIteratorMove,
  464. stringIteratorHasNext,
  465. stringIteratorHasPrevious,
  466. replaceableIteratorCurrent,
  467. replaceableIteratorNext,
  468. replaceableIteratorPrevious,
  469. nullptr,
  470. stringIteratorGetState,
  471. stringIteratorSetState
  472. };
  473. U_CAPI void U_EXPORT2
  474. uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
  475. if(iter!=0) {
  476. if(rep!=0) {
  477. *iter=replaceableIterator;
  478. iter->context=rep;
  479. iter->limit=iter->length=rep->length();
  480. } else {
  481. *iter=noopIterator;
  482. }
  483. }
  484. }
  485. /* UCharIterator implementation for UTF-8 strings --------------------------- */
  486. /*
  487. * Possible, probably necessary only for an implementation for arbitrary
  488. * converters:
  489. * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
  490. * This would require to turn reservedFn into a close function and
  491. * to introduce a uiter_close(iter).
  492. */
  493. #define UITER_CNV_CAPACITY 16
/*
 * Minimal implementation:
 * Maintain a single-char16_t buffer for an additional surrogate.
 * The caller must not modify start and limit because they are used internally.
 *
 * Use UCharIterator fields as follows:
 *   context        pointer to UTF-8 string
 *   length         UTF-16 length of the string; -1 until lazy evaluation
 *   start          current UTF-8 index
 *   index          current UTF-16 index; may be -1="unknown" after setState()
 *   limit          UTF-8 length of the string
 *   reservedField  supplementary code point
 *
 * Since UCharIterator delivers 16-bit code units, the iteration can be
 * currently in the middle of the byte sequence for a supplementary code point.
 * In this case, reservedField will contain that code point and start will
 * point to after the corresponding byte sequence. The UTF-16 index will be
 * one less than what it would otherwise be corresponding to the UTF-8 index.
 * Otherwise, reservedField will be 0.
 */
  514. /*
  515. * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
  516. * Add implementations that do not call strlen() for iteration but check for NUL.
  517. */
  518. static int32_t U_CALLCONV
  519. utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  520. switch(origin) {
  521. case UITER_ZERO:
  522. case UITER_START:
  523. return 0;
  524. case UITER_CURRENT:
  525. if(iter->index<0) {
  526. /* the current UTF-16 index is unknown after setState(), count from the beginning */
  527. const uint8_t *s;
  528. UChar32 c;
  529. int32_t i, limit, index;
  530. s=(const uint8_t *)iter->context;
  531. i=index=0;
  532. limit=iter->start; /* count up to the UTF-8 index */
  533. while(i<limit) {
  534. U8_NEXT_OR_FFFD(s, i, limit, c);
  535. index+=U16_LENGTH(c);
  536. }
  537. iter->start=i; /* just in case setState() did not get us to a code point boundary */
  538. if(i==iter->limit) {
  539. iter->length=index; /* in case it was <0 or wrong */
  540. }
  541. if(iter->reservedField!=0) {
  542. --index; /* we are in the middle of a supplementary code point */
  543. }
  544. iter->index=index;
  545. }
  546. return iter->index;
  547. case UITER_LIMIT:
  548. case UITER_LENGTH:
  549. if(iter->length<0) {
  550. const uint8_t *s;
  551. UChar32 c;
  552. int32_t i, limit, length;
  553. s=(const uint8_t *)iter->context;
  554. if(iter->index<0) {
  555. /*
  556. * the current UTF-16 index is unknown after setState(),
  557. * we must first count from the beginning to here
  558. */
  559. i=length=0;
  560. limit=iter->start;
  561. /* count from the beginning to the current index */
  562. while(i<limit) {
  563. U8_NEXT_OR_FFFD(s, i, limit, c);
  564. length+=U16_LENGTH(c);
  565. }
  566. /* assume i==limit==iter->start, set the UTF-16 index */
  567. iter->start=i; /* just in case setState() did not get us to a code point boundary */
  568. iter->index= iter->reservedField!=0 ? length-1 : length;
  569. } else {
  570. i=iter->start;
  571. length=iter->index;
  572. if(iter->reservedField!=0) {
  573. ++length;
  574. }
  575. }
  576. /* count from the current index to the end */
  577. limit=iter->limit;
  578. while(i<limit) {
  579. U8_NEXT_OR_FFFD(s, i, limit, c);
  580. length+=U16_LENGTH(c);
  581. }
  582. iter->length=length;
  583. }
  584. return iter->length;
  585. default:
  586. /* not a valid origin */
  587. /* Should never get here! */
  588. return -1;
  589. }
  590. }
  591. static int32_t U_CALLCONV
  592. utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) UPRV_NO_SANITIZE_UNDEFINED {
  593. const uint8_t *s;
  594. UChar32 c;
  595. int32_t pos; /* requested UTF-16 index */
  596. int32_t i; /* UTF-8 index */
  597. UBool havePos;
  598. /* calculate the requested UTF-16 index */
  599. switch(origin) {
  600. case UITER_ZERO:
  601. case UITER_START:
  602. pos=delta;
  603. havePos=true;
  604. /* iter->index<0 (unknown) is possible */
  605. break;
  606. case UITER_CURRENT:
  607. if(iter->index>=0) {
  608. pos=iter->index+delta;
  609. havePos=true;
  610. } else {
  611. /* the current UTF-16 index is unknown after setState(), use only delta */
  612. pos=0;
  613. havePos=false;
  614. }
  615. break;
  616. case UITER_LIMIT:
  617. case UITER_LENGTH:
  618. if(iter->length>=0) {
  619. pos=iter->length+delta;
  620. havePos=true;
  621. } else {
  622. /* pin to the end, avoid counting the length */
  623. iter->index=-1;
  624. iter->start=iter->limit;
  625. iter->reservedField=0;
  626. if(delta>=0) {
  627. return UITER_UNKNOWN_INDEX;
  628. } else {
  629. /* the current UTF-16 index is unknown, use only delta */
  630. pos=0;
  631. havePos=false;
  632. }
  633. }
  634. break;
  635. default:
  636. return -1; /* Error */
  637. }
  638. if(havePos) {
  639. /* shortcuts: pinning to the edges of the string */
  640. if(pos<=0) {
  641. iter->index=iter->start=iter->reservedField=0;
  642. return 0;
  643. } else if(iter->length>=0 && pos>=iter->length) {
  644. iter->index=iter->length;
  645. iter->start=iter->limit;
  646. iter->reservedField=0;
  647. return iter->index;
  648. }
  649. /* minimize the number of U8_NEXT/PREV operations */
  650. if(iter->index<0 || pos<iter->index/2) {
  651. /* go forward from the start instead of backward from the current index */
  652. iter->index=iter->start=iter->reservedField=0;
  653. } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
  654. /*
  655. * if we have the UTF-16 index and length and the new position is
  656. * closer to the end than the current index,
  657. * then go backward from the end instead of forward from the current index
  658. */
  659. iter->index=iter->length;
  660. iter->start=iter->limit;
  661. iter->reservedField=0;
  662. }
  663. delta=pos-iter->index;
  664. if(delta==0) {
  665. return iter->index; /* nothing to do */
  666. }
  667. } else {
  668. /* move relative to unknown UTF-16 index */
  669. if(delta==0) {
  670. return UITER_UNKNOWN_INDEX; /* nothing to do */
  671. } else if(-delta>=iter->start) {
  672. /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
  673. iter->index=iter->start=iter->reservedField=0;
  674. return 0;
  675. } else if(delta>=(iter->limit-iter->start)) {
  676. /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
  677. iter->index=iter->length; /* may or may not be <0 (unknown) */
  678. iter->start=iter->limit;
  679. iter->reservedField=0;
  680. return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
  681. }
  682. }
  683. /* delta!=0 */
  684. /* move towards the requested position, pin to the edges of the string */
  685. s=(const uint8_t *)iter->context;
  686. pos=iter->index; /* could be <0 (unknown) */
  687. i=iter->start;
  688. if(delta>0) {
  689. /* go forward */
  690. int32_t limit=iter->limit;
  691. if(iter->reservedField!=0) {
  692. iter->reservedField=0;
  693. ++pos;
  694. --delta;
  695. }
  696. while(delta>0 && i<limit) {
  697. U8_NEXT_OR_FFFD(s, i, limit, c);
  698. if(c<=0xffff) {
  699. ++pos;
  700. --delta;
  701. } else if(delta>=2) {
  702. pos+=2;
  703. delta-=2;
  704. } else /* delta==1 */ {
  705. /* stop in the middle of a supplementary code point */
  706. iter->reservedField=c;
  707. ++pos;
  708. break; /* delta=0; */
  709. }
  710. }
  711. if(i==limit) {
  712. if(iter->length<0 && iter->index>=0) {
  713. iter->length= iter->reservedField==0 ? pos : pos+1;
  714. } else if(iter->index<0 && iter->length>=0) {
  715. iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
  716. }
  717. }
  718. } else /* delta<0 */ {
  719. /* go backward */
  720. if(iter->reservedField!=0) {
  721. iter->reservedField=0;
  722. i-=4; /* we stayed behind the supplementary code point; go before it now */
  723. --pos;
  724. ++delta;
  725. }
  726. while(delta<0 && i>0) {
  727. U8_PREV_OR_FFFD(s, 0, i, c);
  728. if(c<=0xffff) {
  729. --pos;
  730. ++delta;
  731. } else if(delta<=-2) {
  732. pos-=2;
  733. delta+=2;
  734. } else /* delta==-1 */ {
  735. /* stop in the middle of a supplementary code point */
  736. i+=4; /* back to behind this supplementary code point for consistent state */
  737. iter->reservedField=c;
  738. --pos;
  739. break; /* delta=0; */
  740. }
  741. }
  742. }
  743. iter->start=i;
  744. if(iter->index>=0) {
  745. return iter->index=pos;
  746. } else {
  747. /* we started with index<0 (unknown) so pos is bogus */
  748. if(i<=1) {
  749. return iter->index=i; /* reached the beginning */
  750. } else {
  751. /* we still don't know the UTF-16 index */
  752. return UITER_UNKNOWN_INDEX;
  753. }
  754. }
  755. }
  756. static UBool U_CALLCONV
  757. utf8IteratorHasNext(UCharIterator *iter) {
  758. return iter->start<iter->limit || iter->reservedField!=0;
  759. }
  760. static UBool U_CALLCONV
  761. utf8IteratorHasPrevious(UCharIterator *iter) {
  762. return iter->start>0;
  763. }
  764. static UChar32 U_CALLCONV
  765. utf8IteratorCurrent(UCharIterator *iter) {
  766. if(iter->reservedField!=0) {
  767. return U16_TRAIL(iter->reservedField);
  768. } else if(iter->start<iter->limit) {
  769. const uint8_t *s=(const uint8_t *)iter->context;
  770. UChar32 c;
  771. int32_t i=iter->start;
  772. U8_NEXT_OR_FFFD(s, i, iter->limit, c);
  773. if(c<=0xffff) {
  774. return c;
  775. } else {
  776. return U16_LEAD(c);
  777. }
  778. } else {
  779. return U_SENTINEL;
  780. }
  781. }
  782. static UChar32 U_CALLCONV
  783. utf8IteratorNext(UCharIterator *iter) {
  784. int32_t index;
  785. if(iter->reservedField!=0) {
  786. char16_t trail=U16_TRAIL(iter->reservedField);
  787. iter->reservedField=0;
  788. if((index=iter->index)>=0) {
  789. iter->index=index+1;
  790. }
  791. return trail;
  792. } else if(iter->start<iter->limit) {
  793. const uint8_t *s=(const uint8_t *)iter->context;
  794. UChar32 c;
  795. U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
  796. if((index=iter->index)>=0) {
  797. iter->index=++index;
  798. if(iter->length<0 && iter->start==iter->limit) {
  799. iter->length= c<=0xffff ? index : index+1;
  800. }
  801. } else if(iter->start==iter->limit && iter->length>=0) {
  802. iter->index= c<=0xffff ? iter->length : iter->length-1;
  803. }
  804. if(c<=0xffff) {
  805. return c;
  806. } else {
  807. iter->reservedField=c;
  808. return U16_LEAD(c);
  809. }
  810. } else {
  811. return U_SENTINEL;
  812. }
  813. }
  814. static UChar32 U_CALLCONV
  815. utf8IteratorPrevious(UCharIterator *iter) {
  816. int32_t index;
  817. if(iter->reservedField!=0) {
  818. char16_t lead=U16_LEAD(iter->reservedField);
  819. iter->reservedField=0;
  820. iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
  821. if((index=iter->index)>0) {
  822. iter->index=index-1;
  823. }
  824. return lead;
  825. } else if(iter->start>0) {
  826. const uint8_t *s=(const uint8_t *)iter->context;
  827. UChar32 c;
  828. U8_PREV_OR_FFFD(s, 0, iter->start, c);
  829. if((index=iter->index)>0) {
  830. iter->index=index-1;
  831. } else if(iter->start<=1) {
  832. iter->index= c<=0xffff ? iter->start : iter->start+1;
  833. }
  834. if(c<=0xffff) {
  835. return c;
  836. } else {
  837. iter->start+=4; /* back to behind this supplementary code point for consistent state */
  838. iter->reservedField=c;
  839. return U16_TRAIL(c);
  840. }
  841. } else {
  842. return U_SENTINEL;
  843. }
  844. }
  845. static uint32_t U_CALLCONV
  846. utf8IteratorGetState(const UCharIterator *iter) {
  847. uint32_t state=(uint32_t)(iter->start<<1);
  848. if(iter->reservedField!=0) {
  849. state|=1;
  850. }
  851. return state;
  852. }
  853. static void U_CALLCONV
  854. utf8IteratorSetState(UCharIterator *iter,
  855. uint32_t state,
  856. UErrorCode *pErrorCode)
  857. {
  858. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  859. /* do nothing */
  860. } else if(iter==nullptr) {
  861. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  862. } else if(state==utf8IteratorGetState(iter)) {
  863. /* setting to the current state: no-op */
  864. } else {
  865. int32_t index=(int32_t)(state>>1); /* UTF-8 index */
  866. state&=1; /* 1 if in surrogate pair, must be index>=4 */
  867. if((state==0 ? index<0 : index<4) || iter->limit<index) {
  868. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  869. } else {
  870. iter->start=index; /* restore UTF-8 byte index */
  871. if(index<=1) {
  872. iter->index=index;
  873. } else {
  874. iter->index=-1; /* unknown UTF-16 index */
  875. }
  876. if(state==0) {
  877. iter->reservedField=0;
  878. } else {
  879. /* verified index>=4 above */
  880. UChar32 c;
  881. U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
  882. if(c<=0xffff) {
  883. *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  884. } else {
  885. iter->reservedField=c;
  886. }
  887. }
  888. }
  889. }
  890. }
  891. static const UCharIterator utf8Iterator={
  892. 0, 0, 0, 0, 0, 0,
  893. utf8IteratorGetIndex,
  894. utf8IteratorMove,
  895. utf8IteratorHasNext,
  896. utf8IteratorHasPrevious,
  897. utf8IteratorCurrent,
  898. utf8IteratorNext,
  899. utf8IteratorPrevious,
  900. nullptr,
  901. utf8IteratorGetState,
  902. utf8IteratorSetState
  903. };
  904. U_CAPI void U_EXPORT2
  905. uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
  906. if(iter!=0) {
  907. if(s!=0 && length>=-1) {
  908. *iter=utf8Iterator;
  909. iter->context=s;
  910. if(length>=0) {
  911. iter->limit=length;
  912. } else {
  913. iter->limit=(int32_t)uprv_strlen(s);
  914. }
  915. iter->length= iter->limit<=1 ? iter->limit : -1;
  916. } else {
  917. *iter=noopIterator;
  918. }
  919. }
  920. }
  921. /* Helper functions --------------------------------------------------------- */
  922. U_CAPI UChar32 U_EXPORT2
  923. uiter_current32(UCharIterator *iter) {
  924. UChar32 c, c2;
  925. c=iter->current(iter);
  926. if(U16_IS_SURROGATE(c)) {
  927. if(U16_IS_SURROGATE_LEAD(c)) {
  928. /*
  929. * go to the next code unit
  930. * we know that we are not at the limit because c!=U_SENTINEL
  931. */
  932. iter->move(iter, 1, UITER_CURRENT);
  933. if(U16_IS_TRAIL(c2=iter->current(iter))) {
  934. c=U16_GET_SUPPLEMENTARY(c, c2);
  935. }
  936. /* undo index movement */
  937. iter->move(iter, -1, UITER_CURRENT);
  938. } else {
  939. if(U16_IS_LEAD(c2=iter->previous(iter))) {
  940. c=U16_GET_SUPPLEMENTARY(c2, c);
  941. }
  942. if(c2>=0) {
  943. /* undo index movement */
  944. iter->move(iter, 1, UITER_CURRENT);
  945. }
  946. }
  947. }
  948. return c;
  949. }
  950. U_CAPI UChar32 U_EXPORT2
  951. uiter_next32(UCharIterator *iter) {
  952. UChar32 c, c2;
  953. c=iter->next(iter);
  954. if(U16_IS_LEAD(c)) {
  955. if(U16_IS_TRAIL(c2=iter->next(iter))) {
  956. c=U16_GET_SUPPLEMENTARY(c, c2);
  957. } else if(c2>=0) {
  958. /* unmatched first surrogate, undo index movement */
  959. iter->move(iter, -1, UITER_CURRENT);
  960. }
  961. }
  962. return c;
  963. }
  964. U_CAPI UChar32 U_EXPORT2
  965. uiter_previous32(UCharIterator *iter) {
  966. UChar32 c, c2;
  967. c=iter->previous(iter);
  968. if(U16_IS_TRAIL(c)) {
  969. if(U16_IS_LEAD(c2=iter->previous(iter))) {
  970. c=U16_GET_SUPPLEMENTARY(c2, c);
  971. } else if(c2>=0) {
  972. /* unmatched second surrogate, undo index movement */
  973. iter->move(iter, 1, UITER_CURRENT);
  974. }
  975. }
  976. return c;
  977. }
  978. U_CAPI uint32_t U_EXPORT2
  979. uiter_getState(const UCharIterator *iter) {
  980. if(iter==nullptr || iter->getState==nullptr) {
  981. return UITER_NO_STATE;
  982. } else {
  983. return iter->getState(iter);
  984. }
  985. }
  986. U_CAPI void U_EXPORT2
  987. uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  988. if(pErrorCode==nullptr || U_FAILURE(*pErrorCode)) {
  989. /* do nothing */
  990. } else if(iter==nullptr) {
  991. *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  992. } else if(iter->setState==nullptr) {
  993. *pErrorCode=U_UNSUPPORTED_ERROR;
  994. } else {
  995. iter->setState(iter, state, pErrorCode);
  996. }
  997. }
  998. U_CDECL_END