bmpset.cpp 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. ******************************************************************************
  5. *
  6. * Copyright (C) 2007-2012, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. ******************************************************************************
  10. * file name: bmpset.cpp
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2007jan29
  16. * created by: Markus W. Scherer
  17. */
  18. #include "unicode/utypes.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/utf8.h"
  21. #include "unicode/utf16.h"
  22. #include "cmemory.h"
  23. #include "bmpset.h"
  24. #include "uassert.h"
  25. U_NAMESPACE_BEGIN
  26. BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
  27. list(parentList), listLength(parentListLength) {
  28. uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
  29. uprv_memset(table7FF, 0, sizeof(table7FF));
  30. uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
  31. /*
  32. * Set the list indexes for binary searches for
  33. * U+0800, U+1000, U+2000, .., U+F000, U+10000.
  34. * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
  35. * looked up in the bit tables.
  36. * The last pair of indexes is for finding supplementary code points.
  37. */
  38. list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
  39. int32_t i;
  40. for(i=1; i<=0x10; ++i) {
  41. list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
  42. }
  43. list4kStarts[0x11]=listLength-1;
  44. containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
  45. initBits();
  46. overrideIllegal();
  47. }
  48. BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
  49. containsFFFD(otherBMPSet.containsFFFD),
  50. list(newParentList), listLength(newParentListLength) {
  51. uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
  52. uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
  53. uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
  54. uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
  55. }
  56. BMPSet::~BMPSet() {
  57. }
  58. /*
  59. * Set bits in a bit rectangle in "vertical" bit organization.
  60. * start<limit<=0x800
  61. */
  62. static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
  63. U_ASSERT(start<limit);
  64. U_ASSERT(limit<=0x800);
  65. int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
  66. int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
  67. // Set one bit indicating an all-one block.
  68. uint32_t bits=(uint32_t)1<<lead;
  69. if((start+1)==limit) { // Single-character shortcut.
  70. table[trail]|=bits;
  71. return;
  72. }
  73. int32_t limitLead=limit>>6;
  74. int32_t limitTrail=limit&0x3f;
  75. if(lead==limitLead) {
  76. // Partial vertical bit column.
  77. while(trail<limitTrail) {
  78. table[trail++]|=bits;
  79. }
  80. } else {
  81. // Partial vertical bit column,
  82. // followed by a bit rectangle,
  83. // followed by another partial vertical bit column.
  84. if(trail>0) {
  85. do {
  86. table[trail++]|=bits;
  87. } while(trail<64);
  88. ++lead;
  89. }
  90. if(lead<limitLead) {
  91. bits=~(((unsigned)1<<lead)-1);
  92. if(limitLead<0x20) {
  93. bits&=((unsigned)1<<limitLead)-1;
  94. }
  95. for(trail=0; trail<64; ++trail) {
  96. table[trail]|=bits;
  97. }
  98. }
  99. // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
  100. // In that case, bits=1<<limitLead is undefined but the bits value
  101. // is not used because trail<limitTrail is already false.
  102. bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
  103. for(trail=0; trail<limitTrail; ++trail) {
  104. table[trail]|=bits;
  105. }
  106. }
  107. }
  108. void BMPSet::initBits() {
  109. UChar32 start, limit;
  110. int32_t listIndex=0;
  111. // Set latin1Contains[].
  112. do {
  113. start=list[listIndex++];
  114. if(listIndex<listLength) {
  115. limit=list[listIndex++];
  116. } else {
  117. limit=0x110000;
  118. }
  119. if(start>=0x100) {
  120. break;
  121. }
  122. do {
  123. latin1Contains[start++]=1;
  124. } while(start<limit && start<0x100);
  125. } while(limit<=0x100);
  126. // Find the first range overlapping with (or after) 80..FF again,
  127. // to include them in table7FF as well.
  128. for(listIndex=0;;) {
  129. start=list[listIndex++];
  130. if(listIndex<listLength) {
  131. limit=list[listIndex++];
  132. } else {
  133. limit=0x110000;
  134. }
  135. if(limit>0x80) {
  136. if(start<0x80) {
  137. start=0x80;
  138. }
  139. break;
  140. }
  141. }
  142. // Set table7FF[].
  143. while(start<0x800) {
  144. set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
  145. if(limit>0x800) {
  146. start=0x800;
  147. break;
  148. }
  149. start=list[listIndex++];
  150. if(listIndex<listLength) {
  151. limit=list[listIndex++];
  152. } else {
  153. limit=0x110000;
  154. }
  155. }
  156. // Set bmpBlockBits[].
  157. int32_t minStart=0x800;
  158. while(start<0x10000) {
  159. if(limit>0x10000) {
  160. limit=0x10000;
  161. }
  162. if(start<minStart) {
  163. start=minStart;
  164. }
  165. if(start<limit) { // Else: Another range entirely in a known mixed-value block.
  166. if(start&0x3f) {
  167. // Mixed-value block of 64 code points.
  168. start>>=6;
  169. bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
  170. start=(start+1)<<6; // Round up to the next block boundary.
  171. minStart=start; // Ignore further ranges in this block.
  172. }
  173. if(start<limit) {
  174. if(start<(limit&~0x3f)) {
  175. // Multiple all-ones blocks of 64 code points each.
  176. set32x64Bits(bmpBlockBits, start>>6, limit>>6);
  177. }
  178. if(limit&0x3f) {
  179. // Mixed-value block of 64 code points.
  180. limit>>=6;
  181. bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
  182. limit=(limit+1)<<6; // Round up to the next block boundary.
  183. minStart=limit; // Ignore further ranges in this block.
  184. }
  185. }
  186. }
  187. if(limit==0x10000) {
  188. break;
  189. }
  190. start=list[listIndex++];
  191. if(listIndex<listLength) {
  192. limit=list[listIndex++];
  193. } else {
  194. limit=0x110000;
  195. }
  196. }
  197. }
  198. /*
  199. * Override some bits and bytes to the result of contains(FFFD)
  200. * for faster validity checking at runtime.
  201. * No need to set 0 values where they were reset to 0 in the constructor
  202. * and not modified by initBits().
  203. * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
  204. * Need to set 0 values for surrogates D800..DFFF.
  205. */
  206. void BMPSet::overrideIllegal() {
  207. uint32_t bits, mask;
  208. int32_t i;
  209. if(containsFFFD) {
  210. bits=3; // Lead bytes 0xC0 and 0xC1.
  211. for(i=0; i<64; ++i) {
  212. table7FF[i]|=bits;
  213. }
  214. bits=1; // Lead byte 0xE0.
  215. for(i=0; i<32; ++i) { // First half of 4k block.
  216. bmpBlockBits[i]|=bits;
  217. }
  218. mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
  219. bits=1<<0xd;
  220. for(i=32; i<64; ++i) { // Second half of 4k block.
  221. bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
  222. }
  223. } else {
  224. mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
  225. for(i=32; i<64; ++i) { // Second half of 4k block.
  226. bmpBlockBits[i]&=mask;
  227. }
  228. }
  229. }
  230. int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
  231. /* Examples:
  232. findCodePoint(c)
  233. set list[] c=0 1 3 4 7 8
  234. === ============== ===========
  235. [] [110000] 0 0 0 0 0 0
  236. [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
  237. [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
  238. [:Any:] [0, 110000] 1 1 1 1 1 1
  239. */
  240. // Return the smallest i such that c < list[i]. Assume
  241. // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
  242. if (c < list[lo])
  243. return lo;
  244. // High runner test. c is often after the last range, so an
  245. // initial check for this condition pays off.
  246. if (lo >= hi || c >= list[hi-1])
  247. return hi;
  248. // invariant: c >= list[lo]
  249. // invariant: c < list[hi]
  250. for (;;) {
  251. int32_t i = (lo + hi) >> 1;
  252. if (i == lo) {
  253. break; // Found!
  254. } else if (c < list[i]) {
  255. hi = i;
  256. } else {
  257. lo = i;
  258. }
  259. }
  260. return hi;
  261. }
  262. UBool
  263. BMPSet::contains(UChar32 c) const {
  264. if((uint32_t)c<=0xff) {
  265. return (UBool)latin1Contains[c];
  266. } else if((uint32_t)c<=0x7ff) {
  267. return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
  268. } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
  269. int lead=c>>12;
  270. uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
  271. if(twoBits<=1) {
  272. // All 64 code points with the same bits 15..6
  273. // are either in the set or not.
  274. return (UBool)twoBits;
  275. } else {
  276. // Look up the code point in its 4k block of code points.
  277. return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
  278. }
  279. } else if((uint32_t)c<=0x10ffff) {
  280. // surrogate or supplementary code point
  281. return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
  282. } else {
  283. // Out-of-range code points get false, consistent with long-standing
  284. // behavior of UnicodeSet::contains(c).
  285. return false;
  286. }
  287. }
  288. /*
  289. * Check for sufficient length for trail unit for each surrogate pair.
  290. * Handle single surrogates as surrogate code points as usual in ICU.
  291. */
  292. const char16_t *
  293. BMPSet::span(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
  294. char16_t c, c2;
  295. if(spanCondition) {
  296. // span
  297. do {
  298. c=*s;
  299. if(c<=0xff) {
  300. if(!latin1Contains[c]) {
  301. break;
  302. }
  303. } else if(c<=0x7ff) {
  304. if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
  305. break;
  306. }
  307. } else if(c<0xd800 || c>=0xe000) {
  308. int lead=c>>12;
  309. uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
  310. if(twoBits<=1) {
  311. // All 64 code points with the same bits 15..6
  312. // are either in the set or not.
  313. if(twoBits==0) {
  314. break;
  315. }
  316. } else {
  317. // Look up the code point in its 4k block of code points.
  318. if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
  319. break;
  320. }
  321. }
  322. } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
  323. // surrogate code point
  324. if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
  325. break;
  326. }
  327. } else {
  328. // surrogate pair
  329. if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
  330. break;
  331. }
  332. ++s;
  333. }
  334. } while(++s<limit);
  335. } else {
  336. // span not
  337. do {
  338. c=*s;
  339. if(c<=0xff) {
  340. if(latin1Contains[c]) {
  341. break;
  342. }
  343. } else if(c<=0x7ff) {
  344. if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
  345. break;
  346. }
  347. } else if(c<0xd800 || c>=0xe000) {
  348. int lead=c>>12;
  349. uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
  350. if(twoBits<=1) {
  351. // All 64 code points with the same bits 15..6
  352. // are either in the set or not.
  353. if(twoBits!=0) {
  354. break;
  355. }
  356. } else {
  357. // Look up the code point in its 4k block of code points.
  358. if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
  359. break;
  360. }
  361. }
  362. } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
  363. // surrogate code point
  364. if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
  365. break;
  366. }
  367. } else {
  368. // surrogate pair
  369. if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
  370. break;
  371. }
  372. ++s;
  373. }
  374. } while(++s<limit);
  375. }
  376. return s;
  377. }
  378. /* Symmetrical with span(). */
  379. const char16_t *
  380. BMPSet::spanBack(const char16_t *s, const char16_t *limit, USetSpanCondition spanCondition) const {
  381. char16_t c, c2;
  382. if(spanCondition) {
  383. // span
  384. for(;;) {
  385. c=*(--limit);
  386. if(c<=0xff) {
  387. if(!latin1Contains[c]) {
  388. break;
  389. }
  390. } else if(c<=0x7ff) {
  391. if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
  392. break;
  393. }
  394. } else if(c<0xd800 || c>=0xe000) {
  395. int lead=c>>12;
  396. uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
  397. if(twoBits<=1) {
  398. // All 64 code points with the same bits 15..6
  399. // are either in the set or not.
  400. if(twoBits==0) {
  401. break;
  402. }
  403. } else {
  404. // Look up the code point in its 4k block of code points.
  405. if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
  406. break;
  407. }
  408. }
  409. } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
  410. // surrogate code point
  411. if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
  412. break;
  413. }
  414. } else {
  415. // surrogate pair
  416. if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
  417. break;
  418. }
  419. --limit;
  420. }
  421. if(s==limit) {
  422. return s;
  423. }
  424. }
  425. } else {
  426. // span not
  427. for(;;) {
  428. c=*(--limit);
  429. if(c<=0xff) {
  430. if(latin1Contains[c]) {
  431. break;
  432. }
  433. } else if(c<=0x7ff) {
  434. if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
  435. break;
  436. }
  437. } else if(c<0xd800 || c>=0xe000) {
  438. int lead=c>>12;
  439. uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
  440. if(twoBits<=1) {
  441. // All 64 code points with the same bits 15..6
  442. // are either in the set or not.
  443. if(twoBits!=0) {
  444. break;
  445. }
  446. } else {
  447. // Look up the code point in its 4k block of code points.
  448. if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
  449. break;
  450. }
  451. }
  452. } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
  453. // surrogate code point
  454. if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
  455. break;
  456. }
  457. } else {
  458. // surrogate pair
  459. if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
  460. break;
  461. }
  462. --limit;
  463. }
  464. if(s==limit) {
  465. return s;
  466. }
  467. }
  468. }
  469. return limit+1;
  470. }
  471. /*
  472. * Precheck for sufficient trail bytes at end of string only once per span.
  473. * Check validity.
  474. */
  475. const uint8_t *
  476. BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
  477. const uint8_t *limit=s+length;
  478. uint8_t b=*s;
  479. if(U8_IS_SINGLE(b)) {
  480. // Initial all-ASCII span.
  481. if(spanCondition) {
  482. do {
  483. if(!latin1Contains[b] || ++s==limit) {
  484. return s;
  485. }
  486. b=*s;
  487. } while(U8_IS_SINGLE(b));
  488. } else {
  489. do {
  490. if(latin1Contains[b] || ++s==limit) {
  491. return s;
  492. }
  493. b=*s;
  494. } while(U8_IS_SINGLE(b));
  495. }
  496. length=(int32_t)(limit-s);
  497. }
  498. if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
  499. spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
  500. }
  501. const uint8_t *limit0=limit;
  502. /*
  503. * Make sure that the last 1/2/3/4-byte sequence before limit is complete
  504. * or runs into a lead byte.
  505. * In the span loop compare s with limit only once
  506. * per multi-byte character.
  507. *
  508. * Give a trailing illegal sequence the same value as the result of contains(FFFD),
  509. * including it if that is part of the span, otherwise set limit0 to before
  510. * the truncated sequence.
  511. */
  512. b=*(limit-1);
  513. if((int8_t)b<0) {
  514. // b>=0x80: lead or trail byte
  515. if(b<0xc0) {
  516. // single trail byte, check for preceding 3- or 4-byte lead byte
  517. if(length>=2 && (b=*(limit-2))>=0xe0) {
  518. limit-=2;
  519. if(containsFFFD!=spanCondition) {
  520. limit0=limit;
  521. }
  522. } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
  523. // 4-byte lead byte with only two trail bytes
  524. limit-=3;
  525. if(containsFFFD!=spanCondition) {
  526. limit0=limit;
  527. }
  528. }
  529. } else {
  530. // lead byte with no trail bytes
  531. --limit;
  532. if(containsFFFD!=spanCondition) {
  533. limit0=limit;
  534. }
  535. }
  536. }
  537. uint8_t t1, t2, t3;
  538. while(s<limit) {
  539. b=*s;
  540. if(U8_IS_SINGLE(b)) {
  541. // ASCII
  542. if(spanCondition) {
  543. do {
  544. if(!latin1Contains[b]) {
  545. return s;
  546. } else if(++s==limit) {
  547. return limit0;
  548. }
  549. b=*s;
  550. } while(U8_IS_SINGLE(b));
  551. } else {
  552. do {
  553. if(latin1Contains[b]) {
  554. return s;
  555. } else if(++s==limit) {
  556. return limit0;
  557. }
  558. b=*s;
  559. } while(U8_IS_SINGLE(b));
  560. }
  561. }
  562. ++s; // Advance past the lead byte.
  563. if(b>=0xe0) {
  564. if(b<0xf0) {
  565. if( /* handle U+0000..U+FFFF inline */
  566. (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
  567. (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
  568. ) {
  569. b&=0xf;
  570. uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
  571. if(twoBits<=1) {
  572. // All 64 code points with this lead byte and middle trail byte
  573. // are either in the set or not.
  574. if(twoBits!=(uint32_t)spanCondition) {
  575. return s-1;
  576. }
  577. } else {
  578. // Look up the code point in its 4k block of code points.
  579. UChar32 c=(b<<12)|(t1<<6)|t2;
  580. if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
  581. return s-1;
  582. }
  583. }
  584. s+=2;
  585. continue;
  586. }
  587. } else if( /* handle U+10000..U+10FFFF inline */
  588. (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
  589. (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
  590. (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
  591. ) {
  592. // Give an illegal sequence the same value as the result of contains(FFFD).
  593. UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
  594. if( ( (0x10000<=c && c<=0x10ffff) ?
  595. containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
  596. containsFFFD
  597. ) != spanCondition
  598. ) {
  599. return s-1;
  600. }
  601. s+=3;
  602. continue;
  603. }
  604. } else {
  605. if( /* handle U+0000..U+07FF inline */
  606. b>=0xc0 &&
  607. (t1=(uint8_t)(*s-0x80)) <= 0x3f
  608. ) {
  609. if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
  610. return s-1;
  611. }
  612. ++s;
  613. continue;
  614. }
  615. }
  616. // Give an illegal sequence the same value as the result of contains(FFFD).
  617. // Handle each byte of an illegal sequence separately to simplify the code;
  618. // no need to optimize error handling.
  619. if(containsFFFD!=spanCondition) {
  620. return s-1;
  621. }
  622. }
  623. return limit0;
  624. }
  625. /*
  626. * While going backwards through UTF-8 optimize only for ASCII.
  627. * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
  628. * possible to tell from the last byte in a multi-byte sequence how many
  629. * preceding bytes there should be. Therefore, going backwards through UTF-8
  630. * is much harder than going forward.
  631. */
  632. int32_t
  633. BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
  634. if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
  635. spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
  636. }
  637. uint8_t b;
  638. do {
  639. b=s[--length];
  640. if(U8_IS_SINGLE(b)) {
  641. // ASCII sub-span
  642. if(spanCondition) {
  643. do {
  644. if(!latin1Contains[b]) {
  645. return length+1;
  646. } else if(length==0) {
  647. return 0;
  648. }
  649. b=s[--length];
  650. } while(U8_IS_SINGLE(b));
  651. } else {
  652. do {
  653. if(latin1Contains[b]) {
  654. return length+1;
  655. } else if(length==0) {
  656. return 0;
  657. }
  658. b=s[--length];
  659. } while(U8_IS_SINGLE(b));
  660. }
  661. }
  662. int32_t prev=length;
  663. UChar32 c;
  664. // trail byte: collect a multi-byte character
  665. // (or lead byte in last-trail position)
  666. c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
  667. // c is a valid code point, not ASCII, not a surrogate
  668. if(c<=0x7ff) {
  669. if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
  670. return prev+1;
  671. }
  672. } else if(c<=0xffff) {
  673. int lead=c>>12;
  674. uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
  675. if(twoBits<=1) {
  676. // All 64 code points with the same bits 15..6
  677. // are either in the set or not.
  678. if(twoBits!=(uint32_t)spanCondition) {
  679. return prev+1;
  680. }
  681. } else {
  682. // Look up the code point in its 4k block of code points.
  683. if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
  684. return prev+1;
  685. }
  686. }
  687. } else {
  688. if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
  689. return prev+1;
  690. }
  691. }
  692. } while(length>0);
  693. return 0;
  694. }
  695. U_NAMESPACE_END