scriptset.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. **********************************************************************
  5. * Copyright (C) 2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. **********************************************************************
  8. *
  9. * scriptset.cpp
  10. *
  11. * created on: 2013 Jan 7
  12. * created by: Andy Heninger
  13. */
  14. #include "unicode/utypes.h"
  15. #include "unicode/uchar.h"
  16. #include "unicode/unistr.h"
  17. #include "scriptset.h"
  18. #include "uassert.h"
  19. #include "cmemory.h"
  20. U_NAMESPACE_BEGIN
  21. //----------------------------------------------------------------------------
  22. //
  23. // ScriptSet implementation
  24. //
  25. //----------------------------------------------------------------------------
  26. ScriptSet::ScriptSet() {
  27. uprv_memset(bits, 0, sizeof(bits));
  28. }
  29. ScriptSet::~ScriptSet() {
  30. }
  31. ScriptSet::ScriptSet(const ScriptSet &other) {
  32. *this = other;
  33. }
  34. ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
  35. uprv_memcpy(bits, other.bits, sizeof(bits));
  36. return *this;
  37. }
  38. bool ScriptSet::operator == (const ScriptSet &other) const {
  39. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  40. if (bits[i] != other.bits[i]) {
  41. return false;
  42. }
  43. }
  44. return true;
  45. }
  46. UBool ScriptSet::test(UScriptCode script, UErrorCode &status) const {
  47. if (U_FAILURE(status)) {
  48. return false;
  49. }
  50. if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) {
  51. status = U_ILLEGAL_ARGUMENT_ERROR;
  52. return false;
  53. }
  54. uint32_t index = script / 32;
  55. uint32_t bit = 1 << (script & 31);
  56. return ((bits[index] & bit) != 0);
  57. }
  58. ScriptSet &ScriptSet::set(UScriptCode script, UErrorCode &status) {
  59. if (U_FAILURE(status)) {
  60. return *this;
  61. }
  62. if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) {
  63. status = U_ILLEGAL_ARGUMENT_ERROR;
  64. return *this;
  65. }
  66. uint32_t index = script / 32;
  67. uint32_t bit = 1 << (script & 31);
  68. bits[index] |= bit;
  69. return *this;
  70. }
  71. ScriptSet &ScriptSet::reset(UScriptCode script, UErrorCode &status) {
  72. if (U_FAILURE(status)) {
  73. return *this;
  74. }
  75. if (script < 0 || (int32_t)script >= SCRIPT_LIMIT) {
  76. status = U_ILLEGAL_ARGUMENT_ERROR;
  77. return *this;
  78. }
  79. uint32_t index = script / 32;
  80. uint32_t bit = 1 << (script & 31);
  81. bits[index] &= ~bit;
  82. return *this;
  83. }
  84. ScriptSet &ScriptSet::Union(const ScriptSet &other) {
  85. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  86. bits[i] |= other.bits[i];
  87. }
  88. return *this;
  89. }
  90. ScriptSet &ScriptSet::intersect(const ScriptSet &other) {
  91. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  92. bits[i] &= other.bits[i];
  93. }
  94. return *this;
  95. }
  96. ScriptSet &ScriptSet::intersect(UScriptCode script, UErrorCode &status) {
  97. ScriptSet t;
  98. t.set(script, status);
  99. if (U_SUCCESS(status)) {
  100. this->intersect(t);
  101. }
  102. return *this;
  103. }
  104. UBool ScriptSet::intersects(const ScriptSet &other) const {
  105. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  106. if ((bits[i] & other.bits[i]) != 0) {
  107. return true;
  108. }
  109. }
  110. return false;
  111. }
  112. UBool ScriptSet::contains(const ScriptSet &other) const {
  113. ScriptSet t(*this);
  114. t.intersect(other);
  115. return (t == other);
  116. }
  117. ScriptSet &ScriptSet::setAll() {
  118. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  119. bits[i] = 0xffffffffu;
  120. }
  121. return *this;
  122. }
  123. ScriptSet &ScriptSet::resetAll() {
  124. uprv_memset(bits, 0, sizeof(bits));
  125. return *this;
  126. }
  127. int32_t ScriptSet::countMembers() const {
  128. // This bit counter is good for sparse numbers of '1's, which is
  129. // very much the case that we will usually have.
  130. int32_t count = 0;
  131. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  132. uint32_t x = bits[i];
  133. while (x > 0) {
  134. count++;
  135. x &= (x - 1); // and off the least significant one bit.
  136. }
  137. }
  138. return count;
  139. }
  140. int32_t ScriptSet::hashCode() const {
  141. int32_t hash = 0;
  142. for (int32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  143. hash ^= bits[i];
  144. }
  145. return hash;
  146. }
  147. int32_t ScriptSet::nextSetBit(int32_t fromIndex) const {
  148. // TODO: Wants a better implementation.
  149. if (fromIndex < 0) {
  150. return -1;
  151. }
  152. UErrorCode status = U_ZERO_ERROR;
  153. for (int32_t scriptIndex = fromIndex; scriptIndex < SCRIPT_LIMIT; scriptIndex++) {
  154. if (test((UScriptCode)scriptIndex, status)) {
  155. return scriptIndex;
  156. }
  157. }
  158. return -1;
  159. }
  160. UBool ScriptSet::isEmpty() const {
  161. for (uint32_t i=0; i<UPRV_LENGTHOF(bits); i++) {
  162. if (bits[i] != 0) {
  163. return false;
  164. }
  165. }
  166. return true;
  167. }
  168. UnicodeString &ScriptSet::displayScripts(UnicodeString &dest) const {
  169. UBool firstTime = true;
  170. for (int32_t i = nextSetBit(0); i >= 0; i = nextSetBit(i + 1)) {
  171. if (!firstTime) {
  172. dest.append((char16_t)0x20);
  173. }
  174. firstTime = false;
  175. const char *scriptName = uscript_getShortName((UScriptCode(i)));
  176. dest.append(UnicodeString(scriptName, -1, US_INV));
  177. }
  178. return dest;
  179. }
  180. ScriptSet &ScriptSet::parseScripts(const UnicodeString &scriptString, UErrorCode &status) {
  181. resetAll();
  182. if (U_FAILURE(status)) {
  183. return *this;
  184. }
  185. UnicodeString oneScriptName;
  186. for (int32_t i=0; i<scriptString.length();) {
  187. UChar32 c = scriptString.char32At(i);
  188. i = scriptString.moveIndex32(i, 1);
  189. if (!u_isUWhiteSpace(c)) {
  190. oneScriptName.append(c);
  191. if (i < scriptString.length()) {
  192. continue;
  193. }
  194. }
  195. if (oneScriptName.length() > 0) {
  196. char buf[40];
  197. oneScriptName.extract(0, oneScriptName.length(), buf, sizeof(buf)-1, US_INV);
  198. buf[sizeof(buf)-1] = 0;
  199. int32_t sc = u_getPropertyValueEnum(UCHAR_SCRIPT, buf);
  200. if (sc == UCHAR_INVALID_CODE) {
  201. status = U_ILLEGAL_ARGUMENT_ERROR;
  202. } else {
  203. this->set((UScriptCode)sc, status);
  204. }
  205. if (U_FAILURE(status)) {
  206. return *this;
  207. }
  208. oneScriptName.remove();
  209. }
  210. }
  211. return *this;
  212. }
  213. void ScriptSet::setScriptExtensions(UChar32 codePoint, UErrorCode& status) {
  214. if (U_FAILURE(status)) { return; }
  215. static const int32_t FIRST_GUESS_SCRIPT_CAPACITY = 20;
  216. MaybeStackArray<UScriptCode,FIRST_GUESS_SCRIPT_CAPACITY> scripts;
  217. UErrorCode internalStatus = U_ZERO_ERROR;
  218. int32_t script_count = -1;
  219. while (true) {
  220. script_count = uscript_getScriptExtensions(
  221. codePoint, scripts.getAlias(), scripts.getCapacity(), &internalStatus);
  222. if (internalStatus == U_BUFFER_OVERFLOW_ERROR) {
  223. // Need to allocate more space
  224. if (scripts.resize(script_count) == nullptr) {
  225. status = U_MEMORY_ALLOCATION_ERROR;
  226. return;
  227. }
  228. internalStatus = U_ZERO_ERROR;
  229. } else {
  230. break;
  231. }
  232. }
  233. // Check if we failed for some reason other than buffer overflow
  234. if (U_FAILURE(internalStatus)) {
  235. status = internalStatus;
  236. return;
  237. }
  238. // Load the scripts into the ScriptSet and return
  239. for (int32_t i = 0; i < script_count; i++) {
  240. this->set(scripts[i], status);
  241. if (U_FAILURE(status)) { return; }
  242. }
  243. }
  244. U_NAMESPACE_END
  245. U_CAPI UBool U_EXPORT2
  246. uhash_equalsScriptSet(const UElement key1, const UElement key2) {
  247. icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer);
  248. icu::ScriptSet *s2 = static_cast<icu::ScriptSet *>(key2.pointer);
  249. return (*s1 == *s2);
  250. }
  251. U_CAPI int8_t U_EXPORT2
  252. uhash_compareScriptSet(UElement key0, UElement key1) {
  253. icu::ScriptSet *s0 = static_cast<icu::ScriptSet *>(key0.pointer);
  254. icu::ScriptSet *s1 = static_cast<icu::ScriptSet *>(key1.pointer);
  255. int32_t diff = s0->countMembers() - s1->countMembers();
  256. if (diff != 0) return static_cast<UBool>(diff);
  257. int32_t i0 = s0->nextSetBit(0);
  258. int32_t i1 = s1->nextSetBit(0);
  259. while ((diff = i0-i1) == 0 && i0 > 0) {
  260. i0 = s0->nextSetBit(i0+1);
  261. i1 = s1->nextSetBit(i1+1);
  262. }
  263. return (int8_t)diff;
  264. }
  265. U_CAPI int32_t U_EXPORT2
  266. uhash_hashScriptSet(const UElement key) {
  267. icu::ScriptSet *s = static_cast<icu::ScriptSet *>(key.pointer);
  268. return s->hashCode();
  269. }
  270. U_CAPI void U_EXPORT2
  271. uhash_deleteScriptSet(void *obj) {
  272. icu::ScriptSet *s = static_cast<icu::ScriptSet *>(obj);
  273. delete s;
  274. }