123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420 |
- // © 2018 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- // characterproperties.cpp
- // created: 2018sep03 Markus W. Scherer
- #include "unicode/utypes.h"
- #include "unicode/localpointer.h"
- #include "unicode/uchar.h"
- #include "unicode/ucpmap.h"
- #include "unicode/ucptrie.h"
- #include "unicode/umutablecptrie.h"
- #include "unicode/uniset.h"
- #include "unicode/uscript.h"
- #include "unicode/uset.h"
- #include "cmemory.h"
- #include "emojiprops.h"
- #include "mutex.h"
- #include "normalizer2impl.h"
- #include "uassert.h"
- #include "ubidi_props.h"
- #include "ucase.h"
- #include "ucln_cmn.h"
- #include "umutex.h"
- #include "uprops.h"
- using icu::LocalPointer;
- #if !UCONFIG_NO_NORMALIZATION
- using icu::Normalizer2Factory;
- using icu::Normalizer2Impl;
- #endif
- using icu::UInitOnce;
- using icu::UnicodeSet;
- namespace {
- UBool U_CALLCONV characterproperties_cleanup();
- constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + (UCHAR_INT_LIMIT - UCHAR_INT_START);
- struct Inclusion {
- UnicodeSet *fSet = nullptr;
- UInitOnce fInitOnce {};
- };
- Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions()
- UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {};
- UCPMap *maps[UCHAR_INT_LIMIT - UCHAR_INT_START] = {};
- icu::UMutex cpMutex;
- //----------------------------------------------------------------
- // Inclusions list
- //----------------------------------------------------------------
- // USetAdder implementation
- // Does not use uset.h to reduce code dependencies
- void U_CALLCONV
- _set_add(USet *set, UChar32 c) {
- ((UnicodeSet *)set)->add(c);
- }
- void U_CALLCONV
- _set_addRange(USet *set, UChar32 start, UChar32 end) {
- ((UnicodeSet *)set)->add(start, end);
- }
- void U_CALLCONV
- _set_addString(USet *set, const char16_t *str, int32_t length) {
- ((UnicodeSet *)set)->add(icu::UnicodeString((UBool)(length<0), str, length));
- }
- UBool U_CALLCONV characterproperties_cleanup() {
- for (Inclusion &in: gInclusions) {
- delete in.fSet;
- in.fSet = nullptr;
- in.fInitOnce.reset();
- }
- for (int32_t i = 0; i < UPRV_LENGTHOF(sets); ++i) {
- delete sets[i];
- sets[i] = nullptr;
- }
- for (int32_t i = 0; i < UPRV_LENGTHOF(maps); ++i) {
- ucptrie_close(reinterpret_cast<UCPTrie *>(maps[i]));
- maps[i] = nullptr;
- }
- return true;
- }
- void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
- // This function is invoked only via umtx_initOnce().
- U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT);
- if (src == UPROPS_SRC_NONE) {
- errorCode = U_INTERNAL_PROGRAM_ERROR;
- return;
- }
- U_ASSERT(gInclusions[src].fSet == nullptr);
- LocalPointer<UnicodeSet> incl(new UnicodeSet());
- if (incl.isNull()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- USetAdder sa = {
- (USet *)incl.getAlias(),
- _set_add,
- _set_addRange,
- _set_addString,
- nullptr, // don't need remove()
- nullptr // don't need removeRange()
- };
- switch(src) {
- case UPROPS_SRC_CHAR:
- uchar_addPropertyStarts(&sa, &errorCode);
- break;
- case UPROPS_SRC_PROPSVEC:
- upropsvec_addPropertyStarts(&sa, &errorCode);
- break;
- case UPROPS_SRC_CHAR_AND_PROPSVEC:
- uchar_addPropertyStarts(&sa, &errorCode);
- upropsvec_addPropertyStarts(&sa, &errorCode);
- break;
- #if !UCONFIG_NO_NORMALIZATION
- case UPROPS_SRC_CASE_AND_NORM: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
- if(U_SUCCESS(errorCode)) {
- impl->addPropertyStarts(&sa, errorCode);
- }
- ucase_addPropertyStarts(&sa, &errorCode);
- break;
- }
- case UPROPS_SRC_NFC: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
- if(U_SUCCESS(errorCode)) {
- impl->addPropertyStarts(&sa, errorCode);
- }
- break;
- }
- case UPROPS_SRC_NFKC: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(errorCode);
- if(U_SUCCESS(errorCode)) {
- impl->addPropertyStarts(&sa, errorCode);
- }
- break;
- }
- case UPROPS_SRC_NFKC_CF: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(errorCode);
- if(U_SUCCESS(errorCode)) {
- impl->addPropertyStarts(&sa, errorCode);
- }
- break;
- }
- case UPROPS_SRC_NFC_CANON_ITER: {
- const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
- if(U_SUCCESS(errorCode)) {
- impl->addCanonIterPropertyStarts(&sa, errorCode);
- }
- break;
- }
- #endif
- case UPROPS_SRC_CASE:
- ucase_addPropertyStarts(&sa, &errorCode);
- break;
- case UPROPS_SRC_BIDI:
- ubidi_addPropertyStarts(&sa, &errorCode);
- break;
- case UPROPS_SRC_INPC:
- case UPROPS_SRC_INSC:
- case UPROPS_SRC_VO:
- uprops_addPropertyStarts((UPropertySource)src, &sa, &errorCode);
- break;
- case UPROPS_SRC_EMOJI: {
- const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
- if (U_SUCCESS(errorCode)) {
- ep->addPropertyStarts(&sa, errorCode);
- }
- break;
- }
- default:
- errorCode = U_INTERNAL_PROGRAM_ERROR;
- break;
- }
- if (U_FAILURE(errorCode)) {
- return;
- }
- if (incl->isBogus()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- // Compact for caching.
- incl->compact();
- gInclusions[src].fSet = incl.orphan();
- ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
- }
- const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- if (src < 0 || UPROPS_SRC_COUNT <= src) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- Inclusion &i = gInclusions[src];
- umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode);
- return i.fSet;
- }
- void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) {
- // This function is invoked only via umtx_initOnce().
- U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT);
- int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
- U_ASSERT(gInclusions[inclIndex].fSet == nullptr);
- UPropertySource src = uprops_getSource(prop);
- const UnicodeSet *incl = getInclusionsForSource(src, errorCode);
- if (U_FAILURE(errorCode)) {
- return;
- }
- LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0));
- if (intPropIncl.isNull()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- int32_t numRanges = incl->getRangeCount();
- int32_t prevValue = 0;
- for (int32_t i = 0; i < numRanges; ++i) {
- UChar32 rangeEnd = incl->getRangeEnd(i);
- for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) {
- // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
- int32_t value = u_getIntPropertyValue(c, prop);
- if (value != prevValue) {
- intPropIncl->add(c);
- prevValue = value;
- }
- }
- }
- if (intPropIncl->isBogus()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- // Compact for caching.
- intPropIncl->compact();
- gInclusions[inclIndex].fSet = intPropIncl.orphan();
- ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup);
- }
- } // namespace
- U_NAMESPACE_BEGIN
- const UnicodeSet *CharacterProperties::getInclusionsForProperty(
- UProperty prop, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
- int32_t inclIndex = UPROPS_SRC_COUNT + (prop - UCHAR_INT_START);
- Inclusion &i = gInclusions[inclIndex];
- umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode);
- return i.fSet;
- } else {
- UPropertySource src = uprops_getSource(prop);
- return getInclusionsForSource(src, errorCode);
- }
- }
- U_NAMESPACE_END
- namespace {
- UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- LocalPointer<UnicodeSet> set(new UnicodeSet());
- if (set.isNull()) {
- errorCode = U_MEMORY_ALLOCATION_ERROR;
- return nullptr;
- }
- if (UCHAR_BASIC_EMOJI <= property && property <= UCHAR_RGI_EMOJI) {
- // property of strings
- const icu::EmojiProps *ep = icu::EmojiProps::getSingleton(errorCode);
- if (U_FAILURE(errorCode)) { return nullptr; }
- USetAdder sa = {
- (USet *)set.getAlias(),
- _set_add,
- _set_addRange,
- _set_addString,
- nullptr, // don't need remove()
- nullptr // don't need removeRange()
- };
- ep->addStrings(&sa, property, errorCode);
- if (property != UCHAR_BASIC_EMOJI && property != UCHAR_RGI_EMOJI) {
- // property of _only_ strings
- set->freeze();
- return set.orphan();
- }
- }
- const UnicodeSet *inclusions =
- icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
- if (U_FAILURE(errorCode)) { return nullptr; }
- int32_t numRanges = inclusions->getRangeCount();
- UChar32 startHasProperty = -1;
- for (int32_t i = 0; i < numRanges; ++i) {
- UChar32 rangeEnd = inclusions->getRangeEnd(i);
- for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
- // TODO: Get a UCharacterProperty.BinaryProperty to avoid the property dispatch.
- if (u_hasBinaryProperty(c, property)) {
- if (startHasProperty < 0) {
- // Transition from false to true.
- startHasProperty = c;
- }
- } else if (startHasProperty >= 0) {
- // Transition from true to false.
- set->add(startHasProperty, c - 1);
- startHasProperty = -1;
- }
- }
- }
- if (startHasProperty >= 0) {
- set->add(startHasProperty, 0x10FFFF);
- }
- set->freeze();
- return set.orphan();
- }
- UCPMap *makeMap(UProperty property, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- uint32_t nullValue = property == UCHAR_SCRIPT ? USCRIPT_UNKNOWN : 0;
- icu::LocalUMutableCPTriePointer mutableTrie(
- umutablecptrie_open(nullValue, nullValue, &errorCode));
- const UnicodeSet *inclusions =
- icu::CharacterProperties::getInclusionsForProperty(property, errorCode);
- if (U_FAILURE(errorCode)) { return nullptr; }
- int32_t numRanges = inclusions->getRangeCount();
- UChar32 start = 0;
- uint32_t value = nullValue;
- for (int32_t i = 0; i < numRanges; ++i) {
- UChar32 rangeEnd = inclusions->getRangeEnd(i);
- for (UChar32 c = inclusions->getRangeStart(i); c <= rangeEnd; ++c) {
- // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch.
- uint32_t nextValue = u_getIntPropertyValue(c, property);
- if (value != nextValue) {
- if (value != nullValue) {
- umutablecptrie_setRange(mutableTrie.getAlias(), start, c - 1, value, &errorCode);
- }
- start = c;
- value = nextValue;
- }
- }
- }
- if (value != 0) {
- umutablecptrie_setRange(mutableTrie.getAlias(), start, 0x10FFFF, value, &errorCode);
- }
- UCPTrieType type;
- if (property == UCHAR_BIDI_CLASS || property == UCHAR_GENERAL_CATEGORY) {
- type = UCPTRIE_TYPE_FAST;
- } else {
- type = UCPTRIE_TYPE_SMALL;
- }
- UCPTrieValueWidth valueWidth;
- // TODO: UCharacterProperty.IntProperty
- int32_t max = u_getIntPropertyMaxValue(property);
- if (max <= 0xff) {
- valueWidth = UCPTRIE_VALUE_BITS_8;
- } else if (max <= 0xffff) {
- valueWidth = UCPTRIE_VALUE_BITS_16;
- } else {
- valueWidth = UCPTRIE_VALUE_BITS_32;
- }
- return reinterpret_cast<UCPMap *>(
- umutablecptrie_buildImmutable(mutableTrie.getAlias(), type, valueWidth, &errorCode));
- }
- } // namespace
- U_NAMESPACE_BEGIN
- const UnicodeSet *CharacterProperties::getBinaryPropertySet(UProperty property, UErrorCode &errorCode) {
- if (U_FAILURE(errorCode)) { return nullptr; }
- if (property < 0 || UCHAR_BINARY_LIMIT <= property) {
- errorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- Mutex m(&cpMutex);
- UnicodeSet *set = sets[property];
- if (set == nullptr) {
- sets[property] = set = makeSet(property, errorCode);
- }
- return set;
- }
- U_NAMESPACE_END
- U_NAMESPACE_USE
- U_CAPI const USet * U_EXPORT2
- u_getBinaryPropertySet(UProperty property, UErrorCode *pErrorCode) {
- const UnicodeSet *set = CharacterProperties::getBinaryPropertySet(property, *pErrorCode);
- return U_SUCCESS(*pErrorCode) ? set->toUSet() : nullptr;
- }
- U_CAPI const UCPMap * U_EXPORT2
- u_getIntPropertyMap(UProperty property, UErrorCode *pErrorCode) {
- if (U_FAILURE(*pErrorCode)) { return nullptr; }
- if (property < UCHAR_INT_START || UCHAR_INT_LIMIT <= property) {
- *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
- return nullptr;
- }
- Mutex m(&cpMutex);
- UCPMap *map = maps[property - UCHAR_INT_START];
- if (map == nullptr) {
- maps[property - UCHAR_INT_START] = map = makeMap(property, *pErrorCode);
- }
- return map;
- }
|