brkiter.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 1997-2015, International Business Machines Corporation and
  6. * others. All Rights Reserved.
  7. *******************************************************************************
  8. *
  9. * File brkiter.cpp
  10. *
  11. * Modification History:
  12. *
  13. * Date Name Description
  14. * 02/18/97 aliu Converted from OpenClass. Added DONE.
  15. * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
  16. *****************************************************************************************
  17. */
  18. // *****************************************************************************
  19. // This file was generated from the java source file BreakIterator.java
  20. // *****************************************************************************
  21. #include "unicode/utypes.h"
  22. #if !UCONFIG_NO_BREAK_ITERATION
  23. #include "unicode/rbbi.h"
  24. #include "unicode/brkiter.h"
  25. #include "unicode/udata.h"
  26. #include "unicode/ures.h"
  27. #include "unicode/ustring.h"
  28. #include "unicode/filteredbrk.h"
  29. #include "bytesinkutil.h"
  30. #include "ucln_cmn.h"
  31. #include "cstring.h"
  32. #include "umutex.h"
  33. #include "servloc.h"
  34. #include "locbased.h"
  35. #include "uresimp.h"
  36. #include "uassert.h"
  37. #include "ubrkimpl.h"
  38. #include "utracimp.h"
  39. #include "charstr.h"
  40. // *****************************************************************************
  41. // class BreakIterator
  42. // This class implements methods for finding the location of boundaries in text.
  43. // Instances of BreakIterator maintain a current position and scan over text
  44. // returning the index of characters where boundaries occur.
  45. // *****************************************************************************
  46. U_NAMESPACE_BEGIN
  47. // -------------------------------------
  48. BreakIterator*
  49. BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
  50. {
  51. char fnbuff[256];
  52. char ext[4]={'\0'};
  53. CharString actualLocale;
  54. int32_t size;
  55. const char16_t* brkfname = nullptr;
  56. UResourceBundle brkRulesStack;
  57. UResourceBundle brkNameStack;
  58. UResourceBundle *brkRules = &brkRulesStack;
  59. UResourceBundle *brkName = &brkNameStack;
  60. RuleBasedBreakIterator *result = nullptr;
  61. if (U_FAILURE(status))
  62. return nullptr;
  63. ures_initStackObject(brkRules);
  64. ures_initStackObject(brkName);
  65. // Get the locale
  66. UResourceBundle *b = ures_openNoDefault(U_ICUDATA_BRKITR, loc.getName(), &status);
  67. // Get the "boundaries" array.
  68. if (U_SUCCESS(status)) {
  69. brkRules = ures_getByKeyWithFallback(b, "boundaries", brkRules, &status);
  70. // Get the string object naming the rules file
  71. brkName = ures_getByKeyWithFallback(brkRules, type, brkName, &status);
  72. // Get the actual string
  73. brkfname = ures_getString(brkName, &size, &status);
  74. U_ASSERT((size_t)size<sizeof(fnbuff));
  75. if ((size_t)size>=sizeof(fnbuff)) {
  76. size=0;
  77. if (U_SUCCESS(status)) {
  78. status = U_BUFFER_OVERFLOW_ERROR;
  79. }
  80. }
  81. // Use the string if we found it
  82. if (U_SUCCESS(status) && brkfname) {
  83. actualLocale.append(ures_getLocaleInternal(brkName, &status), -1, status);
  84. char16_t* extStart=u_strchr(brkfname, 0x002e);
  85. int len = 0;
  86. if (extStart != nullptr){
  87. len = (int)(extStart-brkfname);
  88. u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff
  89. u_UCharsToChars(brkfname, fnbuff, len);
  90. }
  91. fnbuff[len]=0; // nul terminate
  92. }
  93. }
  94. ures_close(brkRules);
  95. ures_close(brkName);
  96. UDataMemory* file = udata_open(U_ICUDATA_BRKITR, ext, fnbuff, &status);
  97. if (U_FAILURE(status)) {
  98. ures_close(b);
  99. return nullptr;
  100. }
  101. // Create a RuleBasedBreakIterator
  102. result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != nullptr, status);
  103. // If there is a result, set the valid locale and actual locale, and the kind
  104. if (U_SUCCESS(status) && result != nullptr) {
  105. U_LOCALE_BASED(locBased, *(BreakIterator*)result);
  106. locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
  107. actualLocale.data());
  108. }
  109. ures_close(b);
  110. if (U_FAILURE(status) && result != nullptr) { // Sometimes redundant check, but simple
  111. delete result;
  112. return nullptr;
  113. }
  114. if (result == nullptr) {
  115. udata_close(file);
  116. if (U_SUCCESS(status)) {
  117. status = U_MEMORY_ALLOCATION_ERROR;
  118. }
  119. }
  120. return result;
  121. }
  122. // Creates a break iterator for word breaks.
  123. BreakIterator* U_EXPORT2
  124. BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
  125. {
  126. return createInstance(key, UBRK_WORD, status);
  127. }
  128. // -------------------------------------
  129. // Creates a break iterator for line breaks.
  130. BreakIterator* U_EXPORT2
  131. BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
  132. {
  133. return createInstance(key, UBRK_LINE, status);
  134. }
  135. // -------------------------------------
  136. // Creates a break iterator for character breaks.
  137. BreakIterator* U_EXPORT2
  138. BreakIterator::createCharacterInstance(const Locale& key, UErrorCode& status)
  139. {
  140. return createInstance(key, UBRK_CHARACTER, status);
  141. }
  142. // -------------------------------------
  143. // Creates a break iterator for sentence breaks.
  144. BreakIterator* U_EXPORT2
  145. BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
  146. {
  147. return createInstance(key, UBRK_SENTENCE, status);
  148. }
  149. // -------------------------------------
  150. // Creates a break iterator for title casing breaks.
  151. BreakIterator* U_EXPORT2
  152. BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
  153. {
  154. return createInstance(key, UBRK_TITLE, status);
  155. }
  156. // -------------------------------------
  157. // Gets all the available locales that has localized text boundary data.
  158. const Locale* U_EXPORT2
  159. BreakIterator::getAvailableLocales(int32_t& count)
  160. {
  161. return Locale::getAvailableLocales(count);
  162. }
  163. // ------------------------------------------
  164. //
  165. // Constructors, destructor and assignment operator
  166. //
  167. //-------------------------------------------
  168. BreakIterator::BreakIterator()
  169. {
  170. *validLocale = *actualLocale = 0;
  171. }
  172. BreakIterator::BreakIterator(const BreakIterator &other) : UObject(other) {
  173. uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
  174. uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
  175. }
  176. BreakIterator &BreakIterator::operator =(const BreakIterator &other) {
  177. if (this != &other) {
  178. uprv_strncpy(actualLocale, other.actualLocale, sizeof(actualLocale));
  179. uprv_strncpy(validLocale, other.validLocale, sizeof(validLocale));
  180. }
  181. return *this;
  182. }
  183. BreakIterator::~BreakIterator()
  184. {
  185. }
  186. // ------------------------------------------
  187. //
  188. // Registration
  189. //
  190. //-------------------------------------------
  191. #if !UCONFIG_NO_SERVICE
  192. // -------------------------------------
  193. class ICUBreakIteratorFactory : public ICUResourceBundleFactory {
  194. public:
  195. virtual ~ICUBreakIteratorFactory();
  196. protected:
  197. virtual UObject* handleCreate(const Locale& loc, int32_t kind, const ICUService* /*service*/, UErrorCode& status) const override {
  198. return BreakIterator::makeInstance(loc, kind, status);
  199. }
  200. };
  201. ICUBreakIteratorFactory::~ICUBreakIteratorFactory() {}
  202. // -------------------------------------
  203. class ICUBreakIteratorService : public ICULocaleService {
  204. public:
  205. ICUBreakIteratorService()
  206. : ICULocaleService(UNICODE_STRING("Break Iterator", 14))
  207. {
  208. UErrorCode status = U_ZERO_ERROR;
  209. registerFactory(new ICUBreakIteratorFactory(), status);
  210. }
  211. virtual ~ICUBreakIteratorService();
  212. virtual UObject* cloneInstance(UObject* instance) const override {
  213. return ((BreakIterator*)instance)->clone();
  214. }
  215. virtual UObject* handleDefault(const ICUServiceKey& key, UnicodeString* /*actualID*/, UErrorCode& status) const override {
  216. LocaleKey& lkey = static_cast<LocaleKey&>(const_cast<ICUServiceKey&>(key));
  217. int32_t kind = lkey.kind();
  218. Locale loc;
  219. lkey.currentLocale(loc);
  220. return BreakIterator::makeInstance(loc, kind, status);
  221. }
  222. virtual UBool isDefault() const override {
  223. return countFactories() == 1;
  224. }
  225. };
  226. ICUBreakIteratorService::~ICUBreakIteratorService() {}
  227. // -------------------------------------
  228. // defined in ucln_cmn.h
  229. U_NAMESPACE_END
  230. static icu::UInitOnce gInitOnceBrkiter {};
  231. static icu::ICULocaleService* gService = nullptr;
  232. /**
  233. * Release all static memory held by breakiterator.
  234. */
  235. U_CDECL_BEGIN
  236. static UBool U_CALLCONV breakiterator_cleanup() {
  237. #if !UCONFIG_NO_SERVICE
  238. if (gService) {
  239. delete gService;
  240. gService = nullptr;
  241. }
  242. gInitOnceBrkiter.reset();
  243. #endif
  244. return true;
  245. }
  246. U_CDECL_END
  247. U_NAMESPACE_BEGIN
  248. static void U_CALLCONV
  249. initService() {
  250. gService = new ICUBreakIteratorService();
  251. ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR, breakiterator_cleanup);
  252. }
  253. static ICULocaleService*
  254. getService()
  255. {
  256. umtx_initOnce(gInitOnceBrkiter, &initService);
  257. return gService;
  258. }
  259. // -------------------------------------
  260. static inline UBool
  261. hasService()
  262. {
  263. return !gInitOnceBrkiter.isReset() && getService() != nullptr;
  264. }
  265. // -------------------------------------
  266. URegistryKey U_EXPORT2
  267. BreakIterator::registerInstance(BreakIterator* toAdopt, const Locale& locale, UBreakIteratorType kind, UErrorCode& status)
  268. {
  269. ICULocaleService *service = getService();
  270. if (service == nullptr) {
  271. status = U_MEMORY_ALLOCATION_ERROR;
  272. return nullptr;
  273. }
  274. return service->registerInstance(toAdopt, locale, kind, status);
  275. }
  276. // -------------------------------------
  277. UBool U_EXPORT2
  278. BreakIterator::unregister(URegistryKey key, UErrorCode& status)
  279. {
  280. if (U_SUCCESS(status)) {
  281. if (hasService()) {
  282. return gService->unregister(key, status);
  283. }
  284. status = U_MEMORY_ALLOCATION_ERROR;
  285. }
  286. return false;
  287. }
  288. // -------------------------------------
  289. StringEnumeration* U_EXPORT2
  290. BreakIterator::getAvailableLocales()
  291. {
  292. ICULocaleService *service = getService();
  293. if (service == nullptr) {
  294. return nullptr;
  295. }
  296. return service->getAvailableLocales();
  297. }
  298. #endif /* UCONFIG_NO_SERVICE */
  299. // -------------------------------------
  300. BreakIterator*
  301. BreakIterator::createInstance(const Locale& loc, int32_t kind, UErrorCode& status)
  302. {
  303. if (U_FAILURE(status)) {
  304. return nullptr;
  305. }
  306. #if !UCONFIG_NO_SERVICE
  307. if (hasService()) {
  308. Locale actualLoc("");
  309. BreakIterator *result = (BreakIterator*)gService->get(loc, kind, &actualLoc, status);
  310. // TODO: The way the service code works in ICU 2.8 is that if
  311. // there is a real registered break iterator, the actualLoc
  312. // will be populated, but if the handleDefault path is taken
  313. // (because nothing is registered that can handle the
  314. // requested locale) then the actualLoc comes back empty. In
  315. // that case, the returned object already has its actual/valid
  316. // locale data populated (by makeInstance, which is what
  317. // handleDefault calls), so we don't touch it. YES, A COMMENT
  318. // THIS LONG is a sign of bad code -- so the action item is to
  319. // revisit this in ICU 3.0 and clean it up/fix it/remove it.
  320. if (U_SUCCESS(status) && (result != nullptr) && *actualLoc.getName() != 0) {
  321. U_LOCALE_BASED(locBased, *result);
  322. locBased.setLocaleIDs(actualLoc.getName(), actualLoc.getName());
  323. }
  324. return result;
  325. }
  326. else
  327. #endif
  328. {
  329. return makeInstance(loc, kind, status);
  330. }
  331. }
  332. // -------------------------------------
  333. enum { kKeyValueLenMax = 32 };
  334. BreakIterator*
  335. BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
  336. {
  337. if (U_FAILURE(status)) {
  338. return nullptr;
  339. }
  340. BreakIterator *result = nullptr;
  341. switch (kind) {
  342. case UBRK_CHARACTER:
  343. {
  344. UTRACE_ENTRY(UTRACE_UBRK_CREATE_CHARACTER);
  345. result = BreakIterator::buildInstance(loc, "grapheme", status);
  346. UTRACE_EXIT_STATUS(status);
  347. }
  348. break;
  349. case UBRK_WORD:
  350. {
  351. UTRACE_ENTRY(UTRACE_UBRK_CREATE_WORD);
  352. result = BreakIterator::buildInstance(loc, "word", status);
  353. UTRACE_EXIT_STATUS(status);
  354. }
  355. break;
  356. case UBRK_LINE:
  357. {
  358. char lb_lw[kKeyValueLenMax];
  359. UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
  360. uprv_strcpy(lb_lw, "line");
  361. UErrorCode kvStatus = U_ZERO_ERROR;
  362. CharString value;
  363. CharStringByteSink valueSink(&value);
  364. loc.getKeywordValue("lb", valueSink, kvStatus);
  365. if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
  366. uprv_strcat(lb_lw, "_");
  367. uprv_strcat(lb_lw, value.data());
  368. }
  369. // lw=phrase is only supported in Japanese and Korean
  370. if (uprv_strcmp(loc.getLanguage(), "ja") == 0 || uprv_strcmp(loc.getLanguage(), "ko") == 0) {
  371. value.clear();
  372. loc.getKeywordValue("lw", valueSink, kvStatus);
  373. if (U_SUCCESS(kvStatus) && value == "phrase") {
  374. uprv_strcat(lb_lw, "_");
  375. uprv_strcat(lb_lw, value.data());
  376. }
  377. }
  378. result = BreakIterator::buildInstance(loc, lb_lw, status);
  379. UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
  380. UTRACE_EXIT_STATUS(status);
  381. }
  382. break;
  383. case UBRK_SENTENCE:
  384. {
  385. UTRACE_ENTRY(UTRACE_UBRK_CREATE_SENTENCE);
  386. result = BreakIterator::buildInstance(loc, "sentence", status);
  387. #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  388. char ssKeyValue[kKeyValueLenMax] = {0};
  389. UErrorCode kvStatus = U_ZERO_ERROR;
  390. int32_t kLen = loc.getKeywordValue("ss", ssKeyValue, kKeyValueLenMax, kvStatus);
  391. if (U_SUCCESS(kvStatus) && kLen > 0 && uprv_strcmp(ssKeyValue,"standard")==0) {
  392. FilteredBreakIteratorBuilder* fbiBuilder = FilteredBreakIteratorBuilder::createInstance(loc, kvStatus);
  393. if (U_SUCCESS(kvStatus)) {
  394. result = fbiBuilder->build(result, status);
  395. delete fbiBuilder;
  396. }
  397. }
  398. #endif
  399. UTRACE_EXIT_STATUS(status);
  400. }
  401. break;
  402. case UBRK_TITLE:
  403. {
  404. UTRACE_ENTRY(UTRACE_UBRK_CREATE_TITLE);
  405. result = BreakIterator::buildInstance(loc, "title", status);
  406. UTRACE_EXIT_STATUS(status);
  407. }
  408. break;
  409. default:
  410. status = U_ILLEGAL_ARGUMENT_ERROR;
  411. }
  412. if (U_FAILURE(status)) {
  413. return nullptr;
  414. }
  415. return result;
  416. }
  417. Locale
  418. BreakIterator::getLocale(ULocDataLocaleType type, UErrorCode& status) const {
  419. U_LOCALE_BASED(locBased, *this);
  420. return locBased.getLocale(type, status);
  421. }
  422. const char *
  423. BreakIterator::getLocaleID(ULocDataLocaleType type, UErrorCode& status) const {
  424. U_LOCALE_BASED(locBased, *this);
  425. return locBased.getLocaleID(type, status);
  426. }
  427. // This implementation of getRuleStatus is a do-nothing stub, here to
  428. // provide a default implementation for any derived BreakIterator classes that
  429. // do not implement it themselves.
  430. int32_t BreakIterator::getRuleStatus() const {
  431. return 0;
  432. }
  433. // This implementation of getRuleStatusVec is a do-nothing stub, here to
  434. // provide a default implementation for any derived BreakIterator classes that
  435. // do not implement it themselves.
  436. int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status) {
  437. if (U_FAILURE(status)) {
  438. return 0;
  439. }
  440. if (capacity < 1) {
  441. status = U_BUFFER_OVERFLOW_ERROR;
  442. return 1;
  443. }
  444. *fillInVec = 0;
  445. return 1;
  446. }
  447. BreakIterator::BreakIterator (const Locale& valid, const Locale& actual) {
  448. U_LOCALE_BASED(locBased, (*this));
  449. locBased.setLocaleIDs(valid, actual);
  450. }
  451. U_NAMESPACE_END
  452. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  453. //eof