casemap.h 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. // © 2017 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. // casemap.h
  4. // created: 2017jan12 Markus W. Scherer
  5. #ifndef __CASEMAP_H__
  6. #define __CASEMAP_H__
  7. #include "unicode/utypes.h"
  8. #if U_SHOW_CPLUSPLUS_API
  9. #include "unicode/stringpiece.h"
  10. #include "unicode/uobject.h"
  11. /**
  12. * \file
  13. * \brief C++ API: Low-level C++ case mapping functions.
  14. */
  15. U_NAMESPACE_BEGIN
  16. class BreakIterator;
  17. class ByteSink;
  18. class Edits;
  19. /**
  20. * Low-level C++ case mapping functions.
  21. *
  22. * @stable ICU 59
  23. */
  24. class U_COMMON_API CaseMap final : public UMemory {
  25. public:
  26. /**
  27. * Lowercases a UTF-16 string and optionally records edits.
  28. * Casing is locale-dependent and context-sensitive.
  29. * The result may be longer or shorter than the original.
  30. * The source string and the destination buffer must not overlap.
  31. *
  32. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  33. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  34. * @param src The original string.
  35. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  36. * @param dest A buffer for the result string. The result will be NUL-terminated if
  37. * the buffer is large enough.
  38. * The contents is undefined in case of failure.
  39. * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
  40. * dest may be nullptr and the function will only return the length of the result
  41. * without writing any of the result string.
  42. * @param edits Records edits for index mapping, working with styled text,
  43. * and getting only changes (if any).
  44. * The Edits contents is undefined if any error occurs.
  45. * This function calls edits->reset() first unless
  46. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  47. * @param errorCode Reference to an in/out error code value
  48. * which must not indicate a failure before the function call.
  49. * @return The length of the result string, if successful.
  50. * When the result would be longer than destCapacity,
  51. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  52. *
  53. * @see u_strToLower
  54. * @stable ICU 59
  55. */
  56. static int32_t toLower(
  57. const char *locale, uint32_t options,
  58. const char16_t *src, int32_t srcLength,
  59. char16_t *dest, int32_t destCapacity, Edits *edits,
  60. UErrorCode &errorCode);
  61. /**
  62. * Uppercases a UTF-16 string and optionally records edits.
  63. * Casing is locale-dependent and context-sensitive.
  64. * The result may be longer or shorter than the original.
  65. * The source string and the destination buffer must not overlap.
  66. *
  67. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  68. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  69. * @param src The original string.
  70. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  71. * @param dest A buffer for the result string. The result will be NUL-terminated if
  72. * the buffer is large enough.
  73. * The contents is undefined in case of failure.
  74. * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
  75. * dest may be nullptr and the function will only return the length of the result
  76. * without writing any of the result string.
  77. * @param edits Records edits for index mapping, working with styled text,
  78. * and getting only changes (if any).
  79. * The Edits contents is undefined if any error occurs.
  80. * This function calls edits->reset() first unless
  81. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  82. * @param errorCode Reference to an in/out error code value
  83. * which must not indicate a failure before the function call.
  84. * @return The length of the result string, if successful.
  85. * When the result would be longer than destCapacity,
  86. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  87. *
  88. * @see u_strToUpper
  89. * @stable ICU 59
  90. */
  91. static int32_t toUpper(
  92. const char *locale, uint32_t options,
  93. const char16_t *src, int32_t srcLength,
  94. char16_t *dest, int32_t destCapacity, Edits *edits,
  95. UErrorCode &errorCode);
  96. #if !UCONFIG_NO_BREAK_ITERATION
  97. /**
  98. * Titlecases a UTF-16 string and optionally records edits.
  99. * Casing is locale-dependent and context-sensitive.
  100. * The result may be longer or shorter than the original.
  101. * The source string and the destination buffer must not overlap.
  102. *
  103. * Titlecasing uses a break iterator to find the first characters of words
  104. * that are to be titlecased. It titlecases those characters and lowercases
  105. * all others. (This can be modified with options bits.)
  106. *
  107. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  108. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
  109. * U_TITLECASE_NO_LOWERCASE,
  110. * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
  111. * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
  112. * @param iter A break iterator to find the first characters of words that are to be titlecased.
  113. * It is set to the source string (setText())
  114. * and used one or more times for iteration (first() and next()).
  115. * If nullptr, then a word break iterator for the locale is used
  116. * (or something equivalent).
  117. * @param src The original string.
  118. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  119. * @param dest A buffer for the result string. The result will be NUL-terminated if
  120. * the buffer is large enough.
  121. * The contents is undefined in case of failure.
  122. * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
  123. * dest may be nullptr and the function will only return the length of the result
  124. * without writing any of the result string.
  125. * @param edits Records edits for index mapping, working with styled text,
  126. * and getting only changes (if any).
  127. * The Edits contents is undefined if any error occurs.
  128. * This function calls edits->reset() first unless
  129. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  130. * @param errorCode Reference to an in/out error code value
  131. * which must not indicate a failure before the function call.
  132. * @return The length of the result string, if successful.
  133. * When the result would be longer than destCapacity,
  134. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  135. *
  136. * @see u_strToTitle
  137. * @see ucasemap_toTitle
  138. * @stable ICU 59
  139. */
  140. static int32_t toTitle(
  141. const char *locale, uint32_t options, BreakIterator *iter,
  142. const char16_t *src, int32_t srcLength,
  143. char16_t *dest, int32_t destCapacity, Edits *edits,
  144. UErrorCode &errorCode);
  145. #endif // UCONFIG_NO_BREAK_ITERATION
  146. /**
  147. * Case-folds a UTF-16 string and optionally records edits.
  148. *
  149. * Case folding is locale-independent and not context-sensitive,
  150. * but there is an option for whether to include or exclude mappings for dotted I
  151. * and dotless i that are marked with 'T' in CaseFolding.txt.
  152. *
  153. * The result may be longer or shorter than the original.
  154. * The source string and the destination buffer must not overlap.
  155. *
  156. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
  157. * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
  158. * @param src The original string.
  159. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  160. * @param dest A buffer for the result string. The result will be NUL-terminated if
  161. * the buffer is large enough.
  162. * The contents is undefined in case of failure.
  163. * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
  164. * dest may be nullptr and the function will only return the length of the result
  165. * without writing any of the result string.
  166. * @param edits Records edits for index mapping, working with styled text,
  167. * and getting only changes (if any).
  168. * The Edits contents is undefined if any error occurs.
  169. * This function calls edits->reset() first unless
  170. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  171. * @param errorCode Reference to an in/out error code value
  172. * which must not indicate a failure before the function call.
  173. * @return The length of the result string, if successful.
  174. * When the result would be longer than destCapacity,
  175. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  176. *
  177. * @see u_strFoldCase
  178. * @stable ICU 59
  179. */
  180. static int32_t fold(
  181. uint32_t options,
  182. const char16_t *src, int32_t srcLength,
  183. char16_t *dest, int32_t destCapacity, Edits *edits,
  184. UErrorCode &errorCode);
  185. /**
  186. * Lowercases a UTF-8 string and optionally records edits.
  187. * Casing is locale-dependent and context-sensitive.
  188. * The result may be longer or shorter than the original.
  189. *
  190. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  191. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  192. * @param src The original string.
  193. * @param sink A ByteSink to which the result string is written.
  194. * sink.Flush() is called at the end.
  195. * @param edits Records edits for index mapping, working with styled text,
  196. * and getting only changes (if any).
  197. * The Edits contents is undefined if any error occurs.
  198. * This function calls edits->reset() first unless
  199. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  200. * @param errorCode Reference to an in/out error code value
  201. * which must not indicate a failure before the function call.
  202. *
  203. * @see ucasemap_utf8ToLower
  204. * @stable ICU 60
  205. */
  206. static void utf8ToLower(
  207. const char *locale, uint32_t options,
  208. StringPiece src, ByteSink &sink, Edits *edits,
  209. UErrorCode &errorCode);
  210. /**
  211. * Uppercases a UTF-8 string and optionally records edits.
  212. * Casing is locale-dependent and context-sensitive.
  213. * The result may be longer or shorter than the original.
  214. *
  215. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  216. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  217. * @param src The original string.
  218. * @param sink A ByteSink to which the result string is written.
  219. * sink.Flush() is called at the end.
  220. * @param edits Records edits for index mapping, working with styled text,
  221. * and getting only changes (if any).
  222. * The Edits contents is undefined if any error occurs.
  223. * This function calls edits->reset() first unless
  224. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  225. * @param errorCode Reference to an in/out error code value
  226. * which must not indicate a failure before the function call.
  227. *
  228. * @see ucasemap_utf8ToUpper
  229. * @stable ICU 60
  230. */
  231. static void utf8ToUpper(
  232. const char *locale, uint32_t options,
  233. StringPiece src, ByteSink &sink, Edits *edits,
  234. UErrorCode &errorCode);
  235. #if !UCONFIG_NO_BREAK_ITERATION
  236. /**
  237. * Titlecases a UTF-8 string and optionally records edits.
  238. * Casing is locale-dependent and context-sensitive.
  239. * The result may be longer or shorter than the original.
  240. *
  241. * Titlecasing uses a break iterator to find the first characters of words
  242. * that are to be titlecased. It titlecases those characters and lowercases
  243. * all others. (This can be modified with options bits.)
  244. *
  245. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  246. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
  247. * U_TITLECASE_NO_LOWERCASE,
  248. * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
  249. * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
  250. * @param iter A break iterator to find the first characters of words that are to be titlecased.
  251. * It is set to the source string (setUText())
  252. * and used one or more times for iteration (first() and next()).
  253. * If nullptr, then a word break iterator for the locale is used
  254. * (or something equivalent).
  255. * @param src The original string.
  256. * @param sink A ByteSink to which the result string is written.
  257. * sink.Flush() is called at the end.
  258. * @param edits Records edits for index mapping, working with styled text,
  259. * and getting only changes (if any).
  260. * The Edits contents is undefined if any error occurs.
  261. * This function calls edits->reset() first unless
  262. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  263. * @param errorCode Reference to an in/out error code value
  264. * which must not indicate a failure before the function call.
  265. *
  266. * @see ucasemap_utf8ToTitle
  267. * @stable ICU 60
  268. */
  269. static void utf8ToTitle(
  270. const char *locale, uint32_t options, BreakIterator *iter,
  271. StringPiece src, ByteSink &sink, Edits *edits,
  272. UErrorCode &errorCode);
  273. #endif // UCONFIG_NO_BREAK_ITERATION
  274. /**
  275. * Case-folds a UTF-8 string and optionally records edits.
  276. *
  277. * Case folding is locale-independent and not context-sensitive,
  278. * but there is an option for whether to include or exclude mappings for dotted I
  279. * and dotless i that are marked with 'T' in CaseFolding.txt.
  280. *
  281. * The result may be longer or shorter than the original.
  282. *
  283. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  284. * @param src The original string.
  285. * @param sink A ByteSink to which the result string is written.
  286. * sink.Flush() is called at the end.
  287. * @param edits Records edits for index mapping, working with styled text,
  288. * and getting only changes (if any).
  289. * The Edits contents is undefined if any error occurs.
  290. * This function calls edits->reset() first unless
  291. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  292. * @param errorCode Reference to an in/out error code value
  293. * which must not indicate a failure before the function call.
  294. *
  295. * @see ucasemap_utf8FoldCase
  296. * @stable ICU 60
  297. */
  298. static void utf8Fold(
  299. uint32_t options,
  300. StringPiece src, ByteSink &sink, Edits *edits,
  301. UErrorCode &errorCode);
  302. /**
  303. * Lowercases a UTF-8 string and optionally records edits.
  304. * Casing is locale-dependent and context-sensitive.
  305. * The result may be longer or shorter than the original.
  306. * The source string and the destination buffer must not overlap.
  307. *
  308. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  309. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  310. * @param src The original string.
  311. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  312. * @param dest A buffer for the result string. The result will be NUL-terminated if
  313. * the buffer is large enough.
  314. * The contents is undefined in case of failure.
  315. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  316. * dest may be nullptr and the function will only return the length of the result
  317. * without writing any of the result string.
  318. * @param edits Records edits for index mapping, working with styled text,
  319. * and getting only changes (if any).
  320. * The Edits contents is undefined if any error occurs.
  321. * This function calls edits->reset() first unless
  322. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  323. * @param errorCode Reference to an in/out error code value
  324. * which must not indicate a failure before the function call.
  325. * @return The length of the result string, if successful.
  326. * When the result would be longer than destCapacity,
  327. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  328. *
  329. * @see ucasemap_utf8ToLower
  330. * @stable ICU 59
  331. */
  332. static int32_t utf8ToLower(
  333. const char *locale, uint32_t options,
  334. const char *src, int32_t srcLength,
  335. char *dest, int32_t destCapacity, Edits *edits,
  336. UErrorCode &errorCode);
  337. /**
  338. * Uppercases a UTF-8 string and optionally records edits.
  339. * Casing is locale-dependent and context-sensitive.
  340. * The result may be longer or shorter than the original.
  341. * The source string and the destination buffer must not overlap.
  342. *
  343. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  344. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
  345. * @param src The original string.
  346. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  347. * @param dest A buffer for the result string. The result will be NUL-terminated if
  348. * the buffer is large enough.
  349. * The contents is undefined in case of failure.
  350. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  351. * dest may be nullptr and the function will only return the length of the result
  352. * without writing any of the result string.
  353. * @param edits Records edits for index mapping, working with styled text,
  354. * and getting only changes (if any).
  355. * The Edits contents is undefined if any error occurs.
  356. * This function calls edits->reset() first unless
  357. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  358. * @param errorCode Reference to an in/out error code value
  359. * which must not indicate a failure before the function call.
  360. * @return The length of the result string, if successful.
  361. * When the result would be longer than destCapacity,
  362. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  363. *
  364. * @see ucasemap_utf8ToUpper
  365. * @stable ICU 59
  366. */
  367. static int32_t utf8ToUpper(
  368. const char *locale, uint32_t options,
  369. const char *src, int32_t srcLength,
  370. char *dest, int32_t destCapacity, Edits *edits,
  371. UErrorCode &errorCode);
  372. #if !UCONFIG_NO_BREAK_ITERATION
  373. /**
  374. * Titlecases a UTF-8 string and optionally records edits.
  375. * Casing is locale-dependent and context-sensitive.
  376. * The result may be longer or shorter than the original.
  377. * The source string and the destination buffer must not overlap.
  378. *
  379. * Titlecasing uses a break iterator to find the first characters of words
  380. * that are to be titlecased. It titlecases those characters and lowercases
  381. * all others. (This can be modified with options bits.)
  382. *
  383. * @param locale The locale ID. ("" = root locale, nullptr = default locale.)
  384. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
  385. * U_TITLECASE_NO_LOWERCASE,
  386. * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
  387. * U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
  388. * @param iter A break iterator to find the first characters of words that are to be titlecased.
  389. * It is set to the source string (setUText())
  390. * and used one or more times for iteration (first() and next()).
  391. * If nullptr, then a word break iterator for the locale is used
  392. * (or something equivalent).
  393. * @param src The original string.
  394. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  395. * @param dest A buffer for the result string. The result will be NUL-terminated if
  396. * the buffer is large enough.
  397. * The contents is undefined in case of failure.
  398. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  399. * dest may be nullptr and the function will only return the length of the result
  400. * without writing any of the result string.
  401. * @param edits Records edits for index mapping, working with styled text,
  402. * and getting only changes (if any).
  403. * The Edits contents is undefined if any error occurs.
  404. * This function calls edits->reset() first unless
  405. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  406. * @param errorCode Reference to an in/out error code value
  407. * which must not indicate a failure before the function call.
  408. * @return The length of the result string, if successful.
  409. * When the result would be longer than destCapacity,
  410. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  411. *
  412. * @see ucasemap_utf8ToTitle
  413. * @stable ICU 59
  414. */
  415. static int32_t utf8ToTitle(
  416. const char *locale, uint32_t options, BreakIterator *iter,
  417. const char *src, int32_t srcLength,
  418. char *dest, int32_t destCapacity, Edits *edits,
  419. UErrorCode &errorCode);
  420. #endif // UCONFIG_NO_BREAK_ITERATION
  421. /**
  422. * Case-folds a UTF-8 string and optionally records edits.
  423. *
  424. * Case folding is locale-independent and not context-sensitive,
  425. * but there is an option for whether to include or exclude mappings for dotted I
  426. * and dotless i that are marked with 'T' in CaseFolding.txt.
  427. *
  428. * The result may be longer or shorter than the original.
  429. * The source string and the destination buffer must not overlap.
  430. *
  431. * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
  432. * U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
  433. * @param src The original string.
  434. * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
  435. * @param dest A buffer for the result string. The result will be NUL-terminated if
  436. * the buffer is large enough.
  437. * The contents is undefined in case of failure.
  438. * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
  439. * dest may be nullptr and the function will only return the length of the result
  440. * without writing any of the result string.
  441. * @param edits Records edits for index mapping, working with styled text,
  442. * and getting only changes (if any).
  443. * The Edits contents is undefined if any error occurs.
  444. * This function calls edits->reset() first unless
  445. * options includes U_EDITS_NO_RESET. edits can be nullptr.
  446. * @param errorCode Reference to an in/out error code value
  447. * which must not indicate a failure before the function call.
  448. * @return The length of the result string, if successful.
  449. * When the result would be longer than destCapacity,
  450. * the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
  451. *
  452. * @see ucasemap_utf8FoldCase
  453. * @stable ICU 59
  454. */
  455. static int32_t utf8Fold(
  456. uint32_t options,
  457. const char *src, int32_t srcLength,
  458. char *dest, int32_t destCapacity, Edits *edits,
  459. UErrorCode &errorCode);
  460. private:
  461. CaseMap() = delete;
  462. CaseMap(const CaseMap &other) = delete;
  463. CaseMap &operator=(const CaseMap &other) = delete;
  464. };
  465. U_NAMESPACE_END
  466. #endif /* U_SHOW_CPLUSPLUS_API */
  467. #endif // __CASEMAP_H__