uset.h 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121
  1. /*
  2. *******************************************************************************
  3. *
  4. * Copyright (C) 2002-2010, International Business Machines
  5. * Corporation and others. All Rights Reserved.
  6. *
  7. *******************************************************************************
  8. * file name: uset.h
  9. * encoding: US-ASCII
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2002mar07
  14. * created by: Markus W. Scherer
  15. *
  16. * C version of UnicodeSet.
  17. */
  18. /**
  19. * \file
  20. * \brief C API: Unicode Set
  21. *
  22. * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
  23. */
  24. #ifndef __USET_H__
  25. #define __USET_H__
  26. #include "unicode/utypes.h"
  27. #include "unicode/uchar.h"
  28. #include "unicode/localpointer.h"
  29. #ifndef UCNV_H
  30. struct USet;
  31. /**
  32. * A UnicodeSet. Use the uset_* API to manipulate. Create with
  33. * uset_open*, and destroy with uset_close.
  34. * @stable ICU 2.4
  35. */
  36. typedef struct USet USet;
  37. #endif
  38. /**
  39. * Bitmask values to be passed to uset_openPatternOptions() or
  40. * uset_applyPattern() taking an option parameter.
  41. * @stable ICU 2.4
  42. */
  43. enum {
  44. /**
  45. * Ignore white space within patterns unless quoted or escaped.
  46. * @stable ICU 2.4
  47. */
  48. USET_IGNORE_SPACE = 1,
  49. /**
  50. * Enable case insensitive matching. E.g., "[ab]" with this flag
  51. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  52. * match all except 'a', 'A', 'b', and 'B'. This performs a full
  53. * closure over case mappings, e.g. U+017F for s.
  54. *
  55. * The resulting set is a superset of the input for the code points but
  56. * not for the strings.
  57. * It performs a case mapping closure of the code points and adds
  58. * full case folding strings for the code points, and reduces strings of
  59. * the original set to their full case folding equivalents.
  60. *
  61. * This is designed for case-insensitive matches, for example
  62. * in regular expressions. The full code point case closure allows checking of
  63. * an input character directly against the closure set.
  64. * Strings are matched by comparing the case-folded form from the closure
  65. * set with an incremental case folding of the string in question.
  66. *
  67. * The closure set will also contain single code points if the original
  68. * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
  69. * This is not necessary (that is, redundant) for the above matching method
  70. * but results in the same closure sets regardless of whether the original
  71. * set contained the code point or a string.
  72. *
  73. * @stable ICU 2.4
  74. */
  75. USET_CASE_INSENSITIVE = 2,
  76. /**
  77. * Enable case insensitive matching. E.g., "[ab]" with this flag
  78. * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
  79. * match all except 'a', 'A', 'b', and 'B'. This adds the lower-,
  80. * title-, and uppercase mappings as well as the case folding
  81. * of each existing element in the set.
  82. * @stable ICU 3.2
  83. */
  84. USET_ADD_CASE_MAPPINGS = 4,
  85. /**
  86. * Enough for any single-code point set
  87. * @internal
  88. */
  89. USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
  90. };
  91. /**
  92. * Argument values for whether span() and similar functions continue while
  93. * the current character is contained vs. not contained in the set.
  94. *
  95. * The functionality is straightforward for sets with only single code points,
  96. * without strings (which is the common case):
  97. * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE
  98. * work the same.
  99. * - span() and spanBack() partition any string the same way when
  100. * alternating between span(USET_SPAN_NOT_CONTAINED) and
  101. * span(either "contained" condition).
  102. * - Using a complemented (inverted) set and the opposite span conditions
  103. * yields the same results.
  104. *
  105. * When a set contains multi-code point strings, then these statements may not
  106. * be true, depending on the strings in the set (for example, whether they
  107. * overlap with each other) and the string that is processed.
  108. * For a set with strings:
  109. * - The complement of the set contains the opposite set of code points,
  110. * but the same set of strings.
  111. * Therefore, complementing both the set and the span conditions
  112. * may yield different results.
  113. * - When starting spans at different positions in a string
  114. * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
  115. * because a set string may start before the later position.
  116. * - span(USET_SPAN_SIMPLE) may be shorter than
  117. * span(USET_SPAN_CONTAINED) because it will not recursively try
  118. * all possible paths.
  119. * For example, with a set which contains the three strings "xy", "xya" and "ax",
  120. * span("xyax", USET_SPAN_CONTAINED) will return 4 but
  121. * span("xyax", USET_SPAN_SIMPLE) will return 3.
  122. * span(USET_SPAN_SIMPLE) will never be longer than
  123. * span(USET_SPAN_CONTAINED).
  124. * - With either "contained" condition, span() and spanBack() may partition
  125. * a string in different ways.
  126. * For example, with a set which contains the two strings "ab" and "ba",
  127. * and when processing the string "aba",
  128. * span() will yield contained/not-contained boundaries of { 0, 2, 3 }
  129. * while spanBack() will yield boundaries of { 0, 1, 3 }.
  130. *
  131. * Note: If it is important to get the same boundaries whether iterating forward
  132. * or backward through a string, then either only span() should be used and
  133. * the boundaries cached for backward operation, or an ICU BreakIterator
  134. * could be used.
  135. *
  136. * Note: Unpaired surrogates are treated like surrogate code points.
  137. * Similarly, set strings match only on code point boundaries,
  138. * never in the middle of a surrogate pair.
  139. * Illegal UTF-8 sequences are treated like U+FFFD.
  140. * When processing UTF-8 strings, malformed set strings
  141. * (strings with unpaired surrogates which cannot be converted to UTF-8)
  142. * are ignored.
  143. *
  144. * @stable ICU 3.8
  145. */
  146. typedef enum USetSpanCondition {
  147. /**
  148. * Continue a span() while there is no set element at the current position.
  149. * Stops before the first set element (character or string).
  150. * (For code points only, this is like while contains(current)==FALSE).
  151. *
  152. * When span() returns, the substring between where it started and the position
  153. * it returned consists only of characters that are not in the set,
  154. * and none of its strings overlap with the span.
  155. *
  156. * @stable ICU 3.8
  157. */
  158. USET_SPAN_NOT_CONTAINED = 0,
  159. /**
  160. * Continue a span() while there is a set element at the current position.
  161. * (For characters only, this is like while contains(current)==TRUE).
  162. *
  163. * When span() returns, the substring between where it started and the position
  164. * it returned consists only of set elements (characters or strings) that are in the set.
  165. *
  166. * If a set contains strings, then the span will be the longest substring
  167. * matching any of the possible concatenations of set elements (characters or strings).
  168. * (There must be a single, non-overlapping concatenation of characters or strings.)
  169. * This is equivalent to a POSIX regular expression for (OR of each set element)*.
  170. *
  171. * @stable ICU 3.8
  172. */
  173. USET_SPAN_CONTAINED = 1,
  174. /**
  175. * Continue a span() while there is a set element at the current position.
  176. * (For characters only, this is like while contains(current)==TRUE).
  177. *
  178. * When span() returns, the substring between where it started and the position
  179. * it returned consists only of set elements (characters or strings) that are in the set.
  180. *
  181. * If a set only contains single characters, then this is the same
  182. * as USET_SPAN_CONTAINED.
  183. *
  184. * If a set contains strings, then the span will be the longest substring
  185. * with a match at each position with the longest single set element (character or string).
  186. *
  187. * Use this span condition together with other longest-match algorithms,
  188. * such as ICU converters (ucnv_getUnicodeSet()).
  189. *
  190. * @stable ICU 3.8
  191. */
  192. USET_SPAN_SIMPLE = 2,
  193. /**
  194. * One more than the last span condition.
  195. * @stable ICU 3.8
  196. */
  197. USET_SPAN_CONDITION_COUNT
  198. } USetSpanCondition;
  199. /**
  200. * A serialized form of a Unicode set. Limited manipulations are
  201. * possible directly on a serialized set. See below.
  202. * @stable ICU 2.4
  203. */
  204. typedef struct USerializedSet {
  205. /**
  206. * The serialized Unicode Set.
  207. * @stable ICU 2.4
  208. */
  209. const uint16_t *array;
  210. /**
  211. * The length of the array that contains BMP characters.
  212. * @stable ICU 2.4
  213. */
  214. int32_t bmpLength;
  215. /**
  216. * The total length of the array.
  217. * @stable ICU 2.4
  218. */
  219. int32_t length;
  220. /**
  221. * A small buffer for the array to reduce memory allocations.
  222. * @stable ICU 2.4
  223. */
  224. uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
  225. } USerializedSet;
  226. /*********************************************************************
  227. * USet API
  228. *********************************************************************/
  229. /**
  230. * Create an empty USet object.
  231. * Equivalent to uset_open(1, 0), i.e. a range with start > end.
  232. * @return a newly created USet. The caller must call uset_close() on
  233. * it when done.
  234. * @stable ICU 4.2
  235. */
  236. U_STABLE USet* U_EXPORT2
  237. uset_openEmpty(void);
  238. /**
  239. * Creates a USet object that contains the range of characters
  240. * start..end, inclusive. If <code>start > end</code>
  241. * then an empty set is created (same as using uset_openEmpty()).
  242. * @param start first character of the range, inclusive
  243. * @param end last character of the range, inclusive
  244. * @return a newly created USet. The caller must call uset_close() on
  245. * it when done.
  246. * @stable ICU 2.4
  247. */
  248. U_STABLE USet* U_EXPORT2
  249. uset_open(UChar32 start, UChar32 end);
  250. /**
  251. * Creates a set from the given pattern. See the UnicodeSet class
  252. * description for the syntax of the pattern language.
  253. * @param pattern a string specifying what characters are in the set
  254. * @param patternLength the length of the pattern, or -1 if null
  255. * terminated
  256. * @param ec the error code
  257. * @stable ICU 2.4
  258. */
  259. U_STABLE USet* U_EXPORT2
  260. uset_openPattern(const UChar* pattern, int32_t patternLength,
  261. UErrorCode* ec);
  262. /**
  263. * Creates a set from the given pattern. See the UnicodeSet class
  264. * description for the syntax of the pattern language.
  265. * @param pattern a string specifying what characters are in the set
  266. * @param patternLength the length of the pattern, or -1 if null
  267. * terminated
  268. * @param options bitmask for options to apply to the pattern.
  269. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
  270. * @param ec the error code
  271. * @stable ICU 2.4
  272. */
  273. U_STABLE USet* U_EXPORT2
  274. uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
  275. uint32_t options,
  276. UErrorCode* ec);
  277. /**
  278. * Disposes of the storage used by a USet object. This function should
  279. * be called exactly once for objects returned by the uset_open*() functions.
  280. * @param set the object to dispose of
  281. * @stable ICU 2.4
  282. */
  283. U_STABLE void U_EXPORT2
  284. uset_close(USet* set);
  285. #if U_SHOW_CPLUSPLUS_API
  286. U_NAMESPACE_BEGIN
  287. /**
  288. * \class LocalUSetPointer
  289. * "Smart pointer" class, closes a USet via uset_close().
  290. * For most methods see the LocalPointerBase base class.
  291. *
  292. * @see LocalPointerBase
  293. * @see LocalPointer
  294. * @stable ICU 4.4
  295. */
  296. U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
  297. U_NAMESPACE_END
  298. #endif
  299. /**
  300. * Returns a copy of this object.
  301. * If this set is frozen, then the clone will be frozen as well.
  302. * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
  303. * @param set the original set
  304. * @return the newly allocated copy of the set
  305. * @see uset_cloneAsThawed
  306. * @stable ICU 3.8
  307. */
  308. U_STABLE USet * U_EXPORT2
  309. uset_clone(const USet *set);
  310. /**
  311. * Determines whether the set has been frozen (made immutable) or not.
  312. * See the ICU4J Freezable interface for details.
  313. * @param set the set
  314. * @return TRUE/FALSE for whether the set has been frozen
  315. * @see uset_freeze
  316. * @see uset_cloneAsThawed
  317. * @stable ICU 3.8
  318. */
  319. U_STABLE UBool U_EXPORT2
  320. uset_isFrozen(const USet *set);
  321. /**
  322. * Freeze the set (make it immutable).
  323. * Once frozen, it cannot be unfrozen and is therefore thread-safe
  324. * until it is deleted.
  325. * See the ICU4J Freezable interface for details.
  326. * Freezing the set may also make some operations faster, for example
  327. * uset_contains() and uset_span().
  328. * A frozen set will not be modified. (It remains frozen.)
  329. * @param set the set
  330. * This function does not return a value; the set passed in is frozen in place.
  331. * @see uset_isFrozen
  332. * @see uset_cloneAsThawed
  333. * @stable ICU 3.8
  334. */
  335. U_STABLE void U_EXPORT2
  336. uset_freeze(USet *set);
  337. /**
  338. * Clone the set and make the clone mutable.
  339. * See the ICU4J Freezable interface for details.
  340. * @param set the set
  341. * @return the mutable clone
  342. * @see uset_freeze
  343. * @see uset_isFrozen
  344. * @see uset_clone
  345. * @stable ICU 3.8
  346. */
  347. U_STABLE USet * U_EXPORT2
  348. uset_cloneAsThawed(const USet *set);
  349. /**
  350. * Causes the USet object to represent the range <code>start - end</code>.
  351. * If <code>start > end</code> then this USet is set to an empty range.
  352. * A frozen set will not be modified.
  353. * @param set the object to set to the given range
  354. * @param start first character in the set, inclusive
  355. * @param end last character in the set, inclusive
  356. * @stable ICU 3.2
  357. */
  358. U_STABLE void U_EXPORT2
  359. uset_set(USet* set,
  360. UChar32 start, UChar32 end);
  361. /**
  362. * Modifies the set to represent the set specified by the given
  363. * pattern. See the UnicodeSet class description for the syntax of
  364. * the pattern language. See also the User Guide chapter about UnicodeSet.
  365. * <em>Empties the set passed before applying the pattern.</em>
  366. * A frozen set will not be modified.
  367. * @param set The set to which the pattern is to be applied.
  368. * @param pattern A pointer to UChar string specifying what characters are in the set.
  369. * The character at pattern[0] must be a '['.
  370. * @param patternLength The length of the UChar string. -1 if NUL terminated.
  371. * @param options A bitmask for options to apply to the pattern.
  372. * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
  373. * @param status Returns an error if the pattern cannot be parsed.
  374. * @return Upon successful parse, the value is
  375. * the index of the character after the closing ']'
  376. * of the parsed pattern.
  377. * If the status code indicates failure, then the return value
  378. * is the index of the error in the source.
  379. *
  380. * @stable ICU 2.8
  381. */
  382. U_STABLE int32_t U_EXPORT2
  383. uset_applyPattern(USet *set,
  384. const UChar *pattern, int32_t patternLength,
  385. uint32_t options,
  386. UErrorCode *status);
  387. /**
  388. * Modifies the set to contain those code points which have the given value
  389. * for the given binary or enumerated property, as returned by
  390. * u_getIntPropertyValue. Prior contents of this set are lost.
  391. * A frozen set will not be modified.
  392. *
  393. * @param set the object to contain the code points defined by the property
  394. *
  395. * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
  396. * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
  397. * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
  398. *
  399. * @param value a value in the range u_getIntPropertyMinValue(prop)..
  400. * u_getIntPropertyMaxValue(prop), with one exception. If prop is
  401. * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
  402. * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
  403. * categories such as [:L:] to be represented.
  404. *
  405. * @param ec error code input/output parameter
  406. *
  407. * @stable ICU 3.2
  408. */
  409. U_STABLE void U_EXPORT2
  410. uset_applyIntPropertyValue(USet* set,
  411. UProperty prop, int32_t value, UErrorCode* ec);
  412. /**
  413. * Modifies the set to contain those code points which have the
  414. * given value for the given property. Prior contents of this
  415. * set are lost.
  416. * A frozen set will not be modified.
  417. *
  418. * @param set the object to contain the code points defined by the given
  419. * property and value alias
  420. *
  421. * @param prop a string specifying a property alias, either short or long.
  422. * The name is matched loosely. See PropertyAliases.txt for names and a
  423. * description of loose matching. If the value string is empty, then this
  424. * string is interpreted as either a General_Category value alias, a Script
  425. * value alias, a binary property alias, or a special ID. Special IDs are
  426. * matched loosely and correspond to the following sets:
  427. *
  428. * "ANY" = [\\u0000-\\U0010FFFF],
  429. * "ASCII" = [\\u0000-\\u007F],
  430. * "Assigned" = [:^Cn:].
  431. *
  432. * @param propLength the length of the prop, or -1 if NUL-terminated
  433. *
  434. * @param value a string specifying a value alias, either short or long.
  435. * The name is matched loosely. See PropertyValueAliases.txt for names
  436. * and a description of loose matching. In addition to aliases listed,
  437. * numeric values and canonical combining classes may be expressed
  438. * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string
  439. * may also be empty.
  440. *
  441. * @param valueLength the length of the value, or -1 if NUL-terminated
  442. *
  443. * @param ec error code input/output parameter
  444. *
  445. * @stable ICU 3.2
  446. */
  447. U_STABLE void U_EXPORT2
  448. uset_applyPropertyAlias(USet* set,
  449. const UChar *prop, int32_t propLength,
  450. const UChar *value, int32_t valueLength,
  451. UErrorCode* ec);
  452. /**
  453. * Return true if the given position, in the given pattern, appears
  454. * to be the start of a UnicodeSet pattern.
  455. *
  456. * @param pattern a string specifying the pattern
  457. * @param patternLength the length of the pattern, or -1 if NUL-terminated
  458. * @param pos the given position
  459. * @stable ICU 3.2
  460. */
  461. U_STABLE UBool U_EXPORT2
  462. uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
  463. int32_t pos);
  464. /**
  465. * Returns a string representation of this set. If the result of
  466. * calling this function is passed to a uset_openPattern(), it
  467. * will produce another set that is equal to this one.
  468. * @param set the set
  469. * @param result the string to receive the rules, may be NULL
  470. * @param resultCapacity the capacity of result, may be 0 if result is NULL
  471. * @param escapeUnprintable if TRUE then convert unprintable
  472. * character to their hex escape representations, \\uxxxx or
  473. * \\Uxxxxxxxx. Unprintable characters are those other than
  474. * U+000A, U+0020..U+007E.
  475. * @param ec error code.
  476. * @return length of string, possibly larger than resultCapacity
  477. * @stable ICU 2.4
  478. */
  479. U_STABLE int32_t U_EXPORT2
  480. uset_toPattern(const USet* set,
  481. UChar* result, int32_t resultCapacity,
  482. UBool escapeUnprintable,
  483. UErrorCode* ec);
  484. /**
  485. * Adds the given character to the given USet. After this call,
  486. * uset_contains(set, c) will return TRUE.
  487. * A frozen set will not be modified.
  488. * @param set the object to which to add the character
  489. * @param c the character to add
  490. * @stable ICU 2.4
  491. */
  492. U_STABLE void U_EXPORT2
  493. uset_add(USet* set, UChar32 c);
  494. /**
  495. * Adds all of the elements in the specified set to this set if
  496. * they're not already present. This operation effectively
  497. * modifies this set so that its value is the <i>union</i> of the two
  498. * sets. The behavior of this operation is unspecified if the specified
  499. * collection is modified while the operation is in progress.
  500. * A frozen set will not be modified.
  501. *
  502. * @param set the object to which to add the set
  503. * @param additionalSet the source set whose elements are to be added to this set.
  504. * @stable ICU 2.6
  505. */
  506. U_STABLE void U_EXPORT2
  507. uset_addAll(USet* set, const USet *additionalSet);
  508. /**
  509. * Adds the given range of characters to the given USet. After this call,
  510. * uset_containsRange(set, start, end) will return TRUE.
  511. * A frozen set will not be modified.
  512. * @param set the object to which to add the range
  513. * @param start the first character of the range to add, inclusive
  514. * @param end the last character of the range to add, inclusive
  515. * @stable ICU 2.2
  516. */
  517. U_STABLE void U_EXPORT2
  518. uset_addRange(USet* set, UChar32 start, UChar32 end);
  519. /**
  520. * Adds the given string to the given USet. After this call,
  521. * uset_containsString(set, str, strLen) will return TRUE.
  522. * A frozen set will not be modified.
  523. * @param set the object to which to add the string
  524. * @param str the string to add
  525. * @param strLen the length of the string or -1 if null terminated.
  526. * @stable ICU 2.4
  527. */
  528. U_STABLE void U_EXPORT2
  529. uset_addString(USet* set, const UChar* str, int32_t strLen);
  530. /**
  531. * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}.
  532. * If this set already contains any particular character, it has no effect on that character.
  533. * A frozen set will not be modified.
  534. * @param set the object to which to add the characters
  535. * @param str the source string
  536. * @param strLen the length of the string or -1 if null terminated.
  537. * @stable ICU 3.4
  538. */
  539. U_STABLE void U_EXPORT2
  540. uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
  541. /**
  542. * Removes the given character from the given USet. After this call,
  543. * uset_contains(set, c) will return FALSE.
  544. * A frozen set will not be modified.
  545. * @param set the object from which to remove the character
  546. * @param c the character to remove
  547. * @stable ICU 2.4
  548. */
  549. U_STABLE void U_EXPORT2
  550. uset_remove(USet* set, UChar32 c);
  551. /**
  552. * Removes the given range of characters from the given USet. After this call,
  553. * uset_containsRange(set, start, end) will return FALSE.
  554. * A frozen set will not be modified.
  555. * @param set the object from which to remove the range
  556. * @param start the first character of the range to remove, inclusive
  557. * @param end the last character of the range to remove, inclusive
  558. * @stable ICU 2.2
  559. */
  560. U_STABLE void U_EXPORT2
  561. uset_removeRange(USet* set, UChar32 start, UChar32 end);
  562. /**
  563. * Removes the given string from the given USet. After this call,
  564. * uset_containsString(set, str, strLen) will return FALSE.
  565. * A frozen set will not be modified.
  566. * @param set the object from which to remove the string
  567. * @param str the string to remove
  568. * @param strLen the length of the string or -1 if null terminated.
  569. * @stable ICU 2.4
  570. */
  571. U_STABLE void U_EXPORT2
  572. uset_removeString(USet* set, const UChar* str, int32_t strLen);
  573. /**
  574. * Removes from this set all of its elements that are contained in the
  575. * specified set. This operation effectively modifies this
  576. * set so that its value is the <i>asymmetric set difference</i> of
  577. * the two sets.
  578. * A frozen set will not be modified.
  579. * @param set the object from which the elements are to be removed
  580. * @param removeSet the object that defines which elements will be
  581. * removed from this set
  582. * @stable ICU 3.2
  583. */
  584. U_STABLE void U_EXPORT2
  585. uset_removeAll(USet* set, const USet* removeSet);
  586. /**
  587. * Retain only the elements in this set that are contained in the
  588. * specified range. If <code>start > end</code> then an empty range is
  589. * retained, leaving the set empty. This is equivalent to
  590. * a boolean logic AND, or a set INTERSECTION.
  591. * A frozen set will not be modified.
  592. *
  593. * @param set the object for which to retain only the specified range
  594. * @param start first character, inclusive, of range to be retained
  595. * to this set.
  596. * @param end last character, inclusive, of range to be retained
  597. * to this set.
  598. * @stable ICU 3.2
  599. */
  600. U_STABLE void U_EXPORT2
  601. uset_retain(USet* set, UChar32 start, UChar32 end);
  602. /**
  603. * Retains only the elements in this set that are contained in the
  604. * specified set. In other words, removes from this set all of
  605. * its elements that are not contained in the specified set. This
  606. * operation effectively modifies this set so that its value is
  607. * the <i>intersection</i> of the two sets.
  608. * A frozen set will not be modified.
  609. *
  610. * @param set the object on which to perform the retain
  611. * @param retain set that defines which elements this set will retain
  612. * @stable ICU 3.2
  613. */
  614. U_STABLE void U_EXPORT2
  615. uset_retainAll(USet* set, const USet* retain);
  616. /**
  617. * Reallocate this object's internal structures to take up the least
  618. * possible space, without changing this object's value.
  619. * A frozen set will not be modified.
  620. *
  621. * @param set the object on which to perform the compaction
  622. * @stable ICU 3.2
  623. */
  624. U_STABLE void U_EXPORT2
  625. uset_compact(USet* set);
  626. /**
  627. * Inverts this set. This operation modifies this set so that
  628. * its value is its complement. This operation does not affect
  629. * the multicharacter strings, if any.
  630. * A frozen set will not be modified.
  631. * @param set the set
  632. * @stable ICU 2.4
  633. */
  634. U_STABLE void U_EXPORT2
  635. uset_complement(USet* set);
  636. /**
  637. * Complements in this set all elements contained in the specified
  638. * set. Any character in the other set will be removed if it is
  639. * in this set, or will be added if it is not in this set.
  640. * A frozen set will not be modified.
  641. *
  642. * @param set the set with which to complement
  643. * @param complement set that defines which elements will be xor'ed
  644. * with this set.
  645. * @stable ICU 3.2
  646. */
  647. U_STABLE void U_EXPORT2
  648. uset_complementAll(USet* set, const USet* complement);
  649. /**
  650. * Removes all of the elements from this set. This set will be
  651. * empty after this call returns.
  652. * A frozen set will not be modified.
  653. * @param set the set
  654. * @stable ICU 2.4
  655. */
  656. U_STABLE void U_EXPORT2
  657. uset_clear(USet* set);
  658. /**
  659. * Close this set over the given attribute. For the attribute
  660. * USET_CASE_INSENSITIVE, the result is to modify this set so that:
  661. *
  662. * 1. For each character or string 'a' in this set, all strings or
  663. * characters 'b' such that foldCase(a) == foldCase(b) are added
  664. * to this set.
  665. *
  666. * 2. For each string 'e' in the resulting set, if e !=
  667. * foldCase(e), 'e' will be removed.
  668. *
  669. * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  670. *
  671. * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  672. * == b denotes that the contents are the same, not pointer
  673. * comparison.)
  674. *
  675. * A frozen set will not be modified.
  676. *
  677. * @param set the set
  678. *
  679. * @param attributes bitmask for attributes to close over.
  680. * Currently only the USET_CASE bit is supported. Any undefined bits
  681. * are ignored.
  682. * @stable ICU 4.2
  683. */
  684. U_STABLE void U_EXPORT2
  685. uset_closeOver(USet* set, int32_t attributes);
  686. /**
  687. * Remove all strings from this set.
  688. *
  689. * @param set the set from which all strings are removed
  690. * @stable ICU 4.2
  691. */
  692. U_STABLE void U_EXPORT2
  693. uset_removeAllStrings(USet* set);
  694. /**
  695. * Returns TRUE if the given USet contains no characters and no
  696. * strings.
  697. * @param set the set to test
  698. * @return TRUE if set contains no characters and no strings
  699. * @stable ICU 2.4
  700. */
  701. U_STABLE UBool U_EXPORT2
  702. uset_isEmpty(const USet* set);
  703. /**
  704. * Returns TRUE if the given USet contains the given character.
  705. * This function works faster with a frozen set.
  706. * @param set the set to test
  707. * @param c The codepoint to check for within the set
  708. * @return TRUE if set contains c
  709. * @stable ICU 2.4
  710. */
  711. U_STABLE UBool U_EXPORT2
  712. uset_contains(const USet* set, UChar32 c);
  713. /**
  714. * Returns TRUE if the given USet contains all characters c
  715. * where start <= c && c <= end.
  716. * @param set the set to test
  717. * @param start the first character of the range to test, inclusive
  718. * @param end the last character of the range to test, inclusive
  719. * @return TRUE if set contains every character in the range start..end
  720. * @stable ICU 2.2
  721. */
  722. U_STABLE UBool U_EXPORT2
  723. uset_containsRange(const USet* set, UChar32 start, UChar32 end);
  724. /**
  725. * Returns TRUE if the given USet contains the given string.
  726. * @param set the set to test
  727. * @param str the string to look for
  728. * @param strLen the length of the string, or -1 if it is NUL-terminated.
  729. * @return TRUE if set contains str
  730. * @stable ICU 2.4
  731. */
  732. U_STABLE UBool U_EXPORT2
  733. uset_containsString(const USet* set, const UChar* str, int32_t strLen);
  734. /**
  735. * Returns the index of the given character within this set, where
  736. * the set is ordered by ascending code point. If the character
  737. * is not in this set, return -1. The inverse of this function is
  738. * <code>uset_charAt()</code>.
  739. * @param set the set to search
  740. * @param c the character to obtain the index for
  741. * @return an index from 0..size()-1, or -1 if c is not in the set
  742. * @stable ICU 3.2
  743. */
  744. U_STABLE int32_t U_EXPORT2
  745. uset_indexOf(const USet* set, UChar32 c);
  746. /**
  747. * Returns the character at the given index within this set, where
  748. * the set is ordered by ascending code point. If the index is
  749. * out of range, return (UChar32)-1. The inverse of this function is
  750. * <code>uset_indexOf()</code>.
  751. * @param set the set to read from
  752. * @param charIndex an index from 0..size()-1 to obtain the char for
  753. * @return the character at the given index, or (UChar32)-1 if charIndex is out of range.
  754. * @stable ICU 3.2
  755. */
  756. U_STABLE UChar32 U_EXPORT2
  757. uset_charAt(const USet* set, int32_t charIndex);
  758. /**
  759. * Returns the number of characters and strings contained in the given
  760. * USet. Each character is counted individually; see uset_getItemCount() to count character ranges instead.
  761. * @param set the set to measure
  762. * @return a non-negative integer counting the characters and strings
  763. * contained in set
  764. * @stable ICU 2.4
  765. */
  766. U_STABLE int32_t U_EXPORT2
  767. uset_size(const USet* set);
  768. /**
  769. * Returns the number of items in this set. An item is either a range
  770. * of characters or a single multicharacter string. Individual items are retrieved with uset_getItem().
  771. * @param set the set whose items are counted
  772. * @return a non-negative integer counting the character ranges
  773. * and/or strings contained in set
  774. * @stable ICU 2.4
  775. */
  776. U_STABLE int32_t U_EXPORT2
  777. uset_getItemCount(const USet* set);
  778. /**
  779. * Returns an item of this set. An item is either a range of
  780. * characters or a single multicharacter string.
  781. * @param set the set to read the item from
  782. * @param itemIndex a non-negative integer in the range 0..
  783. * uset_getItemCount(set)-1
  784. * @param start pointer to variable to receive first character
  785. * in range, inclusive
  786. * @param end pointer to variable to receive last character in range,
  787. * inclusive
  788. * @param str buffer to receive the string, may be NULL
  789. * @param strCapacity capacity of str, or 0 if str is NULL
  790. * @param ec pointer to the error code
  791. * @return the length of the string (>= 2), or 0 if the item is a
  792. * range, in which case it is the range *start..*end, or -1 if
  793. * itemIndex is out of range
  794. * @stable ICU 2.4
  795. */
  796. U_STABLE int32_t U_EXPORT2
  797. uset_getItem(const USet* set, int32_t itemIndex,
  798. UChar32* start, UChar32* end,
  799. UChar* str, int32_t strCapacity,
  800. UErrorCode* ec);
  801. /**
  802. * Returns true if set1 contains all the characters and strings
  803. * of set2. It answers the question, 'Is set1 a superset of set2?'
  804. * @param set1 the candidate superset
  805. * @param set2 the set whose characters and strings must all be in set1
  806. * @return true if set1 contains every character and string of set2
  807. * @stable ICU 3.2
  808. */
  809. U_STABLE UBool U_EXPORT2
  810. uset_containsAll(const USet* set1, const USet* set2);
  811. /**
  812. * Returns true if this set contains all the characters
  813. * of the given string. This does not check containment of grapheme
  814. * clusters, like uset_containsString.
  815. * @param set set of characters to be checked for containment
  816. * @param str string containing codepoints to be checked for containment
  817. * @param strLen the length of the string, or -1 if it is NUL-terminated.
  818. * @return true if the test condition is met
  819. * @stable ICU 3.4
  820. */
  821. U_STABLE UBool U_EXPORT2
  822. uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
  823. /**
  824. * Returns true if set1 contains none of the characters and strings
  825. * of set2. It answers the question, 'Are set1 and set2 disjoint?'
  826. * @param set1 the first set to compare
  827. * @param set2 the second set to compare
  828. * @return true if set1 contains none of the characters and strings of set2
  829. * @stable ICU 3.2
  830. */
  831. U_STABLE UBool U_EXPORT2
  832. uset_containsNone(const USet* set1, const USet* set2);
  833. /**
  834. * Returns true if set1 contains some of the characters and strings
  835. * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
  836. * @param set1 the first set to compare
  837. * @param set2 the second set to compare
  838. * @return true if set1 contains some of the characters and strings of set2
  839. * @stable ICU 3.2
  840. */
  841. U_STABLE UBool U_EXPORT2
  842. uset_containsSome(const USet* set1, const USet* set2);
  843. /**
  844. * Returns the length of the initial substring of the input string which
  845. * consists only of characters and strings that are contained in this set
  846. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  847. * or only of characters and strings that are not contained
  848. * in this set (USET_SPAN_NOT_CONTAINED).
  849. * See USetSpanCondition for details.
  850. * Similar to the strspn() C library function.
  851. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  852. * This function works faster with a frozen set and with a non-negative string length argument.
  853. * @param set the set
  854. * @param s start of the string
  855. * @param length the length of the string; can be -1 for NUL-terminated
  856. * @param spanCondition specifies the containment condition
  857. * @return the length of the initial substring according to the spanCondition;
  858. * 0 if the start of the string does not fit the spanCondition
  859. * @stable ICU 3.8
  860. * @see USetSpanCondition
  861. */
  862. U_STABLE int32_t U_EXPORT2
  863. uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  864. /**
  865. * Returns the start of the trailing substring of the input string which
  866. * consists only of characters and strings that are contained in this set
  867. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  868. * or only of characters and strings that are not contained
  869. * in this set (USET_SPAN_NOT_CONTAINED).
  870. * See USetSpanCondition for details.
  871. * Unpaired surrogates are treated according to contains() of their surrogate code points.
  872. * This function works faster with a frozen set and with a non-negative string length argument.
  873. * @param set the set
  874. * @param s start of the string
  875. * @param length the length of the string; can be -1 for NUL-terminated
  876. * @param spanCondition specifies the containment condition
  877. * @return the start of the trailing substring according to the spanCondition;
  878. * the string length if the end of the string does not fit the spanCondition
  879. * @stable ICU 3.8
  880. * @see USetSpanCondition
  881. */
  882. U_STABLE int32_t U_EXPORT2
  883. uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
  884. /**
  885. * Returns the length of the initial substring of the input string which
  886. * consists only of characters and strings that are contained in this set
  887. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  888. * or only of characters and strings that are not contained
  889. * in this set (USET_SPAN_NOT_CONTAINED).
  890. * See USetSpanCondition for details.
  891. * Similar to the strspn() C library function.
  892. * Malformed byte sequences are treated according to contains(0xfffd).
  893. * This function works faster with a frozen set and with a non-negative string length argument.
  894. * @param set the set
  895. * @param s start of the string (UTF-8)
  896. * @param length the length of the string; can be -1 for NUL-terminated
  897. * @param spanCondition specifies the containment condition
  898. * @return the length of the initial substring according to the spanCondition;
  899. * 0 if the start of the string does not fit the spanCondition
  900. * @stable ICU 3.8
  901. * @see USetSpanCondition
  902. */
  903. U_STABLE int32_t U_EXPORT2
  904. uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  905. /**
  906. * Returns the start of the trailing substring of the input string which
  907. * consists only of characters and strings that are contained in this set
  908. * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
  909. * or only of characters and strings that are not contained
  910. * in this set (USET_SPAN_NOT_CONTAINED).
  911. * See USetSpanCondition for details.
  912. * Malformed byte sequences are treated according to contains(0xfffd).
  913. * This function works faster with a frozen set and with a non-negative string length argument.
  914. * @param set the set
  915. * @param s start of the string (UTF-8)
  916. * @param length the length of the string; can be -1 for NUL-terminated
  917. * @param spanCondition specifies the containment condition
  918. * @return the start of the trailing substring according to the spanCondition;
  919. * the string length if the end of the string does not fit the spanCondition
  920. * @stable ICU 3.8
  921. * @see USetSpanCondition
  922. */
  923. U_STABLE int32_t U_EXPORT2
  924. uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
  925. /**
  926. * Returns true if set1 contains all of the characters and strings
  927. * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
  928. * @param set1 the first set to compare
  929. * @param set2 the second set to compare
  930. * @return true if each set contains all of the characters and strings of the other
  931. * @stable ICU 3.2
  932. */
  933. U_STABLE UBool U_EXPORT2
  934. uset_equals(const USet* set1, const USet* set2);
  935. /*********************************************************************
  936. * Serialized set API
  937. *********************************************************************/
  938. /**
  939. * Serializes this set into an array of 16-bit integers. Serialization
  940. * (currently) only records the characters in the set; multicharacter
  941. * strings are ignored.
  942. *
  943. * The array
  944. * has the following format (each line is one 16-bit integer):
  945. *
  946. * length = (n+2*m) | (m!=0?0x8000:0)
  947. * bmpLength = n; present if m!=0
  948. * bmp[0]
  949. * bmp[1]
  950. * ...
  951. * bmp[n-1]
  952. * supp-high[0]
  953. * supp-low[0]
  954. * supp-high[1]
  955. * supp-low[1]
  956. * ...
  957. * supp-high[m-1]
  958. * supp-low[m-1]
  959. *
  960. * The array starts with a header. After the header are n bmp
  961. * code points, then m supplementary code points. Either n or m
  962. * or both may be zero. n+2*m is always <= 0x7FFF.
  963. *
  964. * If there are no supplementary characters (if m==0) then the
  965. * header is one 16-bit integer, 'length', with value n.
  966. *
  967. * If there are supplementary characters (if m!=0) then the header
  968. * is two 16-bit integers. The first, 'length', has value
  969. * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
  970. *
  971. * After the header the code points are stored in ascending order.
  972. * Supplementary code points are stored as most significant 16
  973. * bits followed by least significant 16 bits.
  974. *
  975. * @param set the set to serialize
  976. * @param dest pointer to buffer of destCapacity 16-bit integers.
  977. * May be NULL only if destCapacity is zero.
  978. * @param destCapacity size of dest, or zero. Must not be negative.
  979. * @param pErrorCode pointer to the error code. Will be set to
  980. * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to
  981. * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
  982. * @return the total length of the serialized format, including
  983. * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  984. * than U_BUFFER_OVERFLOW_ERROR.
  985. * @stable ICU 2.4
  986. */
  987. U_STABLE int32_t U_EXPORT2
  988. uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
  989. /**
  990. * Given a serialized array, fill in the given serialized set object.
  991. * @param fillSet pointer to the USerializedSet object to fill in
  992. * @param src pointer to start of the serialized array
  993. * @param srcLength length of the serialized array
  994. * @return true if the given array is valid, otherwise false
  995. * @stable ICU 2.4
  996. */
  997. U_STABLE UBool U_EXPORT2
  998. uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
  999. /**
  1000. * Set the USerializedSet to contain the given character (and nothing
  1001. * else).
  1002. * @param fillSet pointer to the USerializedSet object to set
  1003. * @param c The codepoint to set
  1004. * @stable ICU 2.4
  1005. */
  1006. U_STABLE void U_EXPORT2
  1007. uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
  1008. /**
  1009. * Returns TRUE if the given USerializedSet contains the given
  1010. * character.
  1011. * @param set the serialized set to test
  1012. * @param c The codepoint to check for within the set
  1013. * @return TRUE if set contains c
  1014. * @stable ICU 2.4
  1015. */
  1016. U_STABLE UBool U_EXPORT2
  1017. uset_serializedContains(const USerializedSet* set, UChar32 c);
  1018. /**
  1019. * Returns the number of disjoint ranges of characters contained in
  1020. * the given serialized set. Ignores any strings contained in the
  1021. * set. Individual ranges are retrieved with uset_getSerializedRange().
  1022. * @param set the serialized set whose ranges are counted
  1023. * @return a non-negative integer counting the character ranges
  1024. * contained in set
  1025. * @stable ICU 2.4
  1026. */
  1027. U_STABLE int32_t U_EXPORT2
  1028. uset_getSerializedRangeCount(const USerializedSet* set);
  1029. /**
  1030. * Returns a range of characters contained in the given serialized
  1031. * set.
  1032. * @param set the serialized set to read the range from
  1033. * @param rangeIndex a non-negative integer in the range 0..
  1034. * uset_getSerializedRangeCount(set)-1
  1035. * @param pStart pointer to variable to receive first character
  1036. * in range, inclusive
  1037. * @param pEnd pointer to variable to receive last character in range,
  1038. * inclusive
  1039. * @return true if rangeIndex is valid, otherwise false
  1040. * @stable ICU 2.4
  1041. */
  1042. U_STABLE UBool U_EXPORT2
  1043. uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
  1044. UChar32* pStart, UChar32* pEnd);
  1045. #endif