ustring.h 74 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697
  1. /*
  2. **********************************************************************
  3. * Copyright (C) 1998-2010, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. **********************************************************************
  6. *
  7. * File ustring.h
  8. *
  9. * Modification History:
  10. *
  11. * Date Name Description
  12. * 12/07/98 bertrand Creation.
  13. ******************************************************************************
  14. */
  15. #ifndef USTRING_H
  16. #define USTRING_H
  17. #include "unicode/utypes.h"
  18. #include "unicode/putil.h"
  19. #include "unicode/uiter.h"
  20. /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @stable ICU 2.1*/
  21. #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
  22. # define UBRK_TYPEDEF_UBREAK_ITERATOR
  23. typedef struct UBreakIterator UBreakIterator;
  24. #endif
  25. /**
  26. * \file
  27. * \brief C API: Unicode string handling functions
  28. *
  29. * These C API functions provide general Unicode string handling.
  30. *
  31. * Some functions are equivalent in name, signature, and behavior to the ANSI C <string.h>
  32. * functions. (For example, they do not check for bad arguments like NULL string pointers.)
  33. * In some cases, only the thread-safe variant of such a function is implemented here
  34. * (see u_strtok_r()).
  35. *
  36. * Other functions provide more Unicode-specific functionality like locale-specific
  37. * upper/lower-casing and string comparison in code point order.
  38. *
  39. * ICU uses 16-bit Unicode (UTF-16) in the form of arrays of UChar code units.
  40. * UTF-16 encodes each Unicode code point with either one or two UChar code units.
  41. * (This is the default form of Unicode, and a forward-compatible extension of the original,
  42. * fixed-width form that was known as UCS-2. UTF-16 superseded UCS-2 with Unicode 2.0
  43. * in 1996.)
  44. *
  45. * Some APIs accept a 32-bit UChar32 value for a single code point.
  46. *
  47. * ICU also handles 16-bit Unicode text with unpaired surrogates.
  48. * Such text is not well-formed UTF-16.
  49. * Code-point-related functions treat unpaired surrogates as surrogate code points,
  50. * i.e., as separate units.
  51. *
  52. * Although UTF-16 is a variable-width encoding form (like some legacy multi-byte encodings),
  53. * it is much more efficient even for random access because the code unit values
  54. * for single-unit characters vs. lead units vs. trail units are completely disjoint.
  55. * This means that it is easy to determine character (code point) boundaries from
  56. * random offsets in the string.
  57. *
  58. * Unicode (UTF-16) string processing is optimized for the single-unit case.
  59. * Although it is important to support supplementary characters
  60. * (which use pairs of lead/trail code units called "surrogates"),
  61. * their occurrence is rare. Almost all characters in modern use require only
  62. * a single UChar code unit (i.e., their code point values are <=0xffff).
  63. *
  64. * For more details see the User Guide Strings chapter (http://icu-project.org/userguide/strings.html).
  65. * For a discussion of the handling of unpaired surrogates see also
  66. * Jitterbug 2145 and its icu mailing list proposal on 2002-sep-18.
  67. */
  68. /**
  69. * \defgroup ustring_ustrlen String Length
  70. * \ingroup ustring_strlen
  71. */
  72. /*@{*/
  73. /**
  74. * Determine the length of an array of UChar.
  75. *
  76. * @param s The array of UChars, NUL (U+0000) terminated.
  77. * @return The number of UChars in <code>s</code>, not counting the terminator.
  78. * @stable ICU 2.0
  79. */
  80. U_STABLE int32_t U_EXPORT2
  81. u_strlen(const UChar *s);
  82. /*@}*/
  83. /**
  84. * Count Unicode code points in the length UChar code units of the string.
  85. * A code point may occupy either one or two UChar code units.
  86. * Counting code points involves reading all code units.
  87. *
  88. * This function is basically the inverse of the U16_FWD_N() macro (see utf.h).
  89. *
  90. * @param s The input string.
  91. * @param length The number of UChar code units to be checked, or -1 to count all
  92. * code points before the first NUL (U+0000).
  93. * @return The number of code points in the specified code units.
  94. * @stable ICU 2.0
  95. */
  96. U_STABLE int32_t U_EXPORT2
  97. u_countChar32(const UChar *s, int32_t length);
  98. /**
  99. * Check if the string contains more Unicode code points than a certain number.
  100. * This is more efficient than counting all code points in the entire string
  101. * and comparing that number with a threshold.
  102. * This function may not need to scan the string at all if the length is known
  103. * (not -1 for NUL-termination) and falls within a certain range, and
  104. * never needs to count more than 'number+1' code points.
  105. * Logically equivalent to (u_countChar32(s, length)>number).
  106. * A Unicode code point may occupy either one or two UChar code units.
  107. *
  108. * @param s The input string.
  109. * @param length The length of the string, or -1 if it is NUL-terminated.
  110. * @param number The number of code points in the string is compared against
  111. * the 'number' parameter.
  112. * @return Boolean value for whether the string contains more Unicode code points
  113. * than 'number'. Same as (u_countChar32(s, length)>number).
  114. * @stable ICU 2.4
  115. */
  116. U_STABLE UBool U_EXPORT2
  117. u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number);
  118. /**
  119. * Concatenate two ustrings. Appends a copy of <code>src</code>,
  120. * including the null terminator, to <code>dst</code>. The initial copied
  121. * character from <code>src</code> overwrites the null terminator in <code>dst</code>.
  122. *
  123. * @param dst The destination string.
  124. * @param src The source string.
  125. * @return A pointer to <code>dst</code>.
  126. * @stable ICU 2.0
  127. */
  128. U_STABLE UChar* U_EXPORT2
  129. u_strcat(UChar *dst,
  130. const UChar *src);
  131. /**
  132. * Concatenate two ustrings.
  133. * Appends at most <code>n</code> characters from <code>src</code> to <code>dst</code>.
  134. * Adds a terminating NUL.
  135. * If src is too long, then only <code>n-1</code> characters will be copied
  136. * before the terminating NUL.
  137. * If <code>n&lt;=0</code> then dst is not modified.
  138. *
  139. * @param dst The destination string.
  140. * @param src The source string.
  141. * @param n The maximum number of characters to append.
  142. * @return A pointer to <code>dst</code>.
  143. * @stable ICU 2.0
  144. */
  145. U_STABLE UChar* U_EXPORT2
  146. u_strncat(UChar *dst,
  147. const UChar *src,
  148. int32_t n);
  149. /**
  150. * Find the first occurrence of a substring in a string.
  151. * The substring is found at code point boundaries.
  152. * That means that if the substring begins with
  153. * a trail surrogate or ends with a lead surrogate,
  154. * then it is found only if these surrogates stand alone in the text.
  155. * Otherwise, the substring edge units would be matched against
  156. * halves of surrogate pairs.
  157. *
  158. * @param s The string to search (NUL-terminated).
  159. * @param substring The substring to find (NUL-terminated).
  160. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  161. * or <code>s</code> itself if the <code>substring</code> is empty,
  162. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  163. * @stable ICU 2.0
  164. *
  165. * @see u_strrstr
  166. * @see u_strFindFirst
  167. * @see u_strFindLast
  168. */
  169. U_STABLE UChar * U_EXPORT2
  170. u_strstr(const UChar *s, const UChar *substring);
  171. /**
  172. * Find the first occurrence of a substring in a string.
  173. * The substring is found at code point boundaries.
  174. * That means that if the substring begins with
  175. * a trail surrogate or ends with a lead surrogate,
  176. * then it is found only if these surrogates stand alone in the text.
  177. * Otherwise, the substring edge units would be matched against
  178. * halves of surrogate pairs.
  179. *
  180. * @param s The string to search.
  181. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  182. * @param substring The substring to find (NUL-terminated).
  183. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  184. * @return A pointer to the first occurrence of <code>substring</code> in <code>s</code>,
  185. * or <code>s</code> itself if the <code>substring</code> is empty,
  186. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  187. * @stable ICU 2.4
  188. *
  189. * @see u_strstr
  190. * @see u_strFindLast
  191. */
  192. U_STABLE UChar * U_EXPORT2
  193. u_strFindFirst(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  194. /**
  195. * Find the first occurrence of a BMP code point in a string.
  196. * A surrogate code point is found only if its match in the text is not
  197. * part of a surrogate pair.
  198. * A NUL character is found at the string terminator.
  199. *
  200. * @param s The string to search (NUL-terminated).
  201. * @param c The BMP code point to find.
  202. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  203. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  204. * @stable ICU 2.0
  205. *
  206. * @see u_strchr32
  207. * @see u_memchr
  208. * @see u_strstr
  209. * @see u_strFindFirst
  210. */
  211. U_STABLE UChar * U_EXPORT2
  212. u_strchr(const UChar *s, UChar c);
  213. /**
  214. * Find the first occurrence of a code point in a string.
  215. * A surrogate code point is found only if its match in the text is not
  216. * part of a surrogate pair.
  217. * A NUL character is found at the string terminator.
  218. *
  219. * @param s The string to search (NUL-terminated).
  220. * @param c The code point to find.
  221. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  222. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  223. * @stable ICU 2.0
  224. *
  225. * @see u_strchr
  226. * @see u_memchr32
  227. * @see u_strstr
  228. * @see u_strFindFirst
  229. */
  230. U_STABLE UChar * U_EXPORT2
  231. u_strchr32(const UChar *s, UChar32 c);
  232. /**
  233. * Find the last occurrence of a substring in a string.
  234. * The substring is found at code point boundaries.
  235. * That means that if the substring begins with
  236. * a trail surrogate or ends with a lead surrogate,
  237. * then it is found only if these surrogates stand alone in the text.
  238. * Otherwise, the substring edge units would be matched against
  239. * halves of surrogate pairs.
  240. *
  241. * @param s The string to search (NUL-terminated).
  242. * @param substring The substring to find (NUL-terminated).
  243. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  244. * or <code>s</code> itself if the <code>substring</code> is empty,
  245. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  246. * @stable ICU 2.4
  247. *
  248. * @see u_strstr
  249. * @see u_strFindFirst
  250. * @see u_strFindLast
  251. */
  252. U_STABLE UChar * U_EXPORT2
  253. u_strrstr(const UChar *s, const UChar *substring);
  254. /**
  255. * Find the last occurrence of a substring in a string.
  256. * The substring is found at code point boundaries.
  257. * That means that if the substring begins with
  258. * a trail surrogate or ends with a lead surrogate,
  259. * then it is found only if these surrogates stand alone in the text.
  260. * Otherwise, the substring edge units would be matched against
  261. * halves of surrogate pairs.
  262. *
  263. * @param s The string to search.
  264. * @param length The length of s (number of UChars), or -1 if it is NUL-terminated.
  265. * @param substring The substring to find (NUL-terminated).
  266. * @param subLength The length of substring (number of UChars), or -1 if it is NUL-terminated.
  267. * @return A pointer to the last occurrence of <code>substring</code> in <code>s</code>,
  268. * or <code>s</code> itself if the <code>substring</code> is empty,
  269. * or <code>NULL</code> if <code>substring</code> is not in <code>s</code>.
  270. * @stable ICU 2.4
  271. *
  272. * @see u_strrstr
  273. * @see u_strFindFirst
  274. */
  275. U_STABLE UChar * U_EXPORT2
  276. u_strFindLast(const UChar *s, int32_t length, const UChar *substring, int32_t subLength);
  277. /**
  278. * Find the last occurrence of a BMP code point in a string.
  279. * A surrogate code point is found only if its match in the text is not
  280. * part of a surrogate pair.
  281. * A NUL character is found at the string terminator.
  282. *
  283. * @param s The string to search (NUL-terminated).
  284. * @param c The BMP code point to find.
  285. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  286. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  287. * @stable ICU 2.4
  288. *
  289. * @see u_strrchr32
  290. * @see u_memrchr
  291. * @see u_strrstr
  292. * @see u_strFindLast
  293. */
  294. U_STABLE UChar * U_EXPORT2
  295. u_strrchr(const UChar *s, UChar c);
  296. /**
  297. * Find the last occurrence of a code point in a string.
  298. * A surrogate code point is found only if its match in the text is not
  299. * part of a surrogate pair.
  300. * A NUL character is found at the string terminator.
  301. *
  302. * @param s The string to search (NUL-terminated).
  303. * @param c The code point to find.
  304. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  305. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  306. * @stable ICU 2.4
  307. *
  308. * @see u_strrchr
  309. * @see u_memrchr32
  310. * @see u_strrstr
  311. * @see u_strFindLast
  312. */
  313. U_STABLE UChar * U_EXPORT2
  314. u_strrchr32(const UChar *s, UChar32 c);
  315. /**
  316. * Locates the first occurrence in the string <code>string</code> of any of the characters
  317. * in the string <code>matchSet</code>.
  318. * Works just like C's strpbrk but with Unicode.
  319. *
  320. * @param string The string in which to search, NUL-terminated.
  321. * @param matchSet A NUL-terminated string defining a set of code points
  322. * for which to search in the text string.
  323. * @return A pointer to the character in <code>string</code> that matches one of the
  324. * characters in <code>matchSet</code>, or NULL if no such character is found.
  325. * @stable ICU 2.0
  326. */
  327. U_STABLE UChar * U_EXPORT2
  328. u_strpbrk(const UChar *string, const UChar *matchSet);
  329. /**
  330. * Returns the number of consecutive characters in <code>string</code>,
  331. * beginning with the first, that do not occur somewhere in <code>matchSet</code>.
  332. * Works just like C's strcspn but with Unicode.
  333. *
  334. * @param string The string in which to search, NUL-terminated.
  335. * @param matchSet A NUL-terminated string defining a set of code points
  336. * for which to search in the text string.
  337. * @return The number of initial characters in <code>string</code> that do not
  338. * occur in <code>matchSet</code>.
  339. * @see u_strspn
  340. * @stable ICU 2.0
  341. */
  342. U_STABLE int32_t U_EXPORT2
  343. u_strcspn(const UChar *string, const UChar *matchSet);
  344. /**
  345. * Returns the number of consecutive characters in <code>string</code>,
  346. * beginning with the first, that occur somewhere in <code>matchSet</code>.
  347. * Works just like C's strspn but with Unicode.
  348. *
  349. * @param string The string in which to search, NUL-terminated.
  350. * @param matchSet A NUL-terminated string defining a set of code points
  351. * for which to search in the text string.
  352. * @return The number of initial characters in <code>string</code> that do
  353. * occur in <code>matchSet</code>.
  354. * @see u_strcspn
  355. * @stable ICU 2.0
  356. */
  357. U_STABLE int32_t U_EXPORT2
  358. u_strspn(const UChar *string, const UChar *matchSet);
  359. /**
  360. * The string tokenizer API allows an application to break a string into
  361. * tokens. Unlike strtok(), the tokenizer state (the current pointer within the
  362. * original string) is maintained by the caller in saveState. In the first call, the
  363. * argument src is a pointer to the string. In subsequent calls to
  364. * return successive tokens of that string, src must be specified as
  365. * NULL. The value saveState is set by this function to maintain the
  366. * function's position within the string, and on each subsequent call
  367. * you must give this argument the same variable. This function does
  368. * handle surrogate pairs. This function is similar to strtok_r() from
  369. * the POSIX Threads Extension (1003.1c-1995).
  370. *
  371. * @param src String containing token(s). This string will be modified.
  372. * After the first call to u_strtok_r(), this argument must
  373. * be NULL to get to the next token.
  374. * @param delim Set of delimiter characters (Unicode code points).
  375. * @param saveState The current pointer within the original string,
  376. * which is set by this function. The saveState
  377. * parameter should be the address of a local variable of type
  378. * UChar *. (i.e. define "UChar *myLocalSaveState" and use
  379. * &myLocalSaveState for this parameter).
  380. * @return A pointer to the next token found in src, or NULL
  381. * when there are no more tokens.
  382. * @stable ICU 2.0
  383. */
  384. U_STABLE UChar * U_EXPORT2
  385. u_strtok_r(UChar *src,
  386. const UChar *delim,
  387. UChar **saveState);
  388. /**
  389. * Compare two Unicode strings for bitwise equality (code unit order).
  390. *
  391. * @param s1 A string to compare.
  392. * @param s2 A string to compare.
  393. * @return 0 if <code>s1</code> and <code>s2</code> are bitwise equal; a negative
  394. * value if <code>s1</code> is bitwise less than <code>s2</code>; a positive
  395. * value if <code>s1</code> is bitwise greater than <code>s2</code>.
  396. * @stable ICU 2.0
  397. */
  398. U_STABLE int32_t U_EXPORT2
  399. u_strcmp(const UChar *s1,
  400. const UChar *s2);
  401. /**
  402. * Compare two Unicode strings in code point order.
  403. * See u_strCompare for details.
  404. *
  405. * @param s1 A string to compare.
  406. * @param s2 A string to compare.
  407. * @return a negative/zero/positive integer corresponding to whether
  408. * the first string is less than/equal to/greater than the second one
  409. * in code point order
  410. * @stable ICU 2.0
  411. */
  412. U_STABLE int32_t U_EXPORT2
  413. u_strcmpCodePointOrder(const UChar *s1, const UChar *s2);
  414. /**
  415. * Compare two Unicode strings (binary order).
  416. *
  417. * The comparison can be done in code unit order or in code point order.
  418. * They differ only in UTF-16 when
  419. * comparing supplementary code points (U+10000..U+10ffff)
  420. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  421. * In code unit order, high BMP code points sort after supplementary code points
  422. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  423. *
  424. * This function works with strings of different explicitly specified lengths
  425. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  426. * NUL-terminated strings are possible with length arguments of -1.
  427. *
  428. * @param s1 First source string.
  429. * @param length1 Length of first source string, or -1 if NUL-terminated.
  430. *
  431. * @param s2 Second source string.
  432. * @param length2 Length of second source string, or -1 if NUL-terminated.
  433. *
  434. * @param codePointOrder Choose between code unit order (FALSE)
  435. * and code point order (TRUE).
  436. *
  437. * @return <0 or 0 or >0 as usual for string comparisons
  438. *
  439. * @stable ICU 2.2
  440. */
  441. U_STABLE int32_t U_EXPORT2
  442. u_strCompare(const UChar *s1, int32_t length1,
  443. const UChar *s2, int32_t length2,
  444. UBool codePointOrder);
  445. /**
  446. * Compare two Unicode strings (binary order)
  447. * as presented by UCharIterator objects.
  448. * Works otherwise just like u_strCompare().
  449. *
  450. * Both iterators are reset to their start positions.
  451. * When the function returns, it is undefined where the iterators
  452. * have stopped.
  453. *
  454. * @param iter1 First source string iterator.
  455. * @param iter2 Second source string iterator.
  456. * @param codePointOrder Choose between code unit order (FALSE)
  457. * and code point order (TRUE).
  458. *
  459. * @return <0 or 0 or >0 as usual for string comparisons
  460. *
  461. * @see u_strCompare
  462. *
  463. * @stable ICU 2.6
  464. */
  465. U_STABLE int32_t U_EXPORT2
  466. u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder);
  467. #ifndef U_COMPARE_CODE_POINT_ORDER
  468. /* see also unistr.h and unorm.h */
  469. /**
  470. * Option bit for u_strCaseCompare, u_strcasecmp, unorm_compare, etc:
  471. * Compare strings in code point order instead of code unit order.
  472. * @stable ICU 2.2
  473. */
  474. #define U_COMPARE_CODE_POINT_ORDER 0x8000
  475. #endif
  476. /**
  477. * Compare two strings case-insensitively using full case folding.
  478. * This is equivalent to
  479. * u_strCompare(u_strFoldCase(s1, options),
  480. * u_strFoldCase(s2, options),
  481. * (options&U_COMPARE_CODE_POINT_ORDER)!=0).
  482. *
  483. * The comparison can be done in UTF-16 code unit order or in code point order.
  484. * They differ only when comparing supplementary code points (U+10000..U+10ffff)
  485. * to BMP code points near the end of the BMP (i.e., U+e000..U+ffff).
  486. * In code unit order, high BMP code points sort after supplementary code points
  487. * because they are stored as pairs of surrogates which are at U+d800..U+dfff.
  488. *
  489. * This function works with strings of different explicitly specified lengths
  490. * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
  491. * NUL-terminated strings are possible with length arguments of -1.
  492. *
  493. * @param s1 First source string.
  494. * @param length1 Length of first source string, or -1 if NUL-terminated.
  495. *
  496. * @param s2 Second source string.
  497. * @param length2 Length of second source string, or -1 if NUL-terminated.
  498. *
  499. * @param options A bit set of options:
  500. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  501. * Comparison in code unit order with default case folding.
  502. *
  503. * - U_COMPARE_CODE_POINT_ORDER
  504. * Set to choose code point order instead of code unit order
  505. * (see u_strCompare for details).
  506. *
  507. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  508. *
  509. * @param pErrorCode Must be a valid pointer to an error code value,
  510. * which must not indicate a failure before the function call.
  511. *
  512. * @return <0 or 0 or >0 as usual for string comparisons
  513. *
  514. * @stable ICU 2.2
  515. */
  516. U_STABLE int32_t U_EXPORT2
  517. u_strCaseCompare(const UChar *s1, int32_t length1,
  518. const UChar *s2, int32_t length2,
  519. uint32_t options,
  520. UErrorCode *pErrorCode);
  521. /**
  522. * Compare two ustrings for bitwise equality.
  523. * Compares at most <code>n</code> characters.
  524. *
  525. * @param ucs1 A string to compare.
  526. * @param ucs2 A string to compare.
  527. * @param n The maximum number of characters to compare.
  528. * @return 0 if <code>ucs1</code> and <code>ucs2</code> are bitwise equal; a negative
  529. * value if <code>ucs1</code> is bitwise less than <code>ucs2</code>; a positive
  530. * value if <code>ucs1</code> is bitwise greater than <code>ucs2</code>.
  531. * @stable ICU 2.0
  532. */
  533. U_STABLE int32_t U_EXPORT2
  534. u_strncmp(const UChar *ucs1,
  535. const UChar *ucs2,
  536. int32_t n);
  537. /**
  538. * Compare two Unicode strings in code point order.
  539. * This is different in UTF-16 from u_strncmp() if supplementary characters are present.
  540. * For details, see u_strCompare().
  541. *
  542. * @param s1 A string to compare.
  543. * @param s2 A string to compare.
  544. * @param n The maximum number of characters to compare.
  545. * @return a negative/zero/positive integer corresponding to whether
  546. * the first string is less than/equal to/greater than the second one
  547. * in code point order
  548. * @stable ICU 2.0
  549. */
  550. U_STABLE int32_t U_EXPORT2
  551. u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n);
  552. /**
  553. * Compare two strings case-insensitively using full case folding.
  554. * This is equivalent to u_strcmp(u_strFoldCase(s1, options), u_strFoldCase(s2, options)).
  555. *
  556. * @param s1 A string to compare.
  557. * @param s2 A string to compare.
  558. * @param options A bit set of options:
  559. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  560. * Comparison in code unit order with default case folding.
  561. *
  562. * - U_COMPARE_CODE_POINT_ORDER
  563. * Set to choose code point order instead of code unit order
  564. * (see u_strCompare for details).
  565. *
  566. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  567. *
  568. * @return A negative, zero, or positive integer indicating the comparison result.
  569. * @stable ICU 2.0
  570. */
  571. U_STABLE int32_t U_EXPORT2
  572. u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options);
  573. /**
  574. * Compare two strings case-insensitively using full case folding.
  575. * This is equivalent to u_strcmp(u_strFoldCase(s1, at most n, options),
  576. * u_strFoldCase(s2, at most n, options)).
  577. *
  578. * @param s1 A string to compare.
  579. * @param s2 A string to compare.
  580. * @param n The maximum number of characters in each string to case-fold and then compare.
  581. * @param options A bit set of options:
  582. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  583. * Comparison in code unit order with default case folding.
  584. *
  585. * - U_COMPARE_CODE_POINT_ORDER
  586. * Set to choose code point order instead of code unit order
  587. * (see u_strCompare for details).
  588. *
  589. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  590. *
  591. * @return A negative, zero, or positive integer indicating the comparison result.
  592. * @stable ICU 2.0
  593. */
  594. U_STABLE int32_t U_EXPORT2
  595. u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options);
  596. /**
  597. * Compare two strings case-insensitively using full case folding.
  598. * This is equivalent to u_strcmp(u_strFoldCase(s1, length, options),
  599. * u_strFoldCase(s2, length, options)).
  600. *
  601. * @param s1 A string to compare.
  602. * @param s2 A string to compare.
  603. * @param length The number of characters in each string to case-fold and then compare.
  604. * @param options A bit set of options:
  605. * - U_FOLD_CASE_DEFAULT or 0 is used for default options:
  606. * Comparison in code unit order with default case folding.
  607. *
  608. * - U_COMPARE_CODE_POINT_ORDER
  609. * Set to choose code point order instead of code unit order
  610. * (see u_strCompare for details).
  611. *
  612. * - U_FOLD_CASE_EXCLUDE_SPECIAL_I
  613. *
  614. * @return A negative, zero, or positive integer indicating the comparison result.
  615. * @stable ICU 2.0
  616. */
  617. U_STABLE int32_t U_EXPORT2
  618. u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options);
  619. /**
  620. * Copy a ustring. Adds a null terminator.
  621. *
  622. * @param dst The destination string.
  623. * @param src The source string.
  624. * @return A pointer to <code>dst</code>.
  625. * @stable ICU 2.0
  626. */
  627. U_STABLE UChar* U_EXPORT2
  628. u_strcpy(UChar *dst,
  629. const UChar *src);
  630. /**
  631. * Copy a ustring.
  632. * Copies at most <code>n</code> characters. The result will be null terminated
  633. * if the length of <code>src</code> is less than <code>n</code>.
  634. *
  635. * @param dst The destination string.
  636. * @param src The source string.
  637. * @param n The maximum number of characters to copy.
  638. * @return A pointer to <code>dst</code>.
  639. * @stable ICU 2.0
  640. */
  641. U_STABLE UChar* U_EXPORT2
  642. u_strncpy(UChar *dst,
  643. const UChar *src,
  644. int32_t n);
  645. #if !UCONFIG_NO_CONVERSION
  646. /**
  647. * Copy a byte string encoded in the default codepage to a ustring.
  648. * Adds a null terminator.
  649. * Performs a host byte to UChar conversion
  650. *
  651. * @param dst The destination string.
  652. * @param src The source string.
  653. * @return A pointer to <code>dst</code>.
  654. * @stable ICU 2.0
  655. */
  656. U_STABLE UChar* U_EXPORT2 u_uastrcpy(UChar *dst,
  657. const char *src );
  658. /**
  659. * Copy a byte string encoded in the default codepage to a ustring.
  660. * Copies at most <code>n</code> characters. The result will be null terminated
  661. * if the length of <code>src</code> is less than <code>n</code>.
  662. * Performs a host byte to UChar conversion
  663. *
  664. * @param dst The destination string.
  665. * @param src The source string.
  666. * @param n The maximum number of characters to copy.
  667. * @return A pointer to <code>dst</code>.
  668. * @stable ICU 2.0
  669. */
  670. U_STABLE UChar* U_EXPORT2 u_uastrncpy(UChar *dst,
  671. const char *src,
  672. int32_t n);
  673. /**
  674. * Copy ustring to a byte string encoded in the default codepage.
  675. * Adds a null terminator.
  676. * Performs a UChar to host byte conversion
  677. *
  678. * @param dst The destination string.
  679. * @param src The source string.
  680. * @return A pointer to <code>dst</code>.
  681. * @stable ICU 2.0
  682. */
  683. U_STABLE char* U_EXPORT2 u_austrcpy(char *dst,
  684. const UChar *src );
  685. /**
  686. * Copy ustring to a byte string encoded in the default codepage.
  687. * Copies at most <code>n</code> characters. The result will be null terminated
  688. * if the length of <code>src</code> is less than <code>n</code>.
  689. * Performs a UChar to host byte conversion
  690. *
  691. * @param dst The destination string.
  692. * @param src The source string.
  693. * @param n The maximum number of characters to copy.
  694. * @return A pointer to <code>dst</code>.
  695. * @stable ICU 2.0
  696. */
  697. U_STABLE char* U_EXPORT2 u_austrncpy(char *dst,
  698. const UChar *src,
  699. int32_t n );
  700. #endif
  701. /**
  702. * Synonym for memcpy(), but with UChars only.
  703. * @param dest The destination string
  704. * @param src The source string
  705. * @param count The number of characters to copy
  706. * @return A pointer to <code>dest</code>
  707. * @stable ICU 2.0
  708. */
  709. U_STABLE UChar* U_EXPORT2
  710. u_memcpy(UChar *dest, const UChar *src, int32_t count);
  711. /**
  712. * Synonym for memmove(), but with UChars only.
  713. * @param dest The destination string
  714. * @param src The source string
  715. * @param count The number of characters to move
  716. * @return A pointer to <code>dest</code>
  717. * @stable ICU 2.0
  718. */
  719. U_STABLE UChar* U_EXPORT2
  720. u_memmove(UChar *dest, const UChar *src, int32_t count);
  721. /**
  722. * Initialize <code>count</code> characters of <code>dest</code> to <code>c</code>.
  723. *
  724. * @param dest The destination string.
  725. * @param c The character to initialize the string.
  726. * @param count The number of characters to set.
  727. * @return A pointer to <code>dest</code>.
  728. * @stable ICU 2.0
  729. */
  730. U_STABLE UChar* U_EXPORT2
  731. u_memset(UChar *dest, UChar c, int32_t count);
  732. /**
  733. * Compare the first <code>count</code> UChars of each buffer.
  734. *
  735. * @param buf1 The first string to compare.
  736. * @param buf2 The second string to compare.
  737. * @param count The maximum number of UChars to compare.
  738. * @return When buf1 < buf2, a negative number is returned.
  739. * When buf1 == buf2, 0 is returned.
  740. * When buf1 > buf2, a positive number is returned.
  741. * @stable ICU 2.0
  742. */
  743. U_STABLE int32_t U_EXPORT2
  744. u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count);
  745. /**
  746. * Compare two Unicode strings in code point order.
  747. * This is different in UTF-16 from u_memcmp() if supplementary characters are present.
  748. * For details, see u_strCompare().
  749. *
  750. * @param s1 A string to compare.
  751. * @param s2 A string to compare.
  752. * @param count The maximum number of characters to compare.
  753. * @return a negative/zero/positive integer corresponding to whether
  754. * the first string is less than/equal to/greater than the second one
  755. * in code point order
  756. * @stable ICU 2.0
  757. */
  758. U_STABLE int32_t U_EXPORT2
  759. u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count);
  760. /**
  761. * Find the first occurrence of a BMP code point in a string.
  762. * A surrogate code point is found only if its match in the text is not
  763. * part of a surrogate pair.
  764. * A NUL character is found at the string terminator.
  765. *
  766. * @param s The string to search (contains <code>count</code> UChars).
  767. * @param c The BMP code point to find.
  768. * @param count The length of the string.
  769. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  770. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  771. * @stable ICU 2.0
  772. *
  773. * @see u_strchr
  774. * @see u_memchr32
  775. * @see u_strFindFirst
  776. */
  777. U_STABLE UChar* U_EXPORT2
  778. u_memchr(const UChar *s, UChar c, int32_t count);
  779. /**
  780. * Find the first occurrence of a code point in a string.
  781. * A surrogate code point is found only if its match in the text is not
  782. * part of a surrogate pair.
  783. * A NUL character is found at the string terminator.
  784. *
  785. * @param s The string to search (contains <code>count</code> UChars).
  786. * @param c The code point to find.
  787. * @param count The length of the string.
  788. * @return A pointer to the first occurrence of <code>c</code> in <code>s</code>
  789. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  790. * @stable ICU 2.0
  791. *
  792. * @see u_strchr32
  793. * @see u_memchr
  794. * @see u_strFindFirst
  795. */
  796. U_STABLE UChar* U_EXPORT2
  797. u_memchr32(const UChar *s, UChar32 c, int32_t count);
  798. /**
  799. * Find the last occurrence of a BMP code point in a string.
  800. * A surrogate code point is found only if its match in the text is not
  801. * part of a surrogate pair.
  802. * A NUL character is found at the string terminator.
  803. *
  804. * @param s The string to search (contains <code>count</code> UChars).
  805. * @param c The BMP code point to find.
  806. * @param count The length of the string.
  807. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  808. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  809. * @stable ICU 2.4
  810. *
  811. * @see u_strrchr
  812. * @see u_memrchr32
  813. * @see u_strFindLast
  814. */
  815. U_STABLE UChar* U_EXPORT2
  816. u_memrchr(const UChar *s, UChar c, int32_t count);
  817. /**
  818. * Find the last occurrence of a code point in a string.
  819. * A surrogate code point is found only if its match in the text is not
  820. * part of a surrogate pair.
  821. * A NUL character is found at the string terminator.
  822. *
  823. * @param s The string to search (contains <code>count</code> UChars).
  824. * @param c The code point to find.
  825. * @param count The length of the string.
  826. * @return A pointer to the last occurrence of <code>c</code> in <code>s</code>
  827. * or <code>NULL</code> if <code>c</code> is not in <code>s</code>.
  828. * @stable ICU 2.4
  829. *
  830. * @see u_strrchr32
  831. * @see u_memrchr
  832. * @see u_strFindLast
  833. */
  834. U_STABLE UChar* U_EXPORT2
  835. u_memrchr32(const UChar *s, UChar32 c, int32_t count);
  836. /**
  837. * Unicode String literals in C.
  838. * We need one macro to declare a variable for the string
  839. * and to statically preinitialize it if possible,
  840. * and a second macro to dynamically initialize such a string variable if necessary.
  841. *
  842. * The macros are defined for maximum performance.
  843. * They work only for strings that contain "invariant characters", i.e.,
  844. * only latin letters, digits, and some punctuation.
  845. * See utypes.h for details.
  846. *
  847. * A pair of macros for a single string must be used with the same
  848. * parameters.
  849. * The string parameter must be a C string literal.
  850. * The length of the string, not including the terminating
  851. * <code>NUL</code>, must be specified as a constant.
  852. * The U_STRING_DECL macro should be invoked exactly once for one
  853. * such string variable before it is used.
  854. *
  855. * Usage:
  856. * <pre>
  857. * U_STRING_DECL(ustringVar1, "Quick-Fox 2", 11);
  858. * U_STRING_DECL(ustringVar2, "jumps 5%", 8);
  859. * static UBool didInit=FALSE;
  860. *
  861. * int32_t function() {
  862. * if(!didInit) {
  863. * U_STRING_INIT(ustringVar1, "Quick-Fox 2", 11);
  864. * U_STRING_INIT(ustringVar2, "jumps 5%", 8);
  865. * didInit=TRUE;
  866. * }
  867. * return u_strcmp(ustringVar1, ustringVar2);
  868. * }
  869. * </pre>
  870. *
  871. * Note that the macros will NOT consistently work if their argument is another #define.
  872. * The following will not work on all platforms, don't use it.
  873. *
  874. * <pre>
  875. * #define GLUCK "Mr. Gluck"
  876. * U_STRING_DECL(var, GLUCK, 9)
  877. * U_STRING_INIT(var, GLUCK, 9)
  878. * </pre>
  879. *
  880. * Instead, use the string literal "Mr. Gluck" as the argument to both macro
  881. * calls.
  882. *
  883. *
  884. * @stable ICU 2.0
  885. */
  886. #if defined(U_DECLARE_UTF16)
  887. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=U_DECLARE_UTF16(cs)
  888. /** Expands to nothing here: U_STRING_DECL already initializes the array statically. @stable ICU 2.0 */
  889. # define U_STRING_INIT(var, cs, length)
  890. #elif U_SIZEOF_WCHAR_T==U_SIZEOF_UCHAR && (U_CHARSET_FAMILY==U_ASCII_FAMILY || (U_SIZEOF_UCHAR == 2 && defined(U_WCHAR_IS_UTF16)))
  891. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]=L ## cs
  892. /** Expands to nothing here: U_STRING_DECL already initializes the array statically. @stable ICU 2.0 */
  893. # define U_STRING_INIT(var, cs, length)
  894. #elif U_SIZEOF_UCHAR==1 && U_CHARSET_FAMILY==U_ASCII_FAMILY
  895. # define U_STRING_DECL(var, cs, length) static const UChar var[(length)+1]={ (const UChar *)cs }
  896. /** Expands to nothing here: U_STRING_DECL already initializes the array statically. @stable ICU 2.0 */
  897. # define U_STRING_INIT(var, cs, length)
  898. #else
  899. # define U_STRING_DECL(var, cs, length) static UChar var[(length)+1]
  900. /** Fills the array at run time via u_charsToUChars(); (length)+1 units are passed so the terminating NUL is included. The length argument is parenthesized so that an expression argument expands correctly. @stable ICU 2.0 */
  901. # define U_STRING_INIT(var, cs, length) u_charsToUChars(cs, var, (length)+1)
  902. #endif
  903. /**
  904. * Unescape a string of characters and write the resulting
  905. * Unicode characters to the destination buffer. The following escape
  906. * sequences are recognized:
  907. *
  908. * \\uhhhh 4 hex digits; h in [0-9A-Fa-f]
  909. * \\Uhhhhhhhh 8 hex digits
  910. * \\xhh 1-2 hex digits
  911. * \\x{h...} 1-8 hex digits
  912. * \\ooo 1-3 octal digits; o in [0-7]
  913. * \\cX control-X; X is masked with 0x1F
  914. *
  915. * as well as the standard ANSI C escapes:
  916. *
  917. * \\a => U+0007, \\b => U+0008, \\t => U+0009, \\n => U+000A,
  918. * \\v => U+000B, \\f => U+000C, \\r => U+000D, \\e => U+001B,
  919. * \\&quot; => U+0022, \\' => U+0027, \\? => U+003F, \\\\ => U+005C
  920. *
  921. * Anything else following a backslash is generically escaped. For
  922. * example, "[a\\-z]" returns "[a-z]".
  923. *
  924. * If an escape sequence is ill-formed, this method returns an empty
  925. * string. An example of an ill-formed sequence is "\\u" followed by
  926. * fewer than 4 hex digits.
  927. *
  928. * The above characters are recognized in the compiler's codepage,
  929. * that is, they are coded as 'u', '\\', etc. Characters that are
  930. * not parts of escape sequences are converted using u_charsToUChars().
  931. *
  932. * This function is similar to UnicodeString::unescape() but not
  933. * identical to it. The latter takes a source UnicodeString, so it
  934. * does escape recognition but no conversion.
  935. *
  936. * @param src a zero-terminated string of invariant characters
  937. * @param dest pointer to buffer to receive converted and unescaped
  938. * text and, if there is room, a zero terminator. May be NULL for
  939. * preflighting, in which case no UChars will be written, but the
  940. * return value will still be valid. On error, an empty string is
  941. * stored here (if possible).
  942. * @param destCapacity the number of UChars that may be written at
  943. * dest. Ignored if dest == NULL.
  944. * @return the length of unescaped string.
  945. * @see u_unescapeAt
  946. * @see UnicodeString#unescape()
  947. * @see UnicodeString#unescapeAt()
  948. * @stable ICU 2.0
  949. */
  950. U_STABLE int32_t U_EXPORT2
  951. u_unescape(const char *src,
  952. UChar *dest, int32_t destCapacity);
  953. U_CDECL_BEGIN
  954. /**
  955. * Callback function for u_unescapeAt() that returns a character of
  956. * the source text given an offset and a context pointer. The context
  957. * pointer will be whatever is passed into u_unescapeAt().
  958. *
  959. * @param offset the offset in the source text of the character to return
  960. * @param context an opaque pointer passed directly into u_unescapeAt()
  961. * @return the character of the source text at
  962. * offset
  963. * @see u_unescapeAt
  964. * @stable ICU 2.0
  965. */
  966. typedef UChar (U_CALLCONV *UNESCAPE_CHAR_AT)(int32_t offset, void *context);
  967. U_CDECL_END
  968. /**
  969. * Unescape a single sequence. The character at offset-1 is assumed
  970. * (without checking) to be a backslash. This method takes a callback
  971. * pointer to a function that returns the UChar at a given offset. By
  972. * varying this callback, ICU functions are able to unescape char*
  973. * strings, UnicodeString objects, and UFILE pointers.
  974. *
  975. * If offset is out of range, or if the escape sequence is ill-formed,
  976. * (UChar32)0xFFFFFFFF is returned. See documentation of u_unescape()
  977. * for a list of recognized sequences.
  978. *
  979. * @param charAt callback function that returns a UChar of the source
  980. * text given an offset and a context pointer.
  981. * @param offset pointer to the offset that will be passed to charAt.
  982. * The offset value will be updated upon return to point after the
  983. * last parsed character of the escape sequence. On error the offset
  984. * is unchanged.
  985. * @param length the number of characters in the source text. The
  986. * last character of the source text is considered to be at offset
  987. * length-1.
  988. * @param context an opaque pointer passed directly into charAt.
  989. * @return the character represented by the escape sequence at
  990. * offset, or (UChar32)0xFFFFFFFF on error.
  991. * @see u_unescape()
  992. * @see UnicodeString#unescape()
  993. * @see UnicodeString#unescapeAt()
  994. * @stable ICU 2.0
  995. */
  996. U_STABLE UChar32 U_EXPORT2
  997. u_unescapeAt(UNESCAPE_CHAR_AT charAt,
  998. int32_t *offset,
  999. int32_t length,
  1000. void *context);
  1001. /**
  1002. * Uppercase the characters in a string.
  1003. * Casing is locale-dependent and context-sensitive.
  1004. * The result may be longer or shorter than the original.
  1005. * The source string and the destination buffer are allowed to overlap.
  1006. *
  1007. * @param dest A buffer for the result string. The result will be zero-terminated if
  1008. * the buffer is large enough.
  1009. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1010. * dest may be NULL and the function will only return the length of the result
  1011. * without writing any of the result string.
  1012. * @param src The original string
  1013. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1014. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1015. * @param pErrorCode Must be a valid pointer to an error code value,
  1016. * which must not indicate a failure before the function call.
  1017. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1018. * only some of the result was written to the destination buffer.
  1019. * @stable ICU 2.0
  1020. */
  1021. U_STABLE int32_t U_EXPORT2
  1022. u_strToUpper(UChar *dest, int32_t destCapacity,
  1023. const UChar *src, int32_t srcLength,
  1024. const char *locale,
  1025. UErrorCode *pErrorCode);
  1026. /**
  1027. * Lowercase the characters in a string.
  1028. * Casing is locale-dependent and context-sensitive.
  1029. * The result may be longer or shorter than the original.
  1030. * The source string and the destination buffer are allowed to overlap.
  1031. *
  1032. * @param dest A buffer for the result string. The result will be zero-terminated if
  1033. * the buffer is large enough.
  1034. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1035. * dest may be NULL and the function will only return the length of the result
  1036. * without writing any of the result string.
  1037. * @param src The original string
  1038. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1039. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1040. * @param pErrorCode Must be a valid pointer to an error code value,
  1041. * which must not indicate a failure before the function call.
  1042. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1043. * only some of the result was written to the destination buffer.
  1044. * @stable ICU 2.0
  1045. */
  1046. U_STABLE int32_t U_EXPORT2
  1047. u_strToLower(UChar *dest, int32_t destCapacity,
  1048. const UChar *src, int32_t srcLength,
  1049. const char *locale,
  1050. UErrorCode *pErrorCode);
  1051. #if !UCONFIG_NO_BREAK_ITERATION
  1052. /**
  1053. * Titlecase a string.
  1054. * Casing is locale-dependent and context-sensitive.
  1055. * Titlecasing uses a break iterator to find the first characters of words
  1056. * that are to be titlecased. It titlecases those characters and lowercases
  1057. * all others.
  1058. *
  1059. * The titlecase break iterator can be provided to customize for arbitrary
  1060. * styles, using rules and dictionaries beyond the standard iterators.
  1061. * It may be more efficient to always provide an iterator to avoid
  1062. * opening and closing one for each string.
  1063. * The standard titlecase iterator for the root locale implements the
  1064. * algorithm of Unicode TR 21.
  1065. *
  1066. * This function uses only the setText(), first() and next() methods of the
  1067. * provided break iterator.
  1068. *
  1069. * The result may be longer or shorter than the original.
  1070. * The source string and the destination buffer are allowed to overlap.
  1071. *
  1072. * @param dest A buffer for the result string. The result will be zero-terminated if
  1073. * the buffer is large enough.
  1074. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1075. * dest may be NULL and the function will only return the length of the result
  1076. * without writing any of the result string.
  1077. * @param src The original string
  1078. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1079. * @param titleIter A break iterator to find the first characters of words
  1080. * that are to be titlecased.
  1081. * If none is provided (NULL), then a standard titlecase
  1082. * break iterator is opened.
  1083. * @param locale The locale to consider, or "" for the root locale or NULL for the default locale.
  1084. * @param pErrorCode Must be a valid pointer to an error code value,
  1085. * which must not indicate a failure before the function call.
  1086. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1087. * only some of the result was written to the destination buffer.
  1088. * @stable ICU 2.1
  1089. */
  1090. U_STABLE int32_t U_EXPORT2
  1091. u_strToTitle(UChar *dest, int32_t destCapacity,
  1092. const UChar *src, int32_t srcLength,
  1093. UBreakIterator *titleIter,
  1094. const char *locale,
  1095. UErrorCode *pErrorCode);
  1096. #endif
  1097. /**
  1098. * Case-fold the characters in a string.
  1099. * Case-folding is locale-independent and not context-sensitive,
  1100. * but there is an option for whether to include or exclude mappings for dotted I
  1101. * and dotless i that are marked with 'I' in CaseFolding.txt.
  1102. * The result may be longer or shorter than the original.
  1103. * The source string and the destination buffer are allowed to overlap.
  1104. *
  1105. * @param dest A buffer for the result string. The result will be zero-terminated if
  1106. * the buffer is large enough.
  1107. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1108. * dest may be NULL and the function will only return the length of the result
  1109. * without writing any of the result string.
  1110. * @param src The original string
  1111. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1112. * @param options Either U_FOLD_CASE_DEFAULT or U_FOLD_CASE_EXCLUDE_SPECIAL_I
  1113. * @param pErrorCode Must be a valid pointer to an error code value,
  1114. * which must not indicate a failure before the function call.
  1115. * @return The length of the result string. It may be greater than destCapacity. In that case,
  1116. * only some of the result was written to the destination buffer.
  1117. * @stable ICU 2.0
  1118. */
  1119. U_STABLE int32_t U_EXPORT2
  1120. u_strFoldCase(UChar *dest, int32_t destCapacity,
  1121. const UChar *src, int32_t srcLength,
  1122. uint32_t options,
  1123. UErrorCode *pErrorCode);
  1124. #if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
  1125. /**
  1126. * Convert a UTF-16 string to a wchar_t string.
  1127. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1128. * this function simply calls the fast, dedicated function for that.
  1129. * Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
  1130. *
  1131. * @param dest A buffer for the result string. The result will be zero-terminated if
  1132. * the buffer is large enough.
  1133. * @param destCapacity The size of the buffer (number of wchar_t's). If it is 0, then
  1134. * dest may be NULL and the function will only return the length of the
  1135. * result without writing any of the result string (pre-flighting).
  1136. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1137. * pDestLength!=NULL then *pDestLength is always set to the
  1138. * number of output units corresponding to the transformation of
  1139. * all the input units, even in case of a buffer overflow.
  1140. * @param src The original source string
  1141. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1142. * @param pErrorCode Must be a valid pointer to an error code value,
  1143. * which must not indicate a failure before the function call.
  1144. * @return The pointer to destination buffer.
  1145. * @stable ICU 2.0
  1146. */
  1147. U_STABLE wchar_t* U_EXPORT2
  1148. u_strToWCS(wchar_t *dest,
  1149. int32_t destCapacity,
  1150. int32_t *pDestLength,
  1151. const UChar *src,
  1152. int32_t srcLength,
  1153. UErrorCode *pErrorCode);
  1154. /**
  1155. * Convert a wchar_t string to UTF-16.
  1156. * If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
  1157. * this function simply calls the fast, dedicated function for that.
  1158. * Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
  1159. *
  1160. * @param dest A buffer for the result string. The result will be zero-terminated if
  1161. * the buffer is large enough.
  1162. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1163. * dest may be NULL and the function will only return the length of the
  1164. * result without writing any of the result string (pre-flighting).
  1165. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1166. * pDestLength!=NULL then *pDestLength is always set to the
  1167. * number of output units corresponding to the transformation of
  1168. * all the input units, even in case of a buffer overflow.
  1169. * @param src The original source string
  1170. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1171. * @param pErrorCode Must be a valid pointer to an error code value,
  1172. * which must not indicate a failure before the function call.
  1173. * @return The pointer to destination buffer.
  1174. * @stable ICU 2.0
  1175. */
  1176. U_STABLE UChar* U_EXPORT2
  1177. u_strFromWCS(UChar *dest,
  1178. int32_t destCapacity,
  1179. int32_t *pDestLength,
  1180. const wchar_t *src,
  1181. int32_t srcLength,
  1182. UErrorCode *pErrorCode);
  1183. #endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
  1184. /**
  1185. * Convert a UTF-16 string to UTF-8.
  1186. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1187. *
  1188. * @param dest A buffer for the result string. The result will be zero-terminated if
  1189. * the buffer is large enough.
  1190. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1191. * dest may be NULL and the function will only return the length of the
  1192. * result without writing any of the result string (pre-flighting).
  1193. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1194. * pDestLength!=NULL then *pDestLength is always set to the
  1195. * number of output units corresponding to the transformation of
  1196. * all the input units, even in case of a buffer overflow.
  1197. * @param src The original source string
  1198. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1199. * @param pErrorCode Must be a valid pointer to an error code value,
  1200. * which must not indicate a failure before the function call.
  1201. * @return The pointer to destination buffer.
  1202. * @stable ICU 2.0
  1203. * @see u_strToUTF8WithSub
  1204. * @see u_strFromUTF8
  1205. */
  1206. U_STABLE char* U_EXPORT2
  1207. u_strToUTF8(char *dest,
  1208. int32_t destCapacity,
  1209. int32_t *pDestLength,
  1210. const UChar *src,
  1211. int32_t srcLength,
  1212. UErrorCode *pErrorCode);
  1213. /**
  1214. * Convert a UTF-8 string to UTF-16.
  1215. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1216. *
  1217. * @param dest A buffer for the result string. The result will be zero-terminated if
  1218. * the buffer is large enough.
  1219. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1220. * dest may be NULL and the function will only return the length of the
  1221. * result without writing any of the result string (pre-flighting).
  1222. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1223. * pDestLength!=NULL then *pDestLength is always set to the
  1224. * number of output units corresponding to the transformation of
  1225. * all the input units, even in case of a buffer overflow.
  1226. * @param src The original source string
  1227. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1228. * @param pErrorCode Must be a valid pointer to an error code value,
  1229. * which must not indicate a failure before the function call.
  1230. * @return The pointer to destination buffer.
  1231. * @stable ICU 2.0
  1232. * @see u_strFromUTF8WithSub
  1233. * @see u_strFromUTF8Lenient
  1234. */
  1235. U_STABLE UChar* U_EXPORT2
  1236. u_strFromUTF8(UChar *dest,
  1237. int32_t destCapacity,
  1238. int32_t *pDestLength,
  1239. const char *src,
  1240. int32_t srcLength,
  1241. UErrorCode *pErrorCode);
  1242. /**
  1243. * Convert a UTF-16 string to UTF-8.
  1244. * If the input string is not well-formed and subchar==U_SENTINEL, then the U_INVALID_CHAR_FOUND error code is set.
  1245. *
  1246. * Same as u_strToUTF8() except for the additional subchar which is output for
  1247. * illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1248. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
  1249. *
  1250. * @param dest A buffer for the result string. The result will be zero-terminated if
  1251. * the buffer is large enough.
  1252. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1253. * dest may be NULL and the function will only return the length of the
  1254. * result without writing any of the result string (pre-flighting).
  1255. * @param pDestLength A pointer to receive the number of units written to the destination. If
  1256. * pDestLength!=NULL then *pDestLength is always set to the
  1257. * number of output units corresponding to the transformation of
  1258. * all the input units, even in case of a buffer overflow.
  1259. * @param src The original source string
  1260. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1261. * @param subchar The substitution character to use in place of an illegal input sequence,
  1262. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1263. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1264. * except for surrogate code points (U+D800..U+DFFF).
  1265. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1266. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1267. * Set to 0 if no substitutions occur or subchar<0.
  1268. * pNumSubstitutions can be NULL.
  1269. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1270. * pass the U_SUCCESS() test, or else the function returns
  1271. * immediately. Check for U_FAILURE() on output or use with
  1272. * function chaining. (See User Guide for details.)
  1273. * @return The pointer to destination buffer.
  1274. * @see u_strToUTF8
  1275. * @see u_strFromUTF8WithSub
  1276. * @stable ICU 3.6
  1277. */
  1278. U_STABLE char* U_EXPORT2
  1279. u_strToUTF8WithSub(char *dest,
  1280. int32_t destCapacity,
  1281. int32_t *pDestLength,
  1282. const UChar *src,
  1283. int32_t srcLength,
  1284. UChar32 subchar, int32_t *pNumSubstitutions,
  1285. UErrorCode *pErrorCode);
  1286. /**
  1287. * Convert a UTF-8 string to UTF-16, optionally replacing illegal input with a substitution character.
  1288. * If the input string is not well-formed and subchar==U_SENTINEL, then the U_INVALID_CHAR_FOUND error code is set.
  1289. *
  1290. * Same as u_strFromUTF8() except for the additional subchar, which is output in place of
  1291. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1292. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
  1293. *
  1294. * @param dest A buffer for the result string. The result will be zero-terminated if
  1295. * the buffer is large enough.
  1296. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1297. * dest may be NULL and the function will only return the length of the
  1298. * result without writing any of the result string (pre-flighting).
  1299. * @param pDestLength A pointer to receive the length of the result, in output units (UChars). If
  1300. * pDestLength!=NULL then *pDestLength is always set to the
  1301. * number of output units corresponding to the transformation of
  1302. * all the input units, even in case of a buffer overflow.
  1303. * @param src The original source string
  1304. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1305. * @param subchar The substitution character to use in place of an illegal input sequence,
  1306. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1307. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1308. * except for surrogate code points (U+D800..U+DFFF).
  1309. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1310. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1311. * Set to 0 if no substitutions occur or subchar<0.
  1312. * pNumSubstitutions can be NULL.
  1313. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1314. * pass the U_SUCCESS() test, or else the function returns
  1315. * immediately. Check for U_FAILURE() on output or use with
  1316. * function chaining. (See User Guide for details.)
  1317. * @return The pointer to the destination buffer.
  1318. * @see u_strFromUTF8
  1319. * @see u_strFromUTF8Lenient
  1320. * @see u_strToUTF8WithSub
  1321. * @stable ICU 3.6
  1322. */
  1323. U_STABLE UChar* U_EXPORT2
  1324. u_strFromUTF8WithSub(UChar *dest,
  1325. int32_t destCapacity,
  1326. int32_t *pDestLength,
  1327. const char *src,
  1328. int32_t srcLength,
  1329. UChar32 subchar, int32_t *pNumSubstitutions,
  1330. UErrorCode *pErrorCode);
  1331. /**
  1332. * Convert a UTF-8 string to UTF-16 without strict validation of the input.
  1333. *
  1334. * Same as u_strFromUTF8() except that this function is designed to be very fast,
  1335. * which it achieves by being lenient about malformed UTF-8 sequences.
  1336. * This function is intended for use in environments where UTF-8 text is
  1337. * expected to be well-formed.
  1338. *
  1339. * Its semantics are:
  1340. * - Well-formed UTF-8 text is correctly converted to well-formed UTF-16 text.
  1341. * - The function will not read beyond the input string, nor write beyond
  1342. * the destCapacity.
  1343. * - Malformed UTF-8 results in "garbage" 16-bit Unicode strings which may not
  1344. * be well-formed UTF-16.
  1345. * The function will resynchronize to valid code point boundaries
  1346. * within a small number of code points after an illegal sequence.
  1347. * - Non-shortest forms are not detected and will result in "spoofing" output.
  1348. *
  1349. * For further performance improvement, if srcLength is given (>=0),
  1350. * then the caller must ensure that destCapacity>=srcLength.
  1351. *
  1352. * There is no inverse u_strToUTF8Lenient() function because there is practically
  1353. * no performance gain from not checking that a UTF-16 string is well-formed.
  1354. *
  1355. * @param dest A buffer for the result string. The result will be zero-terminated if
  1356. * the buffer is large enough.
  1357. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1358. * dest may be NULL and the function will only return the length of the
  1359. * result without writing any of the result string (pre-flighting).
  1360. * Unlike for other ICU functions, if srcLength>=0 then
  1361. * destCapacity must be at least srcLength (destCapacity>=srcLength).
  1362. * @param pDestLength A pointer to receive the length of the result, in output units (UChars). If
  1363. * pDestLength!=NULL then *pDestLength is always set to the
  1364. * number of output units corresponding to the transformation of
  1365. * all the input units, even in case of a buffer overflow.
  1366. * Unlike for other ICU functions, if srcLength>=0 but
  1367. * destCapacity<srcLength, then *pDestLength will be set to srcLength
  1368. * (and U_BUFFER_OVERFLOW_ERROR will be set)
  1369. * regardless of the actual result length.
  1370. * @param src The original source string
  1371. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1372. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1373. * pass the U_SUCCESS() test, or else the function returns
  1374. * immediately. Check for U_FAILURE() on output or use with
  1375. * function chaining. (See User Guide for details.)
  1376. * @return The pointer to the destination buffer.
  1377. * @see u_strFromUTF8
  1378. * @see u_strFromUTF8WithSub
  1379. * @see u_strToUTF8WithSub
  1380. * @stable ICU 3.6
  1381. */
  1382. U_STABLE UChar * U_EXPORT2
  1383. u_strFromUTF8Lenient(UChar *dest,
  1384. int32_t destCapacity,
  1385. int32_t *pDestLength,
  1386. const char *src,
  1387. int32_t srcLength,
  1388. UErrorCode *pErrorCode);
  1389. /**
  1390. * Convert a UTF-16 string to UTF-32.
  1391. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1392. *
  1393. * @param dest A buffer for the result string. The result will be zero-terminated if
  1394. * the buffer is large enough.
  1395. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1396. * dest may be NULL and the function will only return the length of the
  1397. * result without writing any of the result string (pre-flighting).
  1398. * @param pDestLength A pointer to receive the length of the result, in output units (UChar32s). If
  1399. * pDestLength!=NULL then *pDestLength is always set to the
  1400. * number of output units corresponding to the transformation of
  1401. * all the input units, even in case of a buffer overflow.
  1402. * @param src The original source string
  1403. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1404. * @param pErrorCode Must be a valid pointer to an error code value,
  1405. * which must not indicate a failure before the function call.
  1406. * @return The pointer to the destination buffer.
  1407. * @see u_strToUTF32WithSub
  1408. * @see u_strFromUTF32
  1409. * @stable ICU 2.0
  1410. */
  1411. U_STABLE UChar32* U_EXPORT2
  1412. u_strToUTF32(UChar32 *dest,
  1413. int32_t destCapacity,
  1414. int32_t *pDestLength,
  1415. const UChar *src,
  1416. int32_t srcLength,
  1417. UErrorCode *pErrorCode);
  1418. /**
  1419. * Convert a UTF-32 string to UTF-16.
  1420. * If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
  1421. *
  1422. * @param dest A buffer for the result string. The result will be zero-terminated if
  1423. * the buffer is large enough.
  1424. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1425. * dest may be NULL and the function will only return the length of the
  1426. * result without writing any of the result string (pre-flighting).
  1427. * @param pDestLength A pointer to receive the length of the result, in output units (UChars). If
  1428. * pDestLength!=NULL then *pDestLength is always set to the
  1429. * number of output units corresponding to the transformation of
  1430. * all the input units, even in case of a buffer overflow.
  1431. * @param src The original source string
  1432. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1433. * @param pErrorCode Must be a valid pointer to an error code value,
  1434. * which must not indicate a failure before the function call.
  1435. * @return The pointer to the destination buffer.
  1436. * @see u_strFromUTF32WithSub
  1437. * @see u_strToUTF32
  1438. * @stable ICU 2.0
  1439. */
  1440. U_STABLE UChar* U_EXPORT2
  1441. u_strFromUTF32(UChar *dest,
  1442. int32_t destCapacity,
  1443. int32_t *pDestLength,
  1444. const UChar32 *src,
  1445. int32_t srcLength,
  1446. UErrorCode *pErrorCode);
  1447. /**
  1448. * Convert a UTF-16 string to UTF-32, optionally replacing illegal input with a substitution character.
  1449. * If the input string is not well-formed and subchar==U_SENTINEL, then the U_INVALID_CHAR_FOUND error code is set.
  1450. *
  1451. * Same as u_strToUTF32() except for the additional subchar, which is output in place of
  1452. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1453. * With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
  1454. *
  1455. * @param dest A buffer for the result string. The result will be zero-terminated if
  1456. * the buffer is large enough.
  1457. * @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
  1458. * dest may be NULL and the function will only return the length of the
  1459. * result without writing any of the result string (pre-flighting).
  1460. * @param pDestLength A pointer to receive the length of the result, in output units (UChar32s). If
  1461. * pDestLength!=NULL then *pDestLength is always set to the
  1462. * number of output units corresponding to the transformation of
  1463. * all the input units, even in case of a buffer overflow.
  1464. * @param src The original source string
  1465. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1466. * @param subchar The substitution character to use in place of an illegal input sequence,
  1467. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1468. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1469. * except for surrogate code points (U+D800..U+DFFF).
  1470. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1471. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1472. * Set to 0 if no substitutions occur or subchar<0.
  1473. * pNumSubstitutions can be NULL.
  1474. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1475. * pass the U_SUCCESS() test, or else the function returns
  1476. * immediately. Check for U_FAILURE() on output or use with
  1477. * function chaining. (See User Guide for details.)
  1478. * @return The pointer to the destination buffer.
  1479. * @see u_strToUTF32
  1480. * @see u_strFromUTF32WithSub
  1481. * @stable ICU 4.2
  1482. */
  1483. U_STABLE UChar32* U_EXPORT2
  1484. u_strToUTF32WithSub(UChar32 *dest,
  1485. int32_t destCapacity,
  1486. int32_t *pDestLength,
  1487. const UChar *src,
  1488. int32_t srcLength,
  1489. UChar32 subchar, int32_t *pNumSubstitutions,
  1490. UErrorCode *pErrorCode);
  1491. /**
  1492. * Convert a UTF-32 string to UTF-16, optionally replacing illegal input with a substitution character.
  1493. * If the input string is not well-formed and subchar==U_SENTINEL, then the U_INVALID_CHAR_FOUND error code is set.
  1494. *
  1495. * Same as u_strFromUTF32() except for the additional subchar, which is output in place of
  1496. * illegal input sequences instead of stopping with the U_INVALID_CHAR_FOUND error code.
  1497. * With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
  1498. *
  1499. * @param dest A buffer for the result string. The result will be zero-terminated if
  1500. * the buffer is large enough.
  1501. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1502. * dest may be NULL and the function will only return the length of the
  1503. * result without writing any of the result string (pre-flighting).
  1504. * @param pDestLength A pointer to receive the length of the result, in output units (UChars). If
  1505. * pDestLength!=NULL then *pDestLength is always set to the
  1506. * number of output units corresponding to the transformation of
  1507. * all the input units, even in case of a buffer overflow.
  1508. * @param src The original source string
  1509. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1510. * @param subchar The substitution character to use in place of an illegal input sequence,
  1511. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1512. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1513. * except for surrogate code points (U+D800..U+DFFF).
  1514. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1515. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1516. * Set to 0 if no substitutions occur or subchar<0.
  1517. * pNumSubstitutions can be NULL.
  1518. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1519. * pass the U_SUCCESS() test, or else the function returns
  1520. * immediately. Check for U_FAILURE() on output or use with
  1521. * function chaining. (See User Guide for details.)
  1522. * @return The pointer to the destination buffer.
  1523. * @see u_strFromUTF32
  1524. * @see u_strToUTF32WithSub
  1525. * @stable ICU 4.2
  1526. */
  1527. U_STABLE UChar* U_EXPORT2
  1528. u_strFromUTF32WithSub(UChar *dest,
  1529. int32_t destCapacity,
  1530. int32_t *pDestLength,
  1531. const UChar32 *src,
  1532. int32_t srcLength,
  1533. UChar32 subchar, int32_t *pNumSubstitutions,
  1534. UErrorCode *pErrorCode);
  1535. /**
  1536. * Convert a 16-bit Unicode string to Java Modified UTF-8.
  1537. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
  1538. *
  1539. * This function behaves according to the documentation for Java DataOutput.writeUTF()
  1540. * except that it does not encode the output length in the destination buffer
  1541. * and does not have an output length restriction.
  1542. * See http://java.sun.com/javase/6/docs/api/java/io/DataOutput.html#writeUTF(java.lang.String)
  1543. *
  1544. * The input string need not be well-formed UTF-16.
  1545. * (Therefore there is no subchar parameter.)
  1546. *
  1547. * @param dest A buffer for the result string. The result will be zero-terminated if
  1548. * the buffer is large enough.
  1549. * @param destCapacity The size of the buffer (number of chars). If it is 0, then
  1550. * dest may be NULL and the function will only return the length of the
  1551. * result without writing any of the result string (pre-flighting).
  1552. * @param pDestLength A pointer to receive the length of the result, in output units (chars). If
  1553. * pDestLength!=NULL then *pDestLength is always set to the
  1554. * number of output units corresponding to the transformation of
  1555. * all the input units, even in case of a buffer overflow.
  1556. * @param src The original source string
  1557. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1558. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1559. * pass the U_SUCCESS() test, or else the function returns
  1560. * immediately. Check for U_FAILURE() on output or use with
  1561. * function chaining. (See User Guide for details.)
  1562. * @return The pointer to the destination buffer.
  1563. * @stable ICU 4.4
  1564. * @see u_strToUTF8WithSub
  1565. * @see u_strFromJavaModifiedUTF8WithSub
  1566. */
  1567. U_STABLE char* U_EXPORT2
  1568. u_strToJavaModifiedUTF8(
  1569. char *dest,
  1570. int32_t destCapacity,
  1571. int32_t *pDestLength,
  1572. const UChar *src,
  1573. int32_t srcLength,
  1574. UErrorCode *pErrorCode);
  1575. /**
  1576. * Convert a Java Modified UTF-8 string to a 16-bit Unicode string.
  1577. * If the input string is not well-formed and subchar==U_SENTINEL, then the U_INVALID_CHAR_FOUND error code is set.
  1578. *
  1579. * This function behaves according to the documentation for Java DataInput.readUTF()
  1580. * except that it takes a length parameter rather than
  1581. * interpreting the first two input bytes as the length.
  1582. * See http://java.sun.com/javase/6/docs/api/java/io/DataInput.html#readUTF()
  1583. *
  1584. * The output string is not necessarily well-formed UTF-16.
  1585. *
  1586. * @param dest A buffer for the result string. The result will be zero-terminated if
  1587. * the buffer is large enough.
  1588. * @param destCapacity The size of the buffer (number of UChars). If it is 0, then
  1589. * dest may be NULL and the function will only return the length of the
  1590. * result without writing any of the result string (pre-flighting).
  1591. * @param pDestLength A pointer to receive the length of the result, in output units (UChars). If
  1592. * pDestLength!=NULL then *pDestLength is always set to the
  1593. * number of output units corresponding to the transformation of
  1594. * all the input units, even in case of a buffer overflow.
  1595. * @param src The original source string
  1596. * @param srcLength The length of the original string. If -1, then src must be zero-terminated.
  1597. * @param subchar The substitution character to use in place of an illegal input sequence,
  1598. * or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
  1599. * A substitution character can be any valid Unicode code point (up to U+10FFFF)
  1600. * except for surrogate code points (U+D800..U+DFFF).
  1601. * The recommended value is U+FFFD "REPLACEMENT CHARACTER".
  1602. * @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
  1603. * Set to 0 if no substitutions occur or subchar<0.
  1604. * pNumSubstitutions can be NULL.
  1605. * @param pErrorCode Pointer to a standard ICU error code. Its input value must
  1606. * pass the U_SUCCESS() test, or else the function returns
  1607. * immediately. Check for U_FAILURE() on output or use with
  1608. * function chaining. (See User Guide for details.)
  1609. * @return The pointer to the destination buffer.
  1610. * @see u_strFromUTF8WithSub
  1611. * @see u_strFromUTF8Lenient
  1612. * @see u_strToJavaModifiedUTF8
  1613. * @stable ICU 4.4
  1614. */
  1615. U_STABLE UChar* U_EXPORT2
  1616. u_strFromJavaModifiedUTF8WithSub(
  1617. UChar *dest,
  1618. int32_t destCapacity,
  1619. int32_t *pDestLength,
  1620. const char *src,
  1621. int32_t srcLength,
  1622. UChar32 subchar, int32_t *pNumSubstitutions,
  1623. UErrorCode *pErrorCode);
  1624. #endif