uscript.h 15 KB


  1. /*
  2. **********************************************************************
  3. * Copyright (C) 1997-2010, International Business Machines
  4. * Corporation and others. All Rights Reserved.
  5. **********************************************************************
  6. *
  7. * File USCRIPT.H
  8. *
  9. * Modification History:
  10. *
  11. * Date Name Description
  12. * 07/06/2001 Ram Creation.
  13. ******************************************************************************
  14. */
  15. #ifndef USCRIPT_H
  16. #define USCRIPT_H
  17. #include "unicode/utypes.h"
  18. /**
  19. * \file
  20. * \brief C API: Unicode Script Information
  21. */
  22. /**
  23. * Constants for ISO 15924 script codes.
  24. *
  25. * Many of these script codes - those from Unicode's ScriptNames.txt -
  26. * are character property values for Unicode's Script property.
  27. * See UAX #24 Script Names (http://www.unicode.org/reports/tr24/).
  28. *
  29. * Starting with ICU 3.6, constants for most ISO 15924 script codes
  30. * are included (currently excluding private-use codes Qaaa..Qabx).
  31. * For scripts for which there are codes in ISO 15924 but which are not
  32. * used in the Unicode Character Database (UCD), there are no Unicode characters
  33. * associated with those scripts.
  34. *
  35. * For example, there are no characters that have a UCD script code of
  36. * Hans or Hant. All Han ideographs have the Hani script code.
  37. * The Hans and Hant script codes are used with CLDR data.
  38. *
  39. * ISO 15924 script codes are included for use with CLDR and similar.
  40. *
  41. * @stable ICU 2.2
  42. */
  43. typedef enum UScriptCode {
  44. USCRIPT_INVALID_CODE = -1,
  45. USCRIPT_COMMON = 0, /* Zyyy */
  46. USCRIPT_INHERITED = 1, /* Zinh */ /* "Code for inherited script", for non-spacing combining marks; also Qaai */
  47. USCRIPT_ARABIC = 2, /* Arab */
  48. USCRIPT_ARMENIAN = 3, /* Armn */
  49. USCRIPT_BENGALI = 4, /* Beng */
  50. USCRIPT_BOPOMOFO = 5, /* Bopo */
  51. USCRIPT_CHEROKEE = 6, /* Cher */
  52. USCRIPT_COPTIC = 7, /* Copt */
  53. USCRIPT_CYRILLIC = 8, /* Cyrl */
  54. USCRIPT_DESERET = 9, /* Dsrt */
  55. USCRIPT_DEVANAGARI = 10, /* Deva */
  56. USCRIPT_ETHIOPIC = 11, /* Ethi */
  57. USCRIPT_GEORGIAN = 12, /* Geor */
  58. USCRIPT_GOTHIC = 13, /* Goth */
  59. USCRIPT_GREEK = 14, /* Grek */
  60. USCRIPT_GUJARATI = 15, /* Gujr */
  61. USCRIPT_GURMUKHI = 16, /* Guru */
  62. USCRIPT_HAN = 17, /* Hani */
  63. USCRIPT_HANGUL = 18, /* Hang */
  64. USCRIPT_HEBREW = 19, /* Hebr */
  65. USCRIPT_HIRAGANA = 20, /* Hira */
  66. USCRIPT_KANNADA = 21, /* Knda */
  67. USCRIPT_KATAKANA = 22, /* Kana */
  68. USCRIPT_KHMER = 23, /* Khmr */
  69. USCRIPT_LAO = 24, /* Laoo */
  70. USCRIPT_LATIN = 25, /* Latn */
  71. USCRIPT_MALAYALAM = 26, /* Mlym */
  72. USCRIPT_MONGOLIAN = 27, /* Mong */
  73. USCRIPT_MYANMAR = 28, /* Mymr */
  74. USCRIPT_OGHAM = 29, /* Ogam */
  75. USCRIPT_OLD_ITALIC = 30, /* Ital */
  76. USCRIPT_ORIYA = 31, /* Orya */
  77. USCRIPT_RUNIC = 32, /* Runr */
  78. USCRIPT_SINHALA = 33, /* Sinh */
  79. USCRIPT_SYRIAC = 34, /* Syrc */
  80. USCRIPT_TAMIL = 35, /* Taml */
  81. USCRIPT_TELUGU = 36, /* Telu */
  82. USCRIPT_THAANA = 37, /* Thaa */
  83. USCRIPT_THAI = 38, /* Thai */
  84. USCRIPT_TIBETAN = 39, /* Tibt */
  85. /** Canadian_Aboriginal script. @stable ICU 2.6 */
  86. USCRIPT_CANADIAN_ABORIGINAL = 40, /* Cans */
  87. /** Canadian_Aboriginal script (alias). @stable ICU 2.2 */
  88. USCRIPT_UCAS = USCRIPT_CANADIAN_ABORIGINAL,
  89. USCRIPT_YI = 41, /* Yiii */
  90. USCRIPT_TAGALOG = 42, /* Tglg */
  91. USCRIPT_HANUNOO = 43, /* Hano */
  92. USCRIPT_BUHID = 44, /* Buhd */
  93. USCRIPT_TAGBANWA = 45, /* Tagb */
  94. /* New scripts in Unicode 4 @stable ICU 2.6 */
  95. USCRIPT_BRAILLE = 46, /* Brai */
  96. USCRIPT_CYPRIOT = 47, /* Cprt */
  97. USCRIPT_LIMBU = 48, /* Limb */
  98. USCRIPT_LINEAR_B = 49, /* Linb */
  99. USCRIPT_OSMANYA = 50, /* Osma */
  100. USCRIPT_SHAVIAN = 51, /* Shaw */
  101. USCRIPT_TAI_LE = 52, /* Tale */
  102. USCRIPT_UGARITIC = 53, /* Ugar */
  103. /** New script code in Unicode 4.0.1 @stable ICU 3.0 */
  104. USCRIPT_KATAKANA_OR_HIRAGANA = 54,/* Hrkt */
  105. /* New scripts in Unicode 4.1 @stable ICU 3.4 */
  106. USCRIPT_BUGINESE = 55, /* Bugi */
  107. USCRIPT_GLAGOLITIC = 56, /* Glag */
  108. USCRIPT_KHAROSHTHI = 57, /* Khar */
  109. USCRIPT_SYLOTI_NAGRI = 58, /* Sylo */
  110. USCRIPT_NEW_TAI_LUE = 59, /* Talu */
  111. USCRIPT_TIFINAGH = 60, /* Tfng */
  112. USCRIPT_OLD_PERSIAN = 61, /* Xpeo */
  113. /* New script codes from ISO 15924 @stable ICU 3.6 */
  114. USCRIPT_BALINESE = 62, /* Bali */
  115. USCRIPT_BATAK = 63, /* Batk */
  116. USCRIPT_BLISSYMBOLS = 64, /* Blis */
  117. USCRIPT_BRAHMI = 65, /* Brah */
  118. USCRIPT_CHAM = 66, /* Cham */
  119. USCRIPT_CIRTH = 67, /* Cirt */
  120. USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC = 68, /* Cyrs */
  121. USCRIPT_DEMOTIC_EGYPTIAN = 69, /* Egyd */
  122. USCRIPT_HIERATIC_EGYPTIAN = 70, /* Egyh */
  123. USCRIPT_EGYPTIAN_HIEROGLYPHS = 71, /* Egyp */
  124. USCRIPT_KHUTSURI = 72, /* Geok */
  125. USCRIPT_SIMPLIFIED_HAN = 73, /* Hans */
  126. USCRIPT_TRADITIONAL_HAN = 74, /* Hant */
  127. USCRIPT_PAHAWH_HMONG = 75, /* Hmng */
  128. USCRIPT_OLD_HUNGARIAN = 76, /* Hung */
  129. USCRIPT_HARAPPAN_INDUS = 77, /* Inds */
  130. USCRIPT_JAVANESE = 78, /* Java */
  131. USCRIPT_KAYAH_LI = 79, /* Kali */
  132. USCRIPT_LATIN_FRAKTUR = 80, /* Latf */
  133. USCRIPT_LATIN_GAELIC = 81, /* Latg */
  134. USCRIPT_LEPCHA = 82, /* Lepc */
  135. USCRIPT_LINEAR_A = 83, /* Lina */
  136. /** @stable ICU 4.6 */
  137. USCRIPT_MANDAIC = 84, /* Mand */
  138. /** @stable ICU 3.6 */
  139. USCRIPT_MANDAEAN = USCRIPT_MANDAIC,
  140. USCRIPT_MAYAN_HIEROGLYPHS = 85, /* Maya */
  141. /** @stable ICU 4.6 */
  142. USCRIPT_MEROITIC_HIEROGLYPHS = 86, /* Mero */
  143. /** @stable ICU 3.6 */
  144. USCRIPT_MEROITIC = USCRIPT_MEROITIC_HIEROGLYPHS,
  145. USCRIPT_NKO = 87, /* Nkoo */
  146. USCRIPT_ORKHON = 88, /* Orkh */
  147. USCRIPT_OLD_PERMIC = 89, /* Perm */
  148. USCRIPT_PHAGS_PA = 90, /* Phag */
  149. USCRIPT_PHOENICIAN = 91, /* Phnx */
  150. USCRIPT_PHONETIC_POLLARD = 92, /* Plrd */
  151. USCRIPT_RONGORONGO = 93, /* Roro */
  152. USCRIPT_SARATI = 94, /* Sara */
  153. USCRIPT_ESTRANGELO_SYRIAC = 95, /* Syre */
  154. USCRIPT_WESTERN_SYRIAC = 96, /* Syrj */
  155. USCRIPT_EASTERN_SYRIAC = 97, /* Syrn */
  156. USCRIPT_TENGWAR = 98, /* Teng */
  157. USCRIPT_VAI = 99, /* Vaii */
  158. USCRIPT_VISIBLE_SPEECH = 100,/* Visp */
  159. USCRIPT_CUNEIFORM = 101,/* Xsux */
  160. USCRIPT_UNWRITTEN_LANGUAGES = 102,/* Zxxx */
  161. USCRIPT_UNKNOWN = 103,/* Zzzz */ /* Unknown="Code for uncoded script", for unassigned code points */
  162. /* New script codes from ISO 15924 @stable ICU 3.8 */
  163. USCRIPT_CARIAN = 104,/* Cari */
  164. USCRIPT_JAPANESE = 105,/* Jpan */
  165. USCRIPT_LANNA = 106,/* Lana */
  166. USCRIPT_LYCIAN = 107,/* Lyci */
  167. USCRIPT_LYDIAN = 108,/* Lydi */
  168. USCRIPT_OL_CHIKI = 109,/* Olck */
  169. USCRIPT_REJANG = 110,/* Rjng */
  170. USCRIPT_SAURASHTRA = 111,/* Saur */
  171. USCRIPT_SIGN_WRITING = 112,/* Sgnw */
  172. USCRIPT_SUNDANESE = 113,/* Sund */
  173. USCRIPT_MOON = 114,/* Moon */
  174. USCRIPT_MEITEI_MAYEK = 115,/* Mtei */
  175. /* New script codes from ISO 15924 @stable ICU 4.0 */
  176. USCRIPT_IMPERIAL_ARAMAIC = 116,/* Armi */
  177. USCRIPT_AVESTAN = 117,/* Avst */
  178. USCRIPT_CHAKMA = 118,/* Cakm */
  179. USCRIPT_KOREAN = 119,/* Kore */
  180. USCRIPT_KAITHI = 120,/* Kthi */
  181. USCRIPT_MANICHAEAN = 121,/* Mani */
  182. USCRIPT_INSCRIPTIONAL_PAHLAVI = 122,/* Phli */
  183. USCRIPT_PSALTER_PAHLAVI = 123,/* Phlp */
  184. USCRIPT_BOOK_PAHLAVI = 124,/* Phlv */
  185. USCRIPT_INSCRIPTIONAL_PARTHIAN = 125,/* Prti */
  186. USCRIPT_SAMARITAN = 126,/* Samr */
  187. USCRIPT_TAI_VIET = 127,/* Tavt */
  188. USCRIPT_MATHEMATICAL_NOTATION = 128,/* Zmth */
  189. USCRIPT_SYMBOLS = 129,/* Zsym */
  190. /* New script codes from ISO 15924 @stable ICU 4.4 */
  191. USCRIPT_BAMUM = 130,/* Bamu */
  192. USCRIPT_LISU = 131,/* Lisu */
  193. USCRIPT_NAKHI_GEBA = 132,/* Nkgb */
  194. USCRIPT_OLD_SOUTH_ARABIAN = 133,/* Sarb */
  195. /* New script codes from ISO 15924 @stable ICU 4.6 */
  196. USCRIPT_BASSA_VAH = 134,/* Bass */
  197. USCRIPT_DUPLOYAN_SHORTAND = 135,/* Dupl */ /* NOTE(review): "SHORTAND" is presumably a misspelling of "SHORTHAND"; the identifier is kept as-is because renaming it would change the public enum */
  198. USCRIPT_ELBASAN = 136,/* Elba */
  199. USCRIPT_GRANTHA = 137,/* Gran */
  200. USCRIPT_KPELLE = 138,/* Kpel */
  201. USCRIPT_LOMA = 139,/* Loma */
  202. USCRIPT_MENDE = 140,/* Mend */
  203. USCRIPT_MEROITIC_CURSIVE = 141,/* Merc */
  204. USCRIPT_OLD_NORTH_ARABIAN = 142,/* Narb */
  205. USCRIPT_NABATAEAN = 143,/* Nbat */
  206. USCRIPT_PALMYRENE = 144,/* Palm */
  207. USCRIPT_SINDHI = 145,/* Sind */
  208. USCRIPT_WARANG_CITI = 146,/* Wara */
  209. /* Private use codes from Qaaa - Qabx are not supported */
  210. USCRIPT_CODE_LIMIT = 147 /* one more than the highest script code defined above (146) */
  211. } UScriptCode;
  212. /**
  213. * Gets script codes associated with the given locale or ISO 15924 abbreviation or name.
  214. * Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym".
  215. * Fills in USCRIPT_LATIN given "en" OR "en_US".
  216. * If the required capacity is greater than the capacity of the destination buffer, then the error code
  217. * is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned.
  218. *
  219. * <p>Note: To search by short or long script alias only, use
  220. * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. This does
  221. * a fast lookup with no access of the locale data.
  222. * @param nameOrAbbrOrLocale name of the script, as given in
  223. * PropertyValueAliases.txt, or ISO 15924 code or locale
  224. * @param fillIn the UScriptCode buffer to fill in the script code
  225. * @param capacity the capacity (size) of the UScriptCode buffer passed in.
  226. * @param err the error status code.
  227. * @return The number of script codes filled in the buffer passed in, or the required capacity on U_BUFFER_OVERFLOW_ERROR
  228. * @stable ICU 2.4
  229. */
  230. U_STABLE int32_t U_EXPORT2
  231. uscript_getCode(const char* nameOrAbbrOrLocale,UScriptCode* fillIn,int32_t capacity,UErrorCode *err);
  232. /**
  233. * Gets the long script name associated with the given script code.
  234. * Returns "Malayalam" given USCRIPT_MALAYALAM
  235. * @param scriptCode UScriptCode enum
  236. * @return script long name as given in
  237. * PropertyValueAliases.txt, or NULL if scriptCode is invalid
  238. * @stable ICU 2.4
  239. */
  240. U_STABLE const char* U_EXPORT2
  241. uscript_getName(UScriptCode scriptCode);
  242. /**
  243. * Gets the short (abbreviated) script name associated with the given script code.
  244. * Returns "Mlym" given USCRIPT_MALAYALAM
  245. * @param scriptCode UScriptCode enum
  246. * @return script abbreviated name as given in
  247. * PropertyValueAliases.txt, or NULL if scriptCode is invalid
  248. * @stable ICU 2.4
  249. */
  250. U_STABLE const char* U_EXPORT2
  251. uscript_getShortName(UScriptCode scriptCode);
  252. /**
  253. * Gets the script code associated with the given codepoint.
  254. * Returns USCRIPT_MALAYALAM given 0x0D02
  255. * @param codepoint UChar32 codepoint
  256. * @param err the error status code.
  257. * @return The UScriptCode, or 0 if codepoint is invalid (NOTE(review): 0 is also the value of USCRIPT_COMMON; presumably err must be checked to tell the two apart - TODO confirm)
  258. * @stable ICU 2.4
  259. */
  260. U_STABLE UScriptCode U_EXPORT2
  261. uscript_getScript(UChar32 codepoint, UErrorCode *err);
  262. /**
  263. * Is code point c used in script sc?
  264. * That is, does code point c have the Script property value sc,
  265. * or do code point c's Script_Extensions include script code sc?
  266. *
  267. * Some characters are commonly used in multiple scripts.
  268. * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
  269. *
  270. * The Script_Extensions property is provisional. It may be modified or removed
  271. * in future versions of the Unicode Standard, and thus in ICU.
  272. * @param c code point to test
  273. * @param sc script code to look for
  274. * @return TRUE if Script(c)==sc or sc is in Script_Extensions(c)
  275. * @draft ICU 4.6
  276. */
  277. U_DRAFT UBool U_EXPORT2
  278. uscript_hasScript(UChar32 c, UScriptCode sc);
  279. /**
  280. * Writes code point c's Script_Extensions as a list of UScriptCode values
  281. * to the output scripts array.
  282. *
  283. * Some characters are commonly used in multiple scripts.
  284. * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
  285. *
  286. * If there are more than capacity script codes to be written, then
  287. * U_BUFFER_OVERFLOW_ERROR is set and the number of Script_Extensions is returned.
  288. * (Usual ICU buffer handling behavior.)
  289. *
  290. * The Script_Extensions property is provisional. It may be modified or removed
  291. * in future versions of the Unicode Standard, and thus in ICU.
  292. * @param c code point
  293. * @param scripts output script code array
  294. * @param capacity capacity of the scripts array
  295. * @param pErrorCode Standard ICU error code. Its input value must
  296. * pass the U_SUCCESS() test, or else the function returns
  297. * immediately. Check for U_FAILURE() on output or use with
  298. * function chaining. (See User Guide for details.)
  299. * @return number of script codes in c's Script_Extensions,
  300. * written to scripts unless U_BUFFER_OVERFLOW_ERROR indicates insufficient capacity
  301. * @draft ICU 4.6
  302. */
  303. U_DRAFT int32_t U_EXPORT2
  304. uscript_getScriptExtensions(UChar32 c,
  305. UScriptCode *scripts, int32_t capacity,
  306. UErrorCode *pErrorCode);
  307. #endif