uscript_props.cpp 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2016, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * file name: uscript_props.cpp
  9. * encoding: UTF-8
  10. * tab size: 8 (not used)
  11. * indentation:4
  12. *
  13. * created on: 2013feb16
  14. * created by: Markus W. Scherer
  15. */
  16. #include "unicode/utypes.h"
  17. #include "unicode/unistr.h"
  18. #include "unicode/uscript.h"
  19. #include "unicode/utf16.h"
  20. #include "ustr_imp.h"
  21. #include "cmemory.h"
  22. namespace {
  23. // Script metadata (script properties).
  24. // See http://unicode.org/cldr/trac/browser/trunk/common/properties/scriptMetadata.txt
  25. // 0 = NOT_ENCODED, no sample character, default false script properties.
  26. // Bits 20.. 0: sample character
  27. // Bits 23..21: usage
  28. const int32_t UNKNOWN = 1 << 21;
  29. const int32_t EXCLUSION = 2 << 21;
  30. const int32_t LIMITED_USE = 3 << 21;
  31. // st int32_t ASPIRATIONAL = 4 << 21; -- not used any more since Unicode 10
  32. const int32_t RECOMMENDED = 5 << 21;
  33. // Bits 31..24: Single-bit flags
  34. const int32_t RTL = 1 << 24;
  35. const int32_t LB_LETTERS = 1 << 25;
  36. const int32_t CASED = 1 << 26;
  37. const int32_t SCRIPT_PROPS[] = {
  38. // Begin copy-paste output from
  39. // tools/trunk/unicode/py/parsescriptmetadata.py
  40. 0x0040 | RECOMMENDED, // Zyyy
  41. 0x0308 | RECOMMENDED, // Zinh
  42. 0x0628 | RECOMMENDED | RTL, // Arab
  43. 0x0531 | RECOMMENDED | CASED, // Armn
  44. 0x0995 | RECOMMENDED, // Beng
  45. 0x3105 | RECOMMENDED | LB_LETTERS, // Bopo
  46. 0x13C4 | LIMITED_USE | CASED, // Cher
  47. 0x03E2 | EXCLUSION | CASED, // Copt
  48. 0x042F | RECOMMENDED | CASED, // Cyrl
  49. 0x10414 | EXCLUSION | CASED, // Dsrt
  50. 0x0905 | RECOMMENDED, // Deva
  51. 0x12A0 | RECOMMENDED, // Ethi
  52. 0x10D3 | RECOMMENDED, // Geor
  53. 0x10330 | EXCLUSION, // Goth
  54. 0x03A9 | RECOMMENDED | CASED, // Grek
  55. 0x0A95 | RECOMMENDED, // Gujr
  56. 0x0A15 | RECOMMENDED, // Guru
  57. 0x5B57 | RECOMMENDED | LB_LETTERS, // Hani
  58. 0xAC00 | RECOMMENDED, // Hang
  59. 0x05D0 | RECOMMENDED | RTL, // Hebr
  60. 0x304B | RECOMMENDED | LB_LETTERS, // Hira
  61. 0x0C95 | RECOMMENDED, // Knda
  62. 0x30AB | RECOMMENDED | LB_LETTERS, // Kana
  63. 0x1780 | RECOMMENDED | LB_LETTERS, // Khmr
  64. 0x0EA5 | RECOMMENDED | LB_LETTERS, // Laoo
  65. 0x004C | RECOMMENDED | CASED, // Latn
  66. 0x0D15 | RECOMMENDED, // Mlym
  67. 0x1826 | EXCLUSION, // Mong
  68. 0x1000 | RECOMMENDED | LB_LETTERS, // Mymr
  69. 0x168F | EXCLUSION, // Ogam
  70. 0x10300 | EXCLUSION, // Ital
  71. 0x0B15 | RECOMMENDED, // Orya
  72. 0x16A0 | EXCLUSION, // Runr
  73. 0x0D85 | RECOMMENDED, // Sinh
  74. 0x0710 | LIMITED_USE | RTL, // Syrc
  75. 0x0B95 | RECOMMENDED, // Taml
  76. 0x0C15 | RECOMMENDED, // Telu
  77. 0x078C | RECOMMENDED | RTL, // Thaa
  78. 0x0E17 | RECOMMENDED | LB_LETTERS, // Thai
  79. 0x0F40 | RECOMMENDED, // Tibt
  80. 0x14C0 | LIMITED_USE, // Cans
  81. 0xA288 | LIMITED_USE | LB_LETTERS, // Yiii
  82. 0x1703 | EXCLUSION, // Tglg
  83. 0x1723 | EXCLUSION, // Hano
  84. 0x1743 | EXCLUSION, // Buhd
  85. 0x1763 | EXCLUSION, // Tagb
  86. 0x280E | UNKNOWN, // Brai
  87. 0x10800 | EXCLUSION | RTL, // Cprt
  88. 0x1900 | LIMITED_USE, // Limb
  89. 0x10000 | EXCLUSION, // Linb
  90. 0x10480 | EXCLUSION, // Osma
  91. 0x10450 | EXCLUSION, // Shaw
  92. 0x1950 | LIMITED_USE | LB_LETTERS, // Tale
  93. 0x10380 | EXCLUSION, // Ugar
  94. 0,
  95. 0x1A00 | EXCLUSION, // Bugi
  96. 0x2C00 | EXCLUSION | CASED, // Glag
  97. 0x10A00 | EXCLUSION | RTL, // Khar
  98. 0xA800 | LIMITED_USE, // Sylo
  99. 0x1980 | LIMITED_USE | LB_LETTERS, // Talu
  100. 0x2D30 | LIMITED_USE, // Tfng
  101. 0x103A0 | EXCLUSION, // Xpeo
  102. 0x1B05 | LIMITED_USE, // Bali
  103. 0x1BC0 | LIMITED_USE, // Batk
  104. 0,
  105. 0x11005 | EXCLUSION, // Brah
  106. 0xAA00 | LIMITED_USE, // Cham
  107. 0,
  108. 0,
  109. 0,
  110. 0,
  111. 0x13153 | EXCLUSION, // Egyp
  112. 0,
  113. 0x5B57 | RECOMMENDED | LB_LETTERS, // Hans
  114. 0x5B57 | RECOMMENDED | LB_LETTERS, // Hant
  115. 0x16B1C | EXCLUSION, // Hmng
  116. 0x10CA1 | EXCLUSION | RTL | CASED, // Hung
  117. 0,
  118. 0xA984 | LIMITED_USE, // Java
  119. 0xA90A | LIMITED_USE, // Kali
  120. 0,
  121. 0,
  122. 0x1C00 | LIMITED_USE, // Lepc
  123. 0x10647 | EXCLUSION, // Lina
  124. 0x0840 | LIMITED_USE | RTL, // Mand
  125. 0,
  126. 0x10980 | EXCLUSION | RTL, // Mero
  127. 0x07CA | LIMITED_USE | RTL, // Nkoo
  128. 0x10C00 | EXCLUSION | RTL, // Orkh
  129. 0x1036B | EXCLUSION, // Perm
  130. 0xA840 | EXCLUSION, // Phag
  131. 0x10900 | EXCLUSION | RTL, // Phnx
  132. 0x16F00 | LIMITED_USE, // Plrd
  133. 0,
  134. 0,
  135. 0,
  136. 0,
  137. 0,
  138. 0,
  139. 0xA549 | LIMITED_USE, // Vaii
  140. 0,
  141. 0x12000 | EXCLUSION, // Xsux
  142. 0,
  143. 0xFDD0 | UNKNOWN, // Zzzz
  144. 0x102A0 | EXCLUSION, // Cari
  145. 0x304B | RECOMMENDED | LB_LETTERS, // Jpan
  146. 0x1A20 | LIMITED_USE | LB_LETTERS, // Lana
  147. 0x10280 | EXCLUSION, // Lyci
  148. 0x10920 | EXCLUSION | RTL, // Lydi
  149. 0x1C5A | LIMITED_USE, // Olck
  150. 0xA930 | EXCLUSION, // Rjng
  151. 0xA882 | LIMITED_USE, // Saur
  152. 0x1D850 | EXCLUSION, // Sgnw
  153. 0x1B83 | LIMITED_USE, // Sund
  154. 0,
  155. 0xABC0 | LIMITED_USE, // Mtei
  156. 0x10840 | EXCLUSION | RTL, // Armi
  157. 0x10B00 | EXCLUSION | RTL, // Avst
  158. 0x11103 | LIMITED_USE, // Cakm
  159. 0xAC00 | RECOMMENDED, // Kore
  160. 0x11083 | EXCLUSION, // Kthi
  161. 0x10AD8 | EXCLUSION | RTL, // Mani
  162. 0x10B60 | EXCLUSION | RTL, // Phli
  163. 0x10B8F | EXCLUSION | RTL, // Phlp
  164. 0,
  165. 0x10B40 | EXCLUSION | RTL, // Prti
  166. 0x0800 | EXCLUSION | RTL, // Samr
  167. 0xAA80 | LIMITED_USE | LB_LETTERS, // Tavt
  168. 0,
  169. 0,
  170. 0xA6A0 | LIMITED_USE, // Bamu
  171. 0xA4D0 | LIMITED_USE, // Lisu
  172. 0,
  173. 0x10A60 | EXCLUSION | RTL, // Sarb
  174. 0x16AE6 | EXCLUSION, // Bass
  175. 0x1BC20 | EXCLUSION, // Dupl
  176. 0x10500 | EXCLUSION, // Elba
  177. 0x11315 | EXCLUSION, // Gran
  178. 0,
  179. 0,
  180. 0x1E802 | EXCLUSION | RTL, // Mend
  181. 0x109A0 | EXCLUSION | RTL, // Merc
  182. 0x10A95 | EXCLUSION | RTL, // Narb
  183. 0x10896 | EXCLUSION | RTL, // Nbat
  184. 0x10873 | EXCLUSION | RTL, // Palm
  185. 0x112BE | EXCLUSION, // Sind
  186. 0x118B4 | EXCLUSION | CASED, // Wara
  187. 0,
  188. 0,
  189. 0x16A4F | EXCLUSION, // Mroo
  190. 0x1B1C4 | EXCLUSION | LB_LETTERS, // Nshu
  191. 0x11183 | EXCLUSION, // Shrd
  192. 0x110D0 | EXCLUSION, // Sora
  193. 0x11680 | EXCLUSION, // Takr
  194. 0x18229 | EXCLUSION | LB_LETTERS, // Tang
  195. 0,
  196. 0x14400 | EXCLUSION, // Hluw
  197. 0x11208 | EXCLUSION, // Khoj
  198. 0x11484 | EXCLUSION, // Tirh
  199. 0x10537 | EXCLUSION, // Aghb
  200. 0x11152 | EXCLUSION, // Mahj
  201. 0x11717 | EXCLUSION | LB_LETTERS, // Ahom
  202. 0x108F4 | EXCLUSION | RTL, // Hatr
  203. 0x1160E | EXCLUSION, // Modi
  204. 0x1128F | EXCLUSION, // Mult
  205. 0x11AC0 | EXCLUSION, // Pauc
  206. 0x1158E | EXCLUSION, // Sidd
  207. 0x1E909 | LIMITED_USE | RTL | CASED, // Adlm
  208. 0x11C0E | EXCLUSION, // Bhks
  209. 0x11C72 | EXCLUSION, // Marc
  210. 0x11412 | LIMITED_USE, // Newa
  211. 0x104B5 | LIMITED_USE | CASED, // Osge
  212. 0x5B57 | RECOMMENDED | LB_LETTERS, // Hanb
  213. 0x1112 | RECOMMENDED, // Jamo
  214. 0,
  215. 0x11D10 | EXCLUSION, // Gonm
  216. 0x11A5C | EXCLUSION, // Soyo
  217. 0x11A0B | EXCLUSION, // Zanb
  218. 0x1180B | EXCLUSION, // Dogr
  219. 0x11D71 | LIMITED_USE, // Gong
  220. 0x11EE5 | EXCLUSION, // Maka
  221. 0x16E40 | EXCLUSION | CASED, // Medf
  222. 0x10D12 | LIMITED_USE | RTL, // Rohg
  223. 0x10F42 | EXCLUSION | RTL, // Sogd
  224. 0x10F19 | EXCLUSION | RTL, // Sogo
  225. 0x10FF1 | EXCLUSION | RTL, // Elym
  226. 0x1E108 | LIMITED_USE, // Hmnp
  227. 0x119CE | EXCLUSION, // Nand
  228. 0x1E2E1 | LIMITED_USE, // Wcho
  229. 0x10FBF | EXCLUSION | RTL, // Chrs
  230. 0x1190C | EXCLUSION, // Diak
  231. 0x18C65 | EXCLUSION | LB_LETTERS, // Kits
  232. 0x10E88 | EXCLUSION | RTL, // Yezi
  233. 0x12FE5 | EXCLUSION, // Cpmn
  234. 0x10F7C | EXCLUSION | RTL, // Ougr
  235. 0x16ABC | EXCLUSION, // Tnsa
  236. 0x1E290 | EXCLUSION, // Toto
  237. 0x10582 | EXCLUSION | CASED, // Vith
  238. 0x11F1B | EXCLUSION | LB_LETTERS, // Kawi
  239. 0x1E4E6 | EXCLUSION, // Nagm
  240. // End copy-paste from parsescriptmetadata.py
  241. };
  242. int32_t getScriptProps(UScriptCode script) {
  243. if (0 <= script && script < UPRV_LENGTHOF(SCRIPT_PROPS)) {
  244. return SCRIPT_PROPS[script];
  245. } else {
  246. return 0;
  247. }
  248. }
  249. } // namespace
  250. U_CAPI int32_t U_EXPORT2
  251. uscript_getSampleString(UScriptCode script, char16_t *dest, int32_t capacity, UErrorCode *pErrorCode) {
  252. if(U_FAILURE(*pErrorCode)) { return 0; }
  253. if(capacity < 0 || (capacity > 0 && dest == nullptr)) {
  254. *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  255. return 0;
  256. }
  257. int32_t sampleChar = getScriptProps(script) & 0x1fffff;
  258. int32_t length;
  259. if(sampleChar == 0) {
  260. length = 0;
  261. } else {
  262. length = U16_LENGTH(sampleChar);
  263. if(length <= capacity) {
  264. int32_t i = 0;
  265. U16_APPEND_UNSAFE(dest, i, sampleChar);
  266. }
  267. }
  268. return u_terminateUChars(dest, capacity, length, pErrorCode);
  269. }
  270. U_COMMON_API icu::UnicodeString U_EXPORT2
  271. uscript_getSampleUnicodeString(UScriptCode script) {
  272. icu::UnicodeString sample;
  273. int32_t sampleChar = getScriptProps(script) & 0x1fffff;
  274. if(sampleChar != 0) {
  275. sample.append(sampleChar);
  276. }
  277. return sample;
  278. }
  279. U_CAPI UScriptUsage U_EXPORT2
  280. uscript_getUsage(UScriptCode script) {
  281. return (UScriptUsage)((getScriptProps(script) >> 21) & 7);
  282. }
  283. U_CAPI UBool U_EXPORT2
  284. uscript_isRightToLeft(UScriptCode script) {
  285. return (getScriptProps(script) & RTL) != 0;
  286. }
  287. U_CAPI UBool U_EXPORT2
  288. uscript_breaksBetweenLetters(UScriptCode script) {
  289. return (getScriptProps(script) & LB_LETTERS) != 0;
  290. }
  291. U_CAPI UBool U_EXPORT2
  292. uscript_isCased(UScriptCode script) {
  293. return (getScriptProps(script) & CASED) != 0;
  294. }