unicode.nim 59 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module provides support to handle the Unicode UTF-8 encoding.
  10. {.deadCodeElim: on.} # dce option deprecated
  11. include "system/inclrtl"
  type
    RuneImpl = int32               # integer representation backing ``Rune``
    Rune* = distinct RuneImpl      ## a value able to hold any Unicode character
    Rune16* = distinct int16       ## a 16 bit wide Unicode character
  proc `<=%`*(a, b: Rune): bool =
    ## Unsigned "less than or equal" comparison of the two runes' integer
    ## values.
    result = int(a) <=% int(b)
  proc `<%`*(a, b: Rune): bool =
    ## Unsigned "less than" comparison of the two runes' integer values.
    result = int(a) <% int(b)
  proc `==`*(a, b: Rune): bool =
    ## Two runes are equal when their integer values are equal.
    result = int(a) == int(b)
  template ones(n: untyped): untyped =
    ## Expands to an integer whose lowest ``n`` bits are set.
    ((1 shl n) - 1)
  proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} =
    ## Counts the Unicode characters stored in ``s``.
    ##
    ## Only the lead byte of every sequence is examined to decide how many
    ## bytes to step over (1 to 6); continuation bytes are not validated. A
    ## byte that matches no lead pattern is counted as a character of its own.
    let byteCount = len(s)
    var pos = 0
    while pos < byteCount:
      let lead = ord(s[pos])
      # The run of high bits in the lead byte selects the sequence length.
      let step =
        if lead <=% 127: 1
        elif lead shr 5 == 0b110: 2
        elif lead shr 4 == 0b1110: 3
        elif lead shr 3 == 0b11110: 4
        elif lead shr 2 == 0b111110: 5
        elif lead shr 1 == 0b1111110: 6
        else: 1
      inc(pos, step)
      inc(result)
  proc runeLenAt*(s: string, i: Natural): int =
    ## Returns how many bytes the rune beginning at ``s[i]`` occupies, judged
    ## from the lead byte alone. A byte that matches no lead pattern yields 1.
    let lead = ord(s[i])
    result =
      if lead <=% 127: 1
      elif lead shr 5 == 0b110: 2
      elif lead shr 4 == 0b1110: 3
      elif lead shr 3 == 0b11110: 4
      elif lead shr 2 == 0b111110: 5
      elif lead shr 1 == 0b1111110: 6
      else: 1
  const
    replRune = Rune(0xFFFD)  # rune 0xFFFD, produced by ``fastRuneAt`` for truncated sequences
  template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) =
    ## Decodes the UTF-8 sequence that begins at byte ``s[i]`` and assigns the
    ## code point to ``result``. If ``doInc == true``, ``i`` is advanced by the
    ## number of bytes that were consumed.
    ##
    ## Only the lead byte and the remaining length of ``s`` are inspected;
    ## continuation bytes are not validated. A multi-byte sequence cut short
    ## by the end of ``s`` yields ``replRune`` and consumes a single byte. A
    ## byte that matches no lead pattern is returned as a rune holding that
    ## byte's value.
    bind ones
    if ord(s[i]) <=% 127:
      # single byte (ASCII)
      result = Rune(ord(s[i]))
      when doInc: inc(i)
    elif ord(s[i]) shr 5 == 0b110:
      # two-byte sequence: 5 payload bits + 6 payload bits
      if s.len - i >= 2:
        result = Rune(((ord(s[i]) and ones(5)) shl 6) or
                      (ord(s[i+1]) and ones(6)))
        when doInc: inc(i, 2)
      else:
        result = replRune
        when doInc: inc(i)
    elif ord(s[i]) shr 4 == 0b1110:
      # three-byte sequence: 4 + 6 + 6 payload bits
      if s.len - i >= 3:
        result = Rune(((ord(s[i]) and ones(4)) shl 12) or
                      ((ord(s[i+1]) and ones(6)) shl 6) or
                      (ord(s[i+2]) and ones(6)))
        when doInc: inc(i, 3)
      else:
        result = replRune
        when doInc: inc(i)
    elif ord(s[i]) shr 3 == 0b11110:
      # four-byte sequence: 3 + 6 + 6 + 6 payload bits
      if s.len - i >= 4:
        result = Rune(((ord(s[i]) and ones(3)) shl 18) or
                      ((ord(s[i+1]) and ones(6)) shl 12) or
                      ((ord(s[i+2]) and ones(6)) shl 6) or
                      (ord(s[i+3]) and ones(6)))
        when doInc: inc(i, 4)
      else:
        result = replRune
        when doInc: inc(i)
    elif ord(s[i]) shr 2 == 0b111110:
      # five-byte sequence: 2 + 4 * 6 payload bits
      if s.len - i >= 5:
        result = Rune(((ord(s[i]) and ones(2)) shl 24) or
                      ((ord(s[i+1]) and ones(6)) shl 18) or
                      ((ord(s[i+2]) and ones(6)) shl 12) or
                      ((ord(s[i+3]) and ones(6)) shl 6) or
                      (ord(s[i+4]) and ones(6)))
        when doInc: inc(i, 5)
      else:
        result = replRune
        when doInc: inc(i)
    elif ord(s[i]) shr 1 == 0b1111110:
      # six-byte sequence: 1 + 5 * 6 payload bits
      if s.len - i >= 6:
        result = Rune(((ord(s[i]) and ones(1)) shl 30) or
                      ((ord(s[i+1]) and ones(6)) shl 24) or
                      ((ord(s[i+2]) and ones(6)) shl 18) or
                      ((ord(s[i+3]) and ones(6)) shl 12) or
                      ((ord(s[i+4]) and ones(6)) shl 6) or
                      (ord(s[i+5]) and ones(6)))
        when doInc: inc(i, 6)
      else:
        result = replRune
        when doInc: inc(i)
    else:
      # no lead pattern matched: pass the raw byte value through
      result = Rune(ord(s[i]))
      when doInc: inc(i)
  proc validateUtf8*(s: string): int =
    ## Checks whether ``s`` holds valid UTF-8 data. Returns ``-1`` when it
    ## does, otherwise the byte index at which the offending sequence starts.
    ##
    ## Accepted are sequences of one to four bytes whose continuation bytes
    ## all carry the ``10`` prefix. Two-byte sequences with a lead byte below
    ## ``0xC2`` (overlong encodings of ASCII) are rejected.
    let total = s.len
    var pos = 0
    while pos < total:
      let lead = ord(s[pos])
      # Number of continuation bytes announced by the lead byte.
      var trailing = 0
      if lead <=% 127:
        trailing = 0
      elif lead shr 5 == 0b110:
        if lead < 0xc2: return pos   # overlong representation of ASCII
        trailing = 1
      elif lead shr 4 == 0b1110:
        trailing = 2
      elif lead shr 3 == 0b11110:
        trailing = 3
      else:
        return pos
      # The whole sequence has to fit into the string ...
      if pos + trailing >= total: return pos
      # ... and every continuation byte must look like 10xxxxxx.
      for k in 1 .. trailing:
        if ord(s[pos+k]) shr 6 != 0b10: return pos
      inc(pos, trailing + 1)
    result = -1
  proc runeAt*(s: string, i: Natural): Rune =
    ## Decodes and returns the Unicode character whose encoding starts at
    ## byte index ``i`` of ``s``. Decoding is delegated to ``fastRuneAt``
    ## with index advancing switched off.
    fastRuneAt(s, i, result, false)
  template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
    ## Writes the UTF-8 encoding of `c` into `s`, beginning at byte index
    ## `pos`. If `doInc == true`, `pos` is advanced by the number of bytes
    ## that were written.
    ##
    ## The length of `s` is set so that it ends directly after the written
    ## bytes. For best efficiency make sure `s` has been preallocated with
    ## room for the byte length of `c`.
    ##
    ## A rune whose value is negative is not encodable; nothing is written
    ## for it.
    let cp = RuneImpl(c)
    if cp <=% 127:
      # 1 byte: 0xxxxxxx
      s.setLen(pos+1)
      s[pos+0] = chr(cp)
      when doInc: inc(pos)
    elif cp <=% 0x07FF:
      # 2 bytes: 110xxxxx 10xxxxxx
      s.setLen(pos+2)
      s[pos+0] = chr((cp shr 6) or 0xC0)
      s[pos+1] = chr((cp and ones(6)) or 0x80)
      when doInc: inc(pos, 2)
    elif cp <=% 0xFFFF:
      # 3 bytes: 1110xxxx followed by two continuation bytes
      s.setLen(pos+3)
      s[pos+0] = chr((cp shr 12) or 0xE0)
      s[pos+1] = chr(((cp shr 6) and ones(6)) or 0x80)
      s[pos+2] = chr((cp and ones(6)) or 0x80)
      when doInc: inc(pos, 3)
    elif cp <=% 0x001FFFFF:
      # 4 bytes: 11110xxx followed by three continuation bytes
      s.setLen(pos+4)
      s[pos+0] = chr((cp shr 18) or 0xF0)
      s[pos+1] = chr(((cp shr 12) and ones(6)) or 0x80)
      s[pos+2] = chr(((cp shr 6) and ones(6)) or 0x80)
      s[pos+3] = chr((cp and ones(6)) or 0x80)
      when doInc: inc(pos, 4)
    elif cp <=% 0x03FFFFFF:
      # 5 bytes: 111110xx followed by four continuation bytes
      s.setLen(pos+5)
      s[pos+0] = chr((cp shr 24) or 0xF8)
      s[pos+1] = chr(((cp shr 18) and ones(6)) or 0x80)
      s[pos+2] = chr(((cp shr 12) and ones(6)) or 0x80)
      s[pos+3] = chr(((cp shr 6) and ones(6)) or 0x80)
      s[pos+4] = chr((cp and ones(6)) or 0x80)
      when doInc: inc(pos, 5)
    elif cp <=% 0x7FFFFFFF:
      # 6 bytes: 1111110x followed by five continuation bytes
      s.setLen(pos+6)
      s[pos+0] = chr((cp shr 30) or 0xFC)
      s[pos+1] = chr(((cp shr 24) and ones(6)) or 0x80)
      s[pos+2] = chr(((cp shr 18) and ones(6)) or 0x80)
      s[pos+3] = chr(((cp shr 12) and ones(6)) or 0x80)
      s[pos+4] = chr(((cp shr 6) and ones(6)) or 0x80)
      s[pos+5] = chr((cp and ones(6)) or 0x80)
      when doInc: inc(pos, 6)
    else:
      discard # error, exception?
  proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
    ## Returns the UTF-8 representation of the rune ``c`` as a new string.
    result = ""
    # Encode at offset 0 of the fresh string; the template sizes the string.
    fastToUTF8Copy(c, result, 0, false)
  proc add*(s: var string; c: Rune) =
    ## Appends the UTF-8 representation of ``c`` to ``s``.
    # The current length must be captured in a local first: the template
    # resizes ``s`` before it writes at the given position.
    let tail = s.len
    fastToUTF8Copy(c, s, tail, false)
  proc `$`*(rune: Rune): string =
    ## Stringifies a single Rune: the result is its UTF-8 representation.
    result = toUTF8(rune)
  proc `$`*(runes: seq[Rune]): string =
    ## Stringifies a sequence of Runes by concatenating the UTF-8
    ## representation of every element, in order.
    result = ""
    for idx in 0 ..< runes.len:
      result.add(runes[idx])
  proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
    ## Returns the byte position of the unicode character at position ``pos``
    ## in ``s``, counting characters from the byte position ``start``
    ## (default ``0``).
    ##
    ## Returns the special value ``-1`` if it runs out of the string, i.e.
    ## when fewer than ``pos + 1`` characters begin at or after ``start``.
    ## With ``pos == 0`` the value of ``start`` is returned unchanged.
    ##
    ## Beware: This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    var
      i = 0
      o = start
    while i < pos:
      # Never read a lead byte at or beyond the end of ``s`` (empty string,
      # or ``start`` at/after the end): that already means "ran out".
      if o >= s.len:
        return -1
      o += runeLenAt(s, o)
      if o >= s.len:
        return -1
      inc i
    return o
  proc runeAtPos*(s: string, pos: int): Rune =
    ## Returns the unicode character at position ``pos`` (counted in
    ## characters, not bytes).
    ##
    ## Beware: This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    # ``fastRuneAt`` is a template and repeats its index argument at every
    # use, so the linear ``runeOffset`` scan is done once up front.
    let offset = runeOffset(s, pos)
    fastRuneAt(s, offset, result, false)
  proc runeStrAtPos*(s: string, pos: Natural): string =
    ## Returns the unicode character at position ``pos`` as a UTF-8 string,
    ## i.e. the slice of ``s`` that holds its encoding.
    ##
    ## Beware: This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    let first = runeOffset(s, pos)
    let last = first + runeLenAt(s, first) - 1
    result = s[first .. last]
  proc runeReverseOffset*(s: string, rev:Positive): (int, int) =
    ## Returns a tuple with the byte offset of the unicode character at
    ## position ``rev`` in ``s``, counting from the end (starting with 1),
    ## and the total number of runes in the string.
    ##
    ## If there are too few runes in the string to satisfy the request, the
    ## offset is negative: it is the number of missing runes, negated.
    ##
    ## Beware: This can lead to unoptimized code and slow execution!
    ## Most problems can be solved more efficiently by using an iterator
    ## or conversion to a seq of Rune.
    let
      wanted = rev.int
      total = runeLen(s)
    if wanted > total:
      # Not enough runes: report the shortfall as a negative offset.
      return (total - wanted, total)
    # The requested rune is preceded by exactly ``total - wanted`` runes;
    # the sum of their byte lengths is the offset we are looking for.
    var
      offset = 0
      skipped = 0
    while skipped < total - wanted:
      inc(offset, runeLenAt(s, offset))
      inc(skipped)
    result = (offset, total)
  proc runeSubStr*(s: string, pos:int, len:int = int.high): string =
    ## Returns the UTF-8 substring of ``s`` that starts at codepoint ``pos``
    ## and spans ``len`` codepoints. A negative ``pos`` or ``len`` counts
    ## from the end of the string. When ``len`` is omitted the longest
    ## possible substring is returned.
    ##
    ## (Needs some examples)
    if pos >= 0:
      # Start position counted from the front.
      let first = runeOffset(s, pos)
      if first < 0:
        result = ""
      elif len == int.high:
        result = s.substr(first, s.len-1)
      elif len < 0:
        # End position counted from the back; the rune count is not needed.
        let stop = runeReverseOffset(s, -len)[0]
        if stop <= 0:
          result = ""
        else:
          result = s.substr(first, stop-1)
      else:
        var stop = runeOffset(s, len, first)
        if stop < 0:
          stop = s.len
        result = s.substr(first, stop-1)
    else:
      # Start position counted from the back.
      let (first, runeCount) = runeReverseOffset(s, -pos)
      if len >= runeCount:
        result = s.substr(first, s.len-1)
      elif len < 0:
        let endRune = runeCount + len
        if endRune < 0:
          result = ""
        else:
          result = s.substr(first,
                            runeOffset(s, endRune-(runeCount+pos), first)-1)
      else:
        result = s.substr(first, runeOffset(s, len, first)-1)
  303. const
  304. alphaRanges = [
  305. 0x00d8, 0x00f6, # -
  306. 0x00f8, 0x01f5, # -
  307. 0x0250, 0x02a8, # -
  308. 0x038e, 0x03a1, # -
  309. 0x03a3, 0x03ce, # -
  310. 0x03d0, 0x03d6, # -
  311. 0x03e2, 0x03f3, # -
  312. 0x0490, 0x04c4, # -
  313. 0x0561, 0x0587, # -
  314. 0x05d0, 0x05ea, # -
  315. 0x05f0, 0x05f2, # -
  316. 0x0621, 0x063a, # -
  317. 0x0640, 0x064a, # -
  318. 0x0671, 0x06b7, # -
  319. 0x06ba, 0x06be, # -
  320. 0x06c0, 0x06ce, # -
  321. 0x06d0, 0x06d3, # -
  322. 0x0905, 0x0939, # -
  323. 0x0958, 0x0961, # -
  324. 0x0985, 0x098c, # -
  325. 0x098f, 0x0990, # -
  326. 0x0993, 0x09a8, # -
  327. 0x09aa, 0x09b0, # -
  328. 0x09b6, 0x09b9, # -
  329. 0x09dc, 0x09dd, # -
  330. 0x09df, 0x09e1, # -
  331. 0x09f0, 0x09f1, # -
  332. 0x0a05, 0x0a0a, # -
  333. 0x0a0f, 0x0a10, # -
  334. 0x0a13, 0x0a28, # -
  335. 0x0a2a, 0x0a30, # -
  336. 0x0a32, 0x0a33, # -
  337. 0x0a35, 0x0a36, # -
  338. 0x0a38, 0x0a39, # -
  339. 0x0a59, 0x0a5c, # -
  340. 0x0a85, 0x0a8b, # -
  341. 0x0a8f, 0x0a91, # -
  342. 0x0a93, 0x0aa8, # -
  343. 0x0aaa, 0x0ab0, # -
  344. 0x0ab2, 0x0ab3, # -
  345. 0x0ab5, 0x0ab9, # -
  346. 0x0b05, 0x0b0c, # -
  347. 0x0b0f, 0x0b10, # -
  348. 0x0b13, 0x0b28, # -
  349. 0x0b2a, 0x0b30, # -
  350. 0x0b32, 0x0b33, # -
  351. 0x0b36, 0x0b39, # -
  352. 0x0b5c, 0x0b5d, # -
  353. 0x0b5f, 0x0b61, # -
  354. 0x0b85, 0x0b8a, # -
  355. 0x0b8e, 0x0b90, # -
  356. 0x0b92, 0x0b95, # -
  357. 0x0b99, 0x0b9a, # -
  358. 0x0b9e, 0x0b9f, # -
  359. 0x0ba3, 0x0ba4, # -
  360. 0x0ba8, 0x0baa, # -
  361. 0x0bae, 0x0bb5, # -
  362. 0x0bb7, 0x0bb9, # -
  363. 0x0c05, 0x0c0c, # -
  364. 0x0c0e, 0x0c10, # -
  365. 0x0c12, 0x0c28, # -
  366. 0x0c2a, 0x0c33, # -
  367. 0x0c35, 0x0c39, # -
  368. 0x0c60, 0x0c61, # -
  369. 0x0c85, 0x0c8c, # -
  370. 0x0c8e, 0x0c90, # -
  371. 0x0c92, 0x0ca8, # -
  372. 0x0caa, 0x0cb3, # -
  373. 0x0cb5, 0x0cb9, # -
  374. 0x0ce0, 0x0ce1, # -
  375. 0x0d05, 0x0d0c, # -
  376. 0x0d0e, 0x0d10, # -
  377. 0x0d12, 0x0d28, # -
  378. 0x0d2a, 0x0d39, # -
  379. 0x0d60, 0x0d61, # -
  380. 0x0e01, 0x0e30, # -
  381. 0x0e32, 0x0e33, # -
  382. 0x0e40, 0x0e46, # -
  383. 0x0e5a, 0x0e5b, # -
  384. 0x0e81, 0x0e82, # -
  385. 0x0e87, 0x0e88, # -
  386. 0x0e94, 0x0e97, # -
  387. 0x0e99, 0x0e9f, # -
  388. 0x0ea1, 0x0ea3, # -
  389. 0x0eaa, 0x0eab, # -
  390. 0x0ead, 0x0eae, # -
  391. 0x0eb2, 0x0eb3, # -
  392. 0x0ec0, 0x0ec4, # -
  393. 0x0edc, 0x0edd, # -
  394. 0x0f18, 0x0f19, # -
  395. 0x0f40, 0x0f47, # -
  396. 0x0f49, 0x0f69, # -
  397. 0x10d0, 0x10f6, # -
  398. 0x1100, 0x1159, # -
  399. 0x115f, 0x11a2, # -
  400. 0x11a8, 0x11f9, # -
  401. 0x1e00, 0x1e9b, # -
  402. 0x1f50, 0x1f57, # -
  403. 0x1f80, 0x1fb4, # -
  404. 0x1fb6, 0x1fbc, # -
  405. 0x1fc2, 0x1fc4, # -
  406. 0x1fc6, 0x1fcc, # -
  407. 0x1fd0, 0x1fd3, # -
  408. 0x1fd6, 0x1fdb, # -
  409. 0x1fe0, 0x1fec, # -
  410. 0x1ff2, 0x1ff4, # -
  411. 0x1ff6, 0x1ffc, # -
  412. 0x210a, 0x2113, # -
  413. 0x2115, 0x211d, # -
  414. 0x2120, 0x2122, # -
  415. 0x212a, 0x2131, # -
  416. 0x2133, 0x2138, # -
  417. 0x3041, 0x3094, # -
  418. 0x30a1, 0x30fa, # -
  419. 0x3105, 0x312c, # -
  420. 0x3131, 0x318e, # -
  421. 0x3192, 0x319f, # -
  422. 0x3260, 0x327b, # -
  423. 0x328a, 0x32b0, # -
  424. 0x32d0, 0x32fe, # -
  425. 0x3300, 0x3357, # -
  426. 0x3371, 0x3376, # -
  427. 0x337b, 0x3394, # -
  428. 0x3399, 0x339e, # -
  429. 0x33a9, 0x33ad, # -
  430. 0x33b0, 0x33c1, # -
  431. 0x33c3, 0x33c5, # -
  432. 0x33c7, 0x33d7, # -
  433. 0x33d9, 0x33dd, # -
  434. 0x4e00, 0x9fff, # -
  435. 0xac00, 0xd7a3, # -
  436. 0xf900, 0xfb06, # -
  437. 0xfb13, 0xfb17, # -
  438. 0xfb1f, 0xfb28, # -
  439. 0xfb2a, 0xfb36, # -
  440. 0xfb38, 0xfb3c, # -
  441. 0xfb40, 0xfb41, # -
  442. 0xfb43, 0xfb44, # -
  443. 0xfb46, 0xfbb1, # -
  444. 0xfbd3, 0xfd3d, # -
  445. 0xfd50, 0xfd8f, # -
  446. 0xfd92, 0xfdc7, # -
  447. 0xfdf0, 0xfdf9, # -
  448. 0xfe70, 0xfe72, # -
  449. 0xfe76, 0xfefc, # -
  450. 0xff66, 0xff6f, # -
  451. 0xff71, 0xff9d, # -
  452. 0xffa0, 0xffbe, # -
  453. 0xffc2, 0xffc7, # -
  454. 0xffca, 0xffcf, # -
  455. 0xffd2, 0xffd7, # -
  456. 0xffda, 0xffdc] # -
  457. alphaSinglets = [
  458. 0x00aa, #
  459. 0x00b5, #
  460. 0x00ba, #
  461. 0x03da, #
  462. 0x03dc, #
  463. 0x03de, #
  464. 0x03e0, #
  465. 0x06d5, #
  466. 0x09b2, #
  467. 0x0a5e, #
  468. 0x0a8d, #
  469. 0x0ae0, #
  470. 0x0b9c, #
  471. 0x0cde, #
  472. 0x0e4f, #
  473. 0x0e84, #
  474. 0x0e8a, #
  475. 0x0e8d, #
  476. 0x0ea5, #
  477. 0x0ea7, #
  478. 0x0eb0, #
  479. 0x0ebd, #
  480. 0x1fbe, #
  481. 0x207f, #
  482. 0x20a8, #
  483. 0x2102, #
  484. 0x2107, #
  485. 0x2124, #
  486. 0x2126, #
  487. 0x2128, #
  488. 0xfb3e, #
  489. 0xfe74] #
  490. spaceRanges = [
  491. 0x0009, 0x000d, # tab and newline
  492. 0x0020, 0x0020, # space
  493. 0x0085, 0x0085, # next line
  494. 0x00a0, 0x00a0, #
  495. 0x1680, 0x1680, # Ogham space mark
  496. 0x2000, 0x200b, # en dash .. zero-width space
  497. 0x200e, 0x200f, # LTR mark .. RTL mark (pattern whitespace)
  498. 0x2028, 0x2029, # - 0x3000, 0x3000, #
  499. 0x202f, 0x202f, # narrow no-break space
  500. 0x205f, 0x205f, # medium mathematical space
  501. 0x3000, 0x3000, # ideographic space
  502. 0xfeff, 0xfeff] #
  503. unicodeSpaces = [
  504. Rune 0x0009, # tab
  505. Rune 0x000a, # LF
  506. Rune 0x000d, # CR
  507. Rune 0x0020, # space
  508. Rune 0x0085, # next line
  509. Rune 0x00a0, # unknown
  510. Rune 0x1680, # Ogham space mark
  511. Rune 0x2000, # en dash .. zero-width space
  512. Rune 0x200e, Rune 0x200f, # LTR mark .. RTL mark (pattern whitespace)
  513. Rune 0x2028, Rune 0x2029, # - 0x3000, 0x3000, #
  514. Rune 0x202f, # narrow no-break space
  515. Rune 0x205f, # medium mathematical space
  516. Rune 0x3000, # ideographic space
  517. Rune 0xfeff] # unknown
  518. toupperRanges = [
  519. 0x0061, 0x007a, 468, # a-z A-Z
  520. 0x00e0, 0x00f6, 468, # - -
  521. 0x00f8, 0x00fe, 468, # - -
  522. 0x0256, 0x0257, 295, # - -
  523. 0x0258, 0x0259, 298, # - -
  524. 0x028a, 0x028b, 283, # - -
  525. 0x03ad, 0x03af, 463, # - -
  526. 0x03b1, 0x03c1, 468, # - -
  527. 0x03c3, 0x03cb, 468, # - -
  528. 0x03cd, 0x03ce, 437, # - -
  529. 0x0430, 0x044f, 468, # - -
  530. 0x0451, 0x045c, 420, # - -
  531. 0x045e, 0x045f, 420, # - -
  532. 0x0561, 0x0586, 452, # - -
  533. 0x1f00, 0x1f07, 508, # - -
  534. 0x1f10, 0x1f15, 508, # - -
  535. 0x1f20, 0x1f27, 508, # - -
  536. 0x1f30, 0x1f37, 508, # - -
  537. 0x1f40, 0x1f45, 508, # - -
  538. 0x1f60, 0x1f67, 508, # - -
  539. 0x1f70, 0x1f71, 574, # - -
  540. 0x1f72, 0x1f75, 586, # - -
  541. 0x1f76, 0x1f77, 600, # - -
  542. 0x1f78, 0x1f79, 628, # - -
  543. 0x1f7a, 0x1f7b, 612, # - -
  544. 0x1f7c, 0x1f7d, 626, # - -
  545. 0x1f80, 0x1f87, 508, # - -
  546. 0x1f90, 0x1f97, 508, # - -
  547. 0x1fa0, 0x1fa7, 508, # - -
  548. 0x1fb0, 0x1fb1, 508, # - -
  549. 0x1fd0, 0x1fd1, 508, # - -
  550. 0x1fe0, 0x1fe1, 508, # - -
  551. 0x2170, 0x217f, 484, # - -
  552. 0x24d0, 0x24e9, 474, # - -
  553. 0xff41, 0xff5a, 468] # - -
  554. toupperSinglets = [
  555. 0x00ff, 621, #
  556. 0x0101, 499, #
  557. 0x0103, 499, #
  558. 0x0105, 499, #
  559. 0x0107, 499, #
  560. 0x0109, 499, #
  561. 0x010b, 499, #
  562. 0x010d, 499, #
  563. 0x010f, 499, #
  564. 0x0111, 499, #
  565. 0x0113, 499, #
  566. 0x0115, 499, #
  567. 0x0117, 499, #
  568. 0x0119, 499, #
  569. 0x011b, 499, #
  570. 0x011d, 499, #
  571. 0x011f, 499, #
  572. 0x0121, 499, #
  573. 0x0123, 499, #
  574. 0x0125, 499, #
  575. 0x0127, 499, #
  576. 0x0129, 499, #
  577. 0x012b, 499, #
  578. 0x012d, 499, #
  579. 0x012f, 499, #
  580. 0x0131, 268, # I
  581. 0x0133, 499, #
  582. 0x0135, 499, #
  583. 0x0137, 499, #
  584. 0x013a, 499, #
  585. 0x013c, 499, #
  586. 0x013e, 499, #
  587. 0x0140, 499, #
  588. 0x0142, 499, #
  589. 0x0144, 499, #
  590. 0x0146, 499, #
  591. 0x0148, 499, #
  592. 0x014b, 499, #
  593. 0x014d, 499, #
  594. 0x014f, 499, #
  595. 0x0151, 499, #
  596. 0x0153, 499, #
  597. 0x0155, 499, #
  598. 0x0157, 499, #
  599. 0x0159, 499, #
  600. 0x015b, 499, #
  601. 0x015d, 499, #
  602. 0x015f, 499, #
  603. 0x0161, 499, #
  604. 0x0163, 499, #
  605. 0x0165, 499, #
  606. 0x0167, 499, #
  607. 0x0169, 499, #
  608. 0x016b, 499, #
  609. 0x016d, 499, #
  610. 0x016f, 499, #
  611. 0x0171, 499, #
  612. 0x0173, 499, #
  613. 0x0175, 499, #
  614. 0x0177, 499, #
  615. 0x017a, 499, #
  616. 0x017c, 499, #
  617. 0x017e, 499, #
  618. 0x017f, 200, # S
  619. 0x0183, 499, #
  620. 0x0185, 499, #
  621. 0x0188, 499, #
  622. 0x018c, 499, #
  623. 0x0192, 499, #
  624. 0x0199, 499, #
  625. 0x01a1, 499, #
  626. 0x01a3, 499, #
  627. 0x01a5, 499, #
  628. 0x01a8, 499, #
  629. 0x01ad, 499, #
  630. 0x01b0, 499, #
  631. 0x01b4, 499, #
  632. 0x01b6, 499, #
  633. 0x01b9, 499, #
  634. 0x01bd, 499, #
  635. 0x01c5, 499, #
  636. 0x01c6, 498, #
  637. 0x01c8, 499, #
  638. 0x01c9, 498, #
  639. 0x01cb, 499, #
  640. 0x01cc, 498, #
  641. 0x01ce, 499, #
  642. 0x01d0, 499, #
  643. 0x01d2, 499, #
  644. 0x01d4, 499, #
  645. 0x01d6, 499, #
  646. 0x01d8, 499, #
  647. 0x01da, 499, #
  648. 0x01dc, 499, #
  649. 0x01df, 499, #
  650. 0x01e1, 499, #
  651. 0x01e3, 499, #
  652. 0x01e5, 499, #
  653. 0x01e7, 499, #
  654. 0x01e9, 499, #
  655. 0x01eb, 499, #
  656. 0x01ed, 499, #
  657. 0x01ef, 499, #
  658. 0x01f2, 499, #
  659. 0x01f3, 498, #
  660. 0x01f5, 499, #
  661. 0x01fb, 499, #
  662. 0x01fd, 499, #
  663. 0x01ff, 499, #
  664. 0x0201, 499, #
  665. 0x0203, 499, #
  666. 0x0205, 499, #
  667. 0x0207, 499, #
  668. 0x0209, 499, #
  669. 0x020b, 499, #
  670. 0x020d, 499, #
  671. 0x020f, 499, #
  672. 0x0211, 499, #
  673. 0x0213, 499, #
  674. 0x0215, 499, #
  675. 0x0217, 499, #
  676. 0x0253, 290, #
  677. 0x0254, 294, #
  678. 0x025b, 297, #
  679. 0x0260, 295, #
  680. 0x0263, 293, #
  681. 0x0268, 291, #
  682. 0x0269, 289, #
  683. 0x026f, 289, #
  684. 0x0272, 287, #
  685. 0x0283, 282, #
  686. 0x0288, 282, #
  687. 0x0292, 281, #
  688. 0x03ac, 462, #
  689. 0x03cc, 436, #
  690. 0x03d0, 438, #
  691. 0x03d1, 443, #
  692. 0x03d5, 453, #
  693. 0x03d6, 446, #
  694. 0x03e3, 499, #
  695. 0x03e5, 499, #
  696. 0x03e7, 499, #
  697. 0x03e9, 499, #
  698. 0x03eb, 499, #
  699. 0x03ed, 499, #
  700. 0x03ef, 499, #
  701. 0x03f0, 414, #
  702. 0x03f1, 420, #
  703. 0x0461, 499, #
  704. 0x0463, 499, #
  705. 0x0465, 499, #
  706. 0x0467, 499, #
  707. 0x0469, 499, #
  708. 0x046b, 499, #
  709. 0x046d, 499, #
  710. 0x046f, 499, #
  711. 0x0471, 499, #
  712. 0x0473, 499, #
  713. 0x0475, 499, #
  714. 0x0477, 499, #
  715. 0x0479, 499, #
  716. 0x047b, 499, #
  717. 0x047d, 499, #
  718. 0x047f, 499, #
  719. 0x0481, 499, #
  720. 0x0491, 499, #
  721. 0x0493, 499, #
  722. 0x0495, 499, #
  723. 0x0497, 499, #
  724. 0x0499, 499, #
  725. 0x049b, 499, #
  726. 0x049d, 499, #
  727. 0x049f, 499, #
  728. 0x04a1, 499, #
  729. 0x04a3, 499, #
  730. 0x04a5, 499, #
  731. 0x04a7, 499, #
  732. 0x04a9, 499, #
  733. 0x04ab, 499, #
  734. 0x04ad, 499, #
  735. 0x04af, 499, #
  736. 0x04b1, 499, #
  737. 0x04b3, 499, #
  738. 0x04b5, 499, #
  739. 0x04b7, 499, #
  740. 0x04b9, 499, #
  741. 0x04bb, 499, #
  742. 0x04bd, 499, #
  743. 0x04bf, 499, #
  744. 0x04c2, 499, #
  745. 0x04c4, 499, #
  746. 0x04c8, 499, #
  747. 0x04cc, 499, #
  748. 0x04d1, 499, #
  749. 0x04d3, 499, #
  750. 0x04d5, 499, #
  751. 0x04d7, 499, #
  752. 0x04d9, 499, #
  753. 0x04db, 499, #
  754. 0x04dd, 499, #
  755. 0x04df, 499, #
  756. 0x04e1, 499, #
  757. 0x04e3, 499, #
  758. 0x04e5, 499, #
  759. 0x04e7, 499, #
  760. 0x04e9, 499, #
  761. 0x04eb, 499, #
  762. 0x04ef, 499, #
  763. 0x04f1, 499, #
  764. 0x04f3, 499, #
  765. 0x04f5, 499, #
  766. 0x04f9, 499, #
  767. 0x1e01, 499, #
  768. 0x1e03, 499, #
  769. 0x1e05, 499, #
  770. 0x1e07, 499, #
  771. 0x1e09, 499, #
  772. 0x1e0b, 499, #
  773. 0x1e0d, 499, #
  774. 0x1e0f, 499, #
  775. 0x1e11, 499, #
  776. 0x1e13, 499, #
  777. 0x1e15, 499, #
  778. 0x1e17, 499, #
  779. 0x1e19, 499, #
  780. 0x1e1b, 499, #
  781. 0x1e1d, 499, #
  782. 0x1e1f, 499, #
  783. 0x1e21, 499, #
  784. 0x1e23, 499, #
  785. 0x1e25, 499, #
  786. 0x1e27, 499, #
  787. 0x1e29, 499, #
  788. 0x1e2b, 499, #
  789. 0x1e2d, 499, #
  790. 0x1e2f, 499, #
  791. 0x1e31, 499, #
  792. 0x1e33, 499, #
  793. 0x1e35, 499, #
  794. 0x1e37, 499, #
  795. 0x1e39, 499, #
  796. 0x1e3b, 499, #
  797. 0x1e3d, 499, #
  798. 0x1e3f, 499, #
  799. 0x1e41, 499, #
  800. 0x1e43, 499, #
  801. 0x1e45, 499, #
  802. 0x1e47, 499, #
  803. 0x1e49, 499, #
  804. 0x1e4b, 499, #
  805. 0x1e4d, 499, #
  806. 0x1e4f, 499, #
  807. 0x1e51, 499, #
  808. 0x1e53, 499, #
  809. 0x1e55, 499, #
  810. 0x1e57, 499, #
  811. 0x1e59, 499, #
  812. 0x1e5b, 499, #
  813. 0x1e5d, 499, #
  814. 0x1e5f, 499, #
  815. 0x1e61, 499, #
  816. 0x1e63, 499, #
  817. 0x1e65, 499, #
  818. 0x1e67, 499, #
  819. 0x1e69, 499, #
  820. 0x1e6b, 499, #
  821. 0x1e6d, 499, #
  822. 0x1e6f, 499, #
  823. 0x1e71, 499, #
  824. 0x1e73, 499, #
  825. 0x1e75, 499, #
  826. 0x1e77, 499, #
  827. 0x1e79, 499, #
  828. 0x1e7b, 499, #
  829. 0x1e7d, 499, #
  830. 0x1e7f, 499, #
  831. 0x1e81, 499, #
  832. 0x1e83, 499, #
  833. 0x1e85, 499, #
  834. 0x1e87, 499, #
  835. 0x1e89, 499, #
  836. 0x1e8b, 499, #
  837. 0x1e8d, 499, #
  838. 0x1e8f, 499, #
  839. 0x1e91, 499, #
  840. 0x1e93, 499, #
  841. 0x1e95, 499, #
  842. 0x1ea1, 499, #
  843. 0x1ea3, 499, #
  844. 0x1ea5, 499, #
  845. 0x1ea7, 499, #
  846. 0x1ea9, 499, #
  847. 0x1eab, 499, #
  848. 0x1ead, 499, #
  849. 0x1eaf, 499, #
  850. 0x1eb1, 499, #
  851. 0x1eb3, 499, #
  852. 0x1eb5, 499, #
  853. 0x1eb7, 499, #
  854. 0x1eb9, 499, #
  855. 0x1ebb, 499, #
  856. 0x1ebd, 499, #
  857. 0x1ebf, 499, #
  858. 0x1ec1, 499, #
  859. 0x1ec3, 499, #
  860. 0x1ec5, 499, #
  861. 0x1ec7, 499, #
  862. 0x1ec9, 499, #
  863. 0x1ecb, 499, #
  864. 0x1ecd, 499, #
  865. 0x1ecf, 499, #
  866. 0x1ed1, 499, #
  867. 0x1ed3, 499, #
  868. 0x1ed5, 499, #
  869. 0x1ed7, 499, #
  870. 0x1ed9, 499, #
  871. 0x1edb, 499, #
  872. 0x1edd, 499, #
  873. 0x1edf, 499, #
  874. 0x1ee1, 499, #
  875. 0x1ee3, 499, #
  876. 0x1ee5, 499, #
  877. 0x1ee7, 499, #
  878. 0x1ee9, 499, #
  879. 0x1eeb, 499, #
  880. 0x1eed, 499, #
  881. 0x1eef, 499, #
  882. 0x1ef1, 499, #
  883. 0x1ef3, 499, #
  884. 0x1ef5, 499, #
  885. 0x1ef7, 499, #
  886. 0x1ef9, 499, #
  887. 0x1f51, 508, #
  888. 0x1f53, 508, #
  889. 0x1f55, 508, #
  890. 0x1f57, 508, #
  891. 0x1fb3, 509, #
  892. 0x1fc3, 509, #
  893. 0x1fe5, 507, #
  894. 0x1ff3, 509] #
  # Upper-case ranges consulted by ``toLower`` and ``isUpper``.
  # Entries are triples ``first, last, delta + 500``: every rune in
  # ``first .. last`` is lower-cased by adding ``delta`` (the stored value
  # minus the bias of 500). Triples are kept in ascending order of ``first``
  # so that ``binarySearch`` can be used with a stride of 3.
  tolowerRanges = [
    0x0041, 0x005a, 532, # A-Z a-z
    0x00c0, 0x00d6, 532,
    0x00d8, 0x00de, 532,
    0x0189, 0x018a, 705,
    0x018e, 0x018f, 702,
    0x01b1, 0x01b2, 717,
    0x0388, 0x038a, 537,
    0x038e, 0x038f, 563,
    0x0391, 0x03a1, 532,
    0x03a3, 0x03ab, 532,
    0x0401, 0x040c, 580,
    0x040e, 0x040f, 580,
    0x0410, 0x042f, 532,
    0x0531, 0x0556, 548,
    0x10a0, 0x10c5, 548,
    0x1f08, 0x1f0f, 492,
    0x1f18, 0x1f1d, 492,
    0x1f28, 0x1f2f, 492,
    0x1f38, 0x1f3f, 492,
    0x1f48, 0x1f4d, 492,
    0x1f68, 0x1f6f, 492,
    0x1f88, 0x1f8f, 492,
    0x1f98, 0x1f9f, 492,
    0x1fa8, 0x1faf, 492,
    0x1fb8, 0x1fb9, 492,
    0x1fba, 0x1fbb, 426,
    0x1fc8, 0x1fcb, 414,
    0x1fd8, 0x1fd9, 492,
    0x1fda, 0x1fdb, 400,
    0x1fe8, 0x1fe9, 492,
    0x1fea, 0x1feb, 388,
    0x1ff8, 0x1ff9, 372,
    0x1ffa, 0x1ffb, 374,
    0x2160, 0x216f, 516,
    0x24b6, 0x24cf, 526,
    0xff21, 0xff3a, 532]
  # Individual upper-case runes consulted by ``toLower`` and ``isUpper``.
  # Entries are pairs ``rune, delta + 500``: the rune is lower-cased by adding
  # ``delta`` (the stored value minus the bias of 500). Pairs are kept in
  # ascending order of ``rune`` so that ``binarySearch`` can be used with a
  # stride of 2. Four pairs are laid out per line.
  tolowerSinglets = [
    0x0100, 501, 0x0102, 501, 0x0104, 501, 0x0106, 501,
    0x0108, 501, 0x010a, 501, 0x010c, 501, 0x010e, 501,
    0x0110, 501, 0x0112, 501, 0x0114, 501, 0x0116, 501,
    0x0118, 501, 0x011a, 501, 0x011c, 501, 0x011e, 501,
    0x0120, 501, 0x0122, 501, 0x0124, 501, 0x0126, 501,
    0x0128, 501, 0x012a, 501, 0x012c, 501, 0x012e, 501,
    0x0130, 301, 0x0132, 501, 0x0134, 501, 0x0136, 501,
    0x0139, 501, 0x013b, 501, 0x013d, 501, 0x013f, 501,
    0x0141, 501, 0x0143, 501, 0x0145, 501, 0x0147, 501,
    0x014a, 501, 0x014c, 501, 0x014e, 501, 0x0150, 501,
    0x0152, 501, 0x0154, 501, 0x0156, 501, 0x0158, 501,
    0x015a, 501, 0x015c, 501, 0x015e, 501, 0x0160, 501,
    0x0162, 501, 0x0164, 501, 0x0166, 501, 0x0168, 501,
    0x016a, 501, 0x016c, 501, 0x016e, 501, 0x0170, 501,
    0x0172, 501, 0x0174, 501, 0x0176, 501, 0x0178, 379,
    0x0179, 501, 0x017b, 501, 0x017d, 501, 0x0181, 710,
    0x0182, 501, 0x0184, 501, 0x0186, 706, 0x0187, 501,
    0x018b, 501, 0x0190, 703, 0x0191, 501, 0x0193, 705,
    0x0194, 707, 0x0196, 711, 0x0197, 709, 0x0198, 501,
    0x019c, 711, 0x019d, 713, 0x01a0, 501, 0x01a2, 501,
    0x01a4, 501, 0x01a7, 501, 0x01a9, 718, 0x01ac, 501,
    0x01ae, 718, 0x01af, 501, 0x01b3, 501, 0x01b5, 501,
    0x01b7, 719, 0x01b8, 501, 0x01bc, 501, 0x01c4, 502,
    0x01c5, 501, 0x01c7, 502, 0x01c8, 501, 0x01ca, 502,
    0x01cb, 501, 0x01cd, 501, 0x01cf, 501, 0x01d1, 501,
    0x01d3, 501, 0x01d5, 501, 0x01d7, 501, 0x01d9, 501,
    0x01db, 501, 0x01de, 501, 0x01e0, 501, 0x01e2, 501,
    0x01e4, 501, 0x01e6, 501, 0x01e8, 501, 0x01ea, 501,
    0x01ec, 501, 0x01ee, 501, 0x01f1, 502, 0x01f2, 501,
    0x01f4, 501, 0x01fa, 501, 0x01fc, 501, 0x01fe, 501,
    0x0200, 501, 0x0202, 501, 0x0204, 501, 0x0206, 501,
    0x0208, 501, 0x020a, 501, 0x020c, 501, 0x020e, 501,
    0x0210, 501, 0x0212, 501, 0x0214, 501, 0x0216, 501,
    0x0386, 538, 0x038c, 564, 0x03e2, 501, 0x03e4, 501,
    0x03e6, 501, 0x03e8, 501, 0x03ea, 501, 0x03ec, 501,
    0x03ee, 501, 0x0460, 501, 0x0462, 501, 0x0464, 501,
    0x0466, 501, 0x0468, 501, 0x046a, 501, 0x046c, 501,
    0x046e, 501, 0x0470, 501, 0x0472, 501, 0x0474, 501,
    0x0476, 501, 0x0478, 501, 0x047a, 501, 0x047c, 501,
    0x047e, 501, 0x0480, 501, 0x0490, 501, 0x0492, 501,
    0x0494, 501, 0x0496, 501, 0x0498, 501, 0x049a, 501,
    0x049c, 501, 0x049e, 501, 0x04a0, 501, 0x04a2, 501,
    0x04a4, 501, 0x04a6, 501, 0x04a8, 501, 0x04aa, 501,
    0x04ac, 501, 0x04ae, 501, 0x04b0, 501, 0x04b2, 501,
    0x04b4, 501, 0x04b6, 501, 0x04b8, 501, 0x04ba, 501,
    0x04bc, 501, 0x04be, 501, 0x04c1, 501, 0x04c3, 501,
    0x04c7, 501, 0x04cb, 501, 0x04d0, 501, 0x04d2, 501,
    0x04d4, 501, 0x04d6, 501, 0x04d8, 501, 0x04da, 501,
    0x04dc, 501, 0x04de, 501, 0x04e0, 501, 0x04e2, 501,
    0x04e4, 501, 0x04e6, 501, 0x04e8, 501, 0x04ea, 501,
    0x04ee, 501, 0x04f0, 501, 0x04f2, 501, 0x04f4, 501,
    0x04f8, 501, 0x1e00, 501, 0x1e02, 501, 0x1e04, 501,
    0x1e06, 501, 0x1e08, 501, 0x1e0a, 501, 0x1e0c, 501,
    0x1e0e, 501, 0x1e10, 501, 0x1e12, 501, 0x1e14, 501,
    0x1e16, 501, 0x1e18, 501, 0x1e1a, 501, 0x1e1c, 501,
    0x1e1e, 501, 0x1e20, 501, 0x1e22, 501, 0x1e24, 501,
    0x1e26, 501, 0x1e28, 501, 0x1e2a, 501, 0x1e2c, 501,
    0x1e2e, 501, 0x1e30, 501, 0x1e32, 501, 0x1e34, 501,
    0x1e36, 501, 0x1e38, 501, 0x1e3a, 501, 0x1e3c, 501,
    0x1e3e, 501, 0x1e40, 501, 0x1e42, 501, 0x1e44, 501,
    0x1e46, 501, 0x1e48, 501, 0x1e4a, 501, 0x1e4c, 501,
    0x1e4e, 501, 0x1e50, 501, 0x1e52, 501, 0x1e54, 501,
    0x1e56, 501, 0x1e58, 501, 0x1e5a, 501, 0x1e5c, 501,
    0x1e5e, 501, 0x1e60, 501, 0x1e62, 501, 0x1e64, 501,
    0x1e66, 501, 0x1e68, 501, 0x1e6a, 501, 0x1e6c, 501,
    0x1e6e, 501, 0x1e70, 501, 0x1e72, 501, 0x1e74, 501,
    0x1e76, 501, 0x1e78, 501, 0x1e7a, 501, 0x1e7c, 501,
    0x1e7e, 501, 0x1e80, 501, 0x1e82, 501, 0x1e84, 501,
    0x1e86, 501, 0x1e88, 501, 0x1e8a, 501, 0x1e8c, 501,
    0x1e8e, 501, 0x1e90, 501, 0x1e92, 501, 0x1e94, 501,
    0x1ea0, 501, 0x1ea2, 501, 0x1ea4, 501, 0x1ea6, 501,
    0x1ea8, 501, 0x1eaa, 501, 0x1eac, 501, 0x1eae, 501,
    0x1eb0, 501, 0x1eb2, 501, 0x1eb4, 501, 0x1eb6, 501,
    0x1eb8, 501, 0x1eba, 501, 0x1ebc, 501, 0x1ebe, 501,
    0x1ec0, 501, 0x1ec2, 501, 0x1ec4, 501, 0x1ec6, 501,
    0x1ec8, 501, 0x1eca, 501, 0x1ecc, 501, 0x1ece, 501,
    0x1ed0, 501, 0x1ed2, 501, 0x1ed4, 501, 0x1ed6, 501,
    0x1ed8, 501, 0x1eda, 501, 0x1edc, 501, 0x1ede, 501,
    0x1ee0, 501, 0x1ee2, 501, 0x1ee4, 501, 0x1ee6, 501,
    0x1ee8, 501, 0x1eea, 501, 0x1eec, 501, 0x1eee, 501,
    0x1ef0, 501, 0x1ef2, 501, 0x1ef4, 501, 0x1ef6, 501,
    0x1ef8, 501, 0x1f59, 492, 0x1f5b, 492, 0x1f5d, 492,
    0x1f5f, 492, 0x1fbc, 491, 0x1fcc, 491, 0x1fec, 493,
    0x1ffc, 491]
  # Runes with a dedicated title-case form, consulted by ``toTitle``.
  # Entries are pairs ``rune, delta + 500``: the title-case rune is obtained
  # by adding ``delta`` (the stored value minus the bias of 500). Pairs are
  # kept in ascending order of ``rune`` for ``binarySearch`` with stride 2.
  toTitleSinglets = [
    0x01c4, 501, 0x01c6, 499,
    0x01c7, 501, 0x01c9, 499,
    0x01ca, 501, 0x01cc, 499,
    0x01f1, 501, 0x01f3, 499]
  proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
    ## Looks up ``c`` in ``tab``, which is treated as ``len`` records of
    ## ``stride`` integers each; only the first integer of a record is used as
    ## the key. ``tab`` is expected to be ordered ascending by that key.
    ##
    ## Returns the index (into ``tab``) of the last record whose key is
    ## ``<= c``, or ``-1`` if there is no such record. The caller still has to
    ## check whether ``c`` really belongs to the record that was found.
    var
      remaining = len
      base = 0
    while remaining > 1:
      let half = remaining div 2
      let probe = base + half*stride
      if c >= tab[probe]:
        # Candidate record is at ``probe`` or to its right.
        base = probe
        remaining = remaining - half
      else:
        # Candidate record is strictly to the left of ``probe``.
        remaining = half
    if remaining != 0 and c >= tab[base]:
      result = base
    else:
      result = -1
  proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
    ## Returns the lower-case form of ``c``. Runes that have no entry in the
    ## lower-casing tables are returned unchanged.
    ## If possible, prefer ``toLower`` over ``toUpper``.
    let code = RuneImpl(c)
    # First try the range table (records of 3), then the singlet table
    # (records of 2). Stored deltas carry a bias of 500.
    let rangeIdx = binarySearch(code, tolowerRanges, len(tolowerRanges) div 3, 3)
    if rangeIdx >= 0 and code >= tolowerRanges[rangeIdx] and
        code <= tolowerRanges[rangeIdx+1]:
      return Rune(code + tolowerRanges[rangeIdx+2] - 500)
    let singleIdx = binarySearch(code, tolowerSinglets,
                                 len(tolowerSinglets) div 2, 2)
    if singleIdx >= 0 and code == tolowerSinglets[singleIdx]:
      return Rune(code + tolowerSinglets[singleIdx+1] - 500)
    result = Rune(code)
  proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
    ## Returns the upper-case form of ``c``. Runes that have no entry in the
    ## upper-casing tables are returned unchanged.
    ## If possible, prefer ``toLower`` over ``toUpper``.
    let code = RuneImpl(c)
    # First try the range table (records of 3), then the singlet table
    # (records of 2). Stored deltas carry a bias of 500.
    let rangeIdx = binarySearch(code, toupperRanges, len(toupperRanges) div 3, 3)
    if rangeIdx >= 0 and code >= toupperRanges[rangeIdx] and
        code <= toupperRanges[rangeIdx+1]:
      return Rune(code + toupperRanges[rangeIdx+2] - 500)
    let singleIdx = binarySearch(code, toupperSinglets,
                                 len(toupperSinglets) div 2, 2)
    if singleIdx >= 0 and code == toupperSinglets[singleIdx]:
      return Rune(code + toupperSinglets[singleIdx+1] - 500)
    result = Rune(code)
  proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
    ## Returns the title-case form of ``c``. Runes that are not listed in
    ## ``toTitleSinglets`` are returned unchanged.
    let code = RuneImpl(c)
    let idx = binarySearch(code, toTitleSinglets, len(toTitleSinglets) div 2, 2)
    if idx >= 0 and code == toTitleSinglets[idx]:
      # Stored deltas carry a bias of 500.
      result = Rune(code + toTitleSinglets[idx+1] - 500)
    else:
      result = Rune(code)
  proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is a lower case Unicode character.
    ## If possible, prefer ``isLower`` over ``isUpper``.
    let code = RuneImpl(c)
    # Note: toUpperRanges is correct here! A rune counts as lower case when
    # the upper-casing tables contain an entry for it.
    let rangeIdx = binarySearch(code, toupperRanges, len(toupperRanges) div 3, 3)
    if rangeIdx >= 0 and code >= toupperRanges[rangeIdx] and
        code <= toupperRanges[rangeIdx+1]:
      result = true
    else:
      let singleIdx = binarySearch(code, toupperSinglets,
                                   len(toupperSinglets) div 2, 2)
      result = singleIdx >= 0 and code == toupperSinglets[singleIdx]
  proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is an upper case Unicode character.
    ## If possible, prefer ``isLower`` over ``isUpper``.
    let code = RuneImpl(c)
    # Note: toLowerRanges is correct here! A rune counts as upper case when
    # the lower-casing tables contain an entry for it.
    let rangeIdx = binarySearch(code, tolowerRanges, len(tolowerRanges) div 3, 3)
    if rangeIdx >= 0 and code >= tolowerRanges[rangeIdx] and
        code <= tolowerRanges[rangeIdx+1]:
      result = true
    else:
      let singleIdx = binarySearch(code, tolowerSinglets,
                                   len(tolowerSinglets) div 2, 2)
      result = singleIdx >= 0 and code == tolowerSinglets[singleIdx]
  proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is an *alpha* Unicode character (i.e., a letter).
    # Cased letters are recognised through the case tables; everything else
    # is looked up in ``alphaRanges`` (pairs) and ``alphaSinglets`` (single
    # values).
    if isUpper(c) or isLower(c):
      return true
    let code = RuneImpl(c)
    let rangeIdx = binarySearch(code, alphaRanges, len(alphaRanges) div 2, 2)
    if rangeIdx >= 0 and code >= alphaRanges[rangeIdx] and
        code <= alphaRanges[rangeIdx+1]:
      return true
    let singleIdx = binarySearch(code, alphaSinglets, len(alphaSinglets), 1)
    result = singleIdx >= 0 and code == alphaSinglets[singleIdx]
  proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is a Unicode titlecase character, i.e. a rune
    ## that both ``isUpper`` and ``isLower`` report as true.
    result = isUpper(c) and isLower(c)
  proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is a Unicode whitespace character, i.e. it lies
    ## inside one of the ``first, last`` pairs stored in ``spaceRanges``.
    let code = RuneImpl(c)
    let idx = binarySearch(code, spaceRanges, len(spaceRanges) div 2, 2)
    result = idx >= 0 and code >= spaceRanges[idx] and code <= spaceRanges[idx+1]
  proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is a Unicode combining character, i.e. it falls
    ## into one of the code point ranges 0x0300..0x036f, 0x1ab0..0x1aff,
    ## 0x1dc0..0x1dff, 0x20d0..0x20ff or 0xfe20..0xfe2f.
    let code = RuneImpl(c)
    if code < 0x0300:
      # Everything below the first range (ASCII included) is rejected at once.
      return false
    result = code <= 0x036f or
      (code >= 0x1ab0 and code <= 0x1aff) or
      (code >= 0x1dc0 and code <= 0x1dff) or
      (code >= 0x20d0 and code <= 0x20ff) or
      (code >= 0xfe20 and code <= 0xfe2f)
  template runeCheck(s, runeProc) =
    ## Common code for isAlpha and isSpace.
    ##
    ## Sets ``result`` to true iff ``s`` is non-empty and ``runeProc`` holds
    ## for every rune of ``s``; scanning stops at the first failing rune.
    result = len(s) != 0
    var
      pos = 0
      current: Rune
    while pos < len(s) and result:
      fastRuneAt(s, pos, current, doInc=true)
      # ``result`` is known to be true here, so it simply takes the verdict.
      result = runeProc(current)
  proc isAlpha*(s: string): bool {.noSideEffect, procvar,
    rtl, extern: "nuc$1Str".} =
    ## Returns true iff `s` is non-empty and consists solely of alphabetic
    ## unicode characters.
    runeCheck(s, isAlpha)
  proc isSpace*(s: string): bool {.noSideEffect, procvar,
    rtl, extern: "nuc$1Str".} =
    ## Returns true iff `s` is non-empty and consists solely of whitespace
    ## unicode characters.
    runeCheck(s, isWhiteSpace)
  template runeCaseCheck(s, runeProc, skipNonAlpha) =
    ## Common code for rune.isLower and rune.isUpper.
    ##
    ## Returns (from the enclosing proc) false for an empty ``s``. When
    ## ``skipNonAlpha`` is true only alphabetic runes are tested with
    ## ``runeProc`` and at least one alphabetic rune must be present;
    ## otherwise every rune must satisfy ``runeProc``.
    if len(s) == 0: return false
    var
      pos = 0
      current: Rune
      sawAlpha = false
    while pos < len(s):
      fastRuneAt(s, pos, current, doInc=true)
      if skipNonAlpha:
        let currentIsAlpha = isAlpha(current)
        sawAlpha = sawAlpha or currentIsAlpha
        if currentIsAlpha and not runeProc(current):
          return false
      elif not runeProc(current):
        return false
    return (not skipNonAlpha) or sawAlpha
  proc isLower*(s: string, skipNonAlpha: bool): bool {.
    deprecated: "Deprecated since version 0.20 since its semantics are unclear".} =
    ## Tests whether ``s`` is written in lower case.
    ##
    ## * ``skipNonAlpha == true``: non-alphabetical runes are ignored; the
    ##   result is true when every alphabetical rune is lower case and there
    ##   is at least one alphabetical rune.
    ## * ``skipNonAlpha == false``: the result is true only when every rune
    ##   of ``s`` is lower case.
    ##
    ## An empty ``s`` always yields false.
    runeCaseCheck(s, isLower, skipNonAlpha)
  proc isUpper*(s: string, skipNonAlpha: bool): bool {.
    deprecated: "Deprecated since version 0.20 since its semantics are unclear".} =
    ## Tests whether ``s`` is written in upper case.
    ##
    ## * ``skipNonAlpha == true``: non-alphabetical runes are ignored; the
    ##   result is true when every alphabetical rune is upper case and there
    ##   is at least one alphabetical rune.
    ## * ``skipNonAlpha == false``: the result is true only when every rune
    ##   of ``s`` is upper case.
    ##
    ## An empty ``s`` always yields false.
    runeCaseCheck(s, isUpper, skipNonAlpha)
  template convertRune(s, runeProc) =
    ## Convert runes in `s` using `runeProc` as the converter and store the
    ## re-encoded text in ``result``.
    ##
    ## Input and output are tracked with separate byte offsets: a converted
    ## rune may need a different number of UTF-8 bytes than the rune it
    ## replaces, so writing at the *source* offset would leave gaps in, or
    ## overwrite parts of, the output.
    result = newString(len(s))
    var
      readPos = 0
      writePos = 0
      rune: Rune
    while readPos < len(s):
      fastRuneAt(s, readPos, rune, doInc=true)
      rune = runeProc(rune)
      # ``doInc=true`` advances ``writePos`` past the bytes just written.
      fastToUTF8Copy(rune, result, writePos, doInc=true)
    # Make sure the result holds exactly the bytes that were written, even
    # when the output is shorter than the input.
    setLen(result, writePos)
  proc toUpper*(s: string): string {.noSideEffect, procvar,
    rtl, extern: "nuc$1Str".} =
    ## Returns a copy of `s` in which every rune has been passed through the
    ## rune-level ``toUpper``.
    convertRune(s, toUpper)
  proc toLower*(s: string): string {.noSideEffect, procvar,
    rtl, extern: "nuc$1Str".} =
    ## Returns a copy of `s` in which every rune has been passed through the
    ## rune-level ``toLower``.
    convertRune(s, toLower)
  proc swapCase*(s: string): string {.noSideEffect, procvar,
    rtl, extern: "nuc$1".} =
    ## Swaps the case of unicode characters in `s`
    ##
    ## Returns a new string in which upper-case runes are replaced by their
    ## lower-case form and lower-case runes by their upper-case form; all
    ## other runes are copied unchanged.
    var
      readPos = 0
      writePos = 0
      rune: Rune
    result = newString(len(s))
    while readPos < len(s):
      fastRuneAt(s, readPos, rune)
      if rune.isUpper():
        rune = rune.toLower()
      elif rune.isLower():
        rune = rune.toUpper()
      # Write at the output offset, not the input offset: the swapped rune
      # may occupy a different number of UTF-8 bytes than the original.
      fastToUTF8Copy(rune, result, writePos, doInc=true)
    # Make sure the result holds exactly the bytes that were written.
    setLen(result, writePos)
  proc capitalize*(s: string): string {.noSideEffect, procvar,
    rtl, extern: "nuc$1".} =
    ## Returns `s` with its first rune converted to upper case; the rest of
    ## the string is copied verbatim. An empty `s` is returned as is.
    if len(s) == 0:
      return s
    var
      firstRune: Rune
      restStart = 0
    # After decoding, ``restStart`` is the byte offset just past the first rune.
    fastRuneAt(s, restStart, firstRune, doInc=true)
    result = $toUpper(firstRune) & substr(s, restStart)
  proc translate*(s: string, replacements: proc(key: string): string): string {.
    rtl, extern: "nuc$1".} =
    ## Translates words in a string using the `replacements` proc to substitute
    ## words inside `s` with their replacements
    ##
    ## A word is a maximal run of non-whitespace runes. `replacements` is any
    ## proc that takes a word and returns a new word to fill its place.
    ## Whitespace runes are copied to the result unchanged.

    # Reserve as much as the input needs: if the translated text is not
    # longer than the input no further allocation is required.
    result = newStringOfCap(s.len)
    var
      pos = 0
      runeStart = 0
      wordStart = 0
      inWord = false
      rune: Rune
    while pos < len(s):
      runeStart = pos
      fastRuneAt(s, pos, rune)
      if rune.isWhiteSpace():
        if inWord:
          # The whitespace rune terminates the current word.
          result.add(replacements(s[wordStart ..< runeStart]))
          inWord = false
        result.add($rune)
      elif not inWord:
        # First rune of a new word: remember where it starts.
        inWord = true
        wordStart = runeStart
    if wordStart < len(s) and inWord:
      # The input ended in the middle of a word.
      result.add(replacements(s[wordStart .. ^1]))
  proc title*(s: string): string {.noSideEffect, procvar,
    rtl, extern: "nuc$1".} =
    ## Converts `s` to a unicode title.
    ##
    ## Returns a new string such that the first character
    ## in each word inside `s` is capitalized. Words are delimited by
    ## whitespace runes; all other runes are copied unchanged.
    var
      readPos = 0
      writePos = 0
      rune: Rune
      atWordStart = true
    result = newString(len(s))
    while readPos < len(s):
      fastRuneAt(s, readPos, rune)
      if rune.isWhiteSpace():
        atWordStart = true
      elif atWordStart:
        rune = rune.toUpper()
        atWordStart = false
      # Write at the output offset, not the input offset: an upper-cased rune
      # may occupy a different number of UTF-8 bytes than the original.
      fastToUTF8Copy(rune, result, writePos, doInc=true)
    # Make sure the result holds exactly the bytes that were written.
    setLen(result, writePos)
  proc isTitle*(s: string): bool {.noSideEffect, procvar,
    rtl, extern: "nuc$1Str",
    deprecated: "Deprecated since version 0.20 since its semantics are unclear".}=
    ## Checks whether or not `s` is a unicode title.
    ##
    ## Returns true if `s` is non-empty and the first rune of every
    ## whitespace-delimited word in `s` is upper case.
    if s.len == 0:
      return false
    result = true
    var
      pos = 0
      rune: Rune
      atWordStart = true
    while pos < len(s) and result:
      fastRuneAt(s, pos, rune, doInc=true)
      if rune.isWhiteSpace():
        atWordStart = true
      elif atWordStart:
        # ``result`` is still true here, so the word's first rune decides.
        result = rune.isUpper()
        atWordStart = false
  iterator runes*(s: string): Rune =
    ## Yields the runes of the string ``s`` one after another, in order.
    var
      pos = 0
      current: Rune
    while pos < len(s):
      fastRuneAt(s, pos, current, true)
      yield current
  iterator utf8*(s: string): string =
    ## Yields, for each unicode character of ``s`` in order, the substring of
    ## ``s`` holding that character's UTF-8 bytes.
    var start = 0
    while start < s.len:
      let stop = start + runeLenAt(s, start)
      yield s[start .. (stop-1)]
      start = stop
  proc toRunes*(s: string): seq[Rune] =
    ## Collects all runes of ``s``, in order, into a new sequence.
    result = newSeq[Rune]()
    for rune in runes(s):
      add(result, rune)
  proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} =
    ## Compares two UTF-8 strings rune by rune, ignoring case. Returns:
    ##
    ## | 0 iff a == b
    ## | < 0 iff a < b
    ## | > 0 iff a > b
    ##
    ## Runes are compared after ``toLower``. If one string is exhausted while
    ## all compared runes were equal, the string with text left over is the
    ## greater one.
    var
      i = 0
      j = 0
      ar, br: Rune
    while i < a.len and j < b.len:
      # slow path:
      fastRuneAt(a, i, ar)
      fastRuneAt(b, j, br)
      result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br))
      if result != 0: return
    # Only the bytes not yet consumed decide the outcome. Comparing the total
    # byte lengths would be wrong because runes that are equal ignoring case
    # can have UTF-8 encodings of different length.
    result = (a.len - i) - (b.len - j)
  proc reversed*(s: string): string =
    ## Returns the reverse of ``s``, interpreting it as Unicode characters.
    ## Unicode combining characters are correctly interpreted as well:
    ##
    ## .. code-block:: nim
    ##
    ##   assert reversed("Reverse this!") == "!siht esreveR"
    ##   assert reversed("先秦兩漢") == "漢兩秦先"
    ##   assert reversed("as⃝df̅") == "f̅ds⃝a"
    ##   assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
    var
      i = 0
      lastI = 0
      newPos = len(s) - 1
      # Index of the last source byte already copied. It starts at -1 (no
      # byte copied yet) so that byte 0 is not lost when ``s`` begins with a
      # combining rune and the first block therefore starts at offset 0.
      blockPos = -1
      r: Rune

    template reverseUntil(pos) =
      # Copies the source bytes ``blockPos+1 .. pos-1`` to the current end of
      # the unfilled part of ``result``. Bytes inside the block keep their
      # order; only the blocks themselves end up reversed.
      var j = pos - 1
      while j > blockPos:
        result[newPos] = s[j]
        dec j
        dec newPos
      blockPos = pos - 1

    result = newString(len(s))
    while i < len(s):
      lastI = i
      fastRuneAt(s, i, r, true)
      # A non-combining rune starts a new block, so everything before it can
      # be flushed. Combining runes stay attached to the preceding rune.
      if not isCombining(r):
        reverseUntil(lastI)
    # Flush the final block.
    reverseUntil(len(s))
  proc graphemeLen*(s: string; i: Natural): Natural =
    ## The number of bytes belonging to 's[i]' including following combining
    ## characters. Yields 0 when ``i`` is not less than ``s.len``.
    var
      pos = i.int
      base, following: Rune
    if pos < s.len:
      fastRuneAt(s, pos, base, true)
      result = pos-i
      # Extend the length for as long as combining runes follow.
      while pos < s.len:
        fastRuneAt(s, pos, following, true)
        if not isCombining(following): break
        result = pos-i
  proc lastRune*(s: string; last: int): (Rune, int) =
    ## Decodes the last rune of 's[0..last]'. Returns the rune and its length
    ## in bytes.
    if s[last] <= chr(127):
      # Single-byte (ASCII) rune.
      return (Rune(s[last]), 1)
    # Step back over continuation bytes (bit pattern 10xxxxxx).
    var back = 0
    while last-back >= 0 and ord(s[last-back]) shr 6 == 0b10: inc(back)
    var decoded: Rune
    fastRuneAt(s, last-back, decoded, false)
    result = (decoded, back+1)
  proc size*(r: Rune): int {.noSideEffect.} =
    ## Returns the number of bytes the rune ``r`` takes. Values above
    ## 0x7FFFFFFF are reported as 1.
    let code = r.uint32
    result =
      if code <= 0x007F: 1
      elif code <= 0x07FF: 2
      elif code <= 0xFFFF: 3
      elif code <= 0x1FFFFF: 4
      elif code <= 0x3FFFFFF: 5
      elif code <= 0x7FFFFFFF: 6
      else: 1
  # --------- Private helpers for the different split separators -----------
  proc stringHasSep(s: string, index: int, seps: openarray[Rune]): bool =
    ## True iff the rune decoded at byte offset ``index`` of ``s`` is one of
    ## ``seps``.
    var current: Rune
    fastRuneAt(s, index, current, false)
    result = current in seps
  proc stringHasSep(s: string, index: int, sep: Rune): bool =
    ## True iff the rune decoded at byte offset ``index`` of ``s`` equals
    ## ``sep``.
    var current: Rune
    fastRuneAt(s, index, current, false)
    result = sep == current
  template splitCommon(s, sep, maxsplit: untyped, sepLen: int = -1) =
    ## Common code for split procedures
    ##
    ## Yields the substrings of ``s`` that lie between separators. ``sep`` is
    ## either a single ``Rune`` (then ``sepLen`` must be its byte length) or a
    ## collection of runes. At most ``maxsplit`` splits are performed; the
    ## substring yielded after the last split holds the unsplit remainder.
    ## A negative ``maxsplit`` means no limit. An empty ``s`` yields nothing.
    var
      last = 0
      splits = maxsplit
    if len(s) > 0:
      while last <= len(s):
        let first = last
        # Advance to the next separator. Non-separator runes are skipped by
        # *their own* byte length so that ``last`` always stays on a rune
        # boundary; stepping by the separator's length here would land in
        # the middle of runes whose encoding has a different size.
        while last < len(s) and not stringHasSep(s, last, sep):
          inc(last, runeLenAt(s, last))
        if splits == 0: last = len(s)
        yield s[first .. (last - 1)]
        if splits == 0: break
        dec(splits)
        # Skip the separator itself (or step past the end of the string).
        when sep is Rune:
          inc(last, sepLen)
        else:
          inc(last, if last < len(s): runeLenAt(s, last) else: 1)
  iterator split*(s: string, seps: openarray[Rune] = unicodeSpaces,
    maxsplit: int = -1): string =
    ## Splits the unicode string `s` into substrings using a group of separators.
    ##
    ## Every rune contained in `seps` ends the current substring; two adjacent
    ## separators therefore produce an empty substring. `maxsplit` limits the
    ## number of splits (negative: unlimited).
    ##
    ## .. code-block:: nim
    ##   for word in split("this\lis an\texample"):
    ##     writeLine(stdout, word)
    ##
    ## ...generates this output:
    ##
    ## .. code-block::
    ##   "this"
    ##   "is"
    ##   "an"
    ##   "example"
    ##
    ## And the following code:
    ##
    ## .. code-block:: nim
    ##   for word in split("this:is;an$example", ";:$".toRunes):
    ##     writeLine(stdout, word)
    ##
    ## ...produces the same output as the first example. The code:
    ##
    ## .. code-block:: nim
    ##   let date = "2012-11-20T22:08:08.398990"
    ##   let separators = " -:T".toRunes
    ##   for number in split(date, separators):
    ##     writeLine(stdout, number)
    ##
    ## ...results in:
    ##
    ## .. code-block::
    ##   "2012"
    ##   "11"
    ##   "20"
    ##   "22"
    ##   "08"
    ##   "08.398990"
    ##
    splitCommon(s, seps, maxsplit)
  iterator splitWhitespace*(s: string): string =
    ## Splits a unicode string at every rune contained in ``unicodeSpaces``,
    ## without a limit on the number of splits.
    splitCommon(s, unicodeSpaces, -1)
  template accResult(iter: untyped) =
    ## Gathers everything produced by ``iter`` into ``result`` (a sequence).
    result = @[]
    for item in iter:
      result.add(item)
  proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
    rtl, extern: "ncuSplitWhitespace".} =
    ## Proc counterpart of the `splitWhitespace <#splitWhitespace.i,string>`_
    ## iterator: returns all substrings it yields as a sequence.
    accResult(splitWhitespace(s))
  iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
    ## Splits the unicode string `s` into substrings using a single separator.
    ##
    ## Every occurrence of the rune `sep` ends the current substring, so
    ## leading, trailing and adjacent separators produce empty substrings.
    ## `maxsplit` limits the number of splits (negative: unlimited).
    ## The code:
    ##
    ## .. code-block:: nim
    ##   for word in split(";;this;is;an;;example;;;", ';'.Rune):
    ##     writeLine(stdout, word)
    ##
    ## Results in:
    ##
    ## .. code-block::
    ##   ""
    ##   ""
    ##   "this"
    ##   "is"
    ##   "an"
    ##   ""
    ##   "example"
    ##   ""
    ##   ""
    ##   ""
    ##
    splitCommon(s, sep, maxsplit, sep.size)
  proc split*(s: string, seps: openarray[Rune] = unicodeSpaces, maxsplit: int = -1): seq[string] {.
    noSideEffect, rtl, extern: "nucSplitRunes".} =
    ## Proc counterpart of the `split iterator <#split.i,string,openarray[Rune]>`_:
    ## returns all substrings it yields as a sequence.
    accResult(split(s, seps, maxsplit))
  proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
    rtl, extern: "nucSplitRune".} =
    ## Proc counterpart of the `split iterator <#split.i,string,Rune>`_:
    ## returns all substrings it yields as a sequence.
    accResult(split(s, sep, maxsplit))
  proc strip*(s: string, leading = true, trailing = true,
    runes: openarray[Rune] = unicodeSpaces): string {.noSideEffect,
    rtl, extern: "nucStrip".} =
    ## Strips leading or trailing `runes` from `s` and returns
    ## the resulting string.
    ##
    ## If `leading` is true, leading `runes` are stripped.
    ## If `trailing` is true, trailing `runes` are stripped.
    ## If both are false, the string is returned unchanged.
    ## A string consisting solely of `runes` becomes empty as soon as either
    ## flag is set.
    var
      s_i = 0 ## index of the first byte that is kept
      e_i = len(s) - 1 ## index of the last byte that is kept
      rune: Rune
    if leading:
      while s_i < len(s):
        var next = s_i
        fastRuneAt(s, next, rune)
        if not runes.contains(rune): break
        s_i = next # drop this rune and continue with the following one
    if trailing:
      # Never look at bytes that were already removed from the front.
      while e_i >= s_i:
        # Find the start of the rune ending at ``e_i`` by stepping back over
        # UTF-8 continuation bytes (bit pattern 10xxxxxx).
        var start = e_i
        while start > s_i and ord(s[start]) shr 6 == 0b10:
          dec(start)
        var next = start
        fastRuneAt(s, next, rune)
        # If the decoded rune does not reach up to ``e_i`` the tail is not a
        # well-formed rune; leave it untouched.
        if next <= e_i or not runes.contains(rune): break
        e_i = start - 1
    # ``e_i`` never drops below ``s_i - 1``, so ``newLen`` is never negative.
    let newLen = e_i - s_i + 1
    result = newStringOfCap(newLen)
    if newLen > 0:
      result.add s[s_i .. e_i]
  1832. proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
  1833. rtl, extern: "nucRepeatRune".} =
  1834. ## Returns a string of `count` Runes `c`.
  1835. ##
  1836. ## The returned string will have a rune-length of `count`.
  1837. let s = $c
  1838. result = newStringOfCap(count * s.len)
  1839. for i in 0 ..< count:
  1840. result.add s
  1841. proc align*(s: string, count: Natural, padding = ' '.Rune): string {.
  1842. noSideEffect, rtl, extern: "nucAlignString".} =
  1843. ## Aligns a unicode string `s` with `padding`, so that it has a rune-length
  1844. ## of `count`.
  1845. ##
  1846. ## `padding` characters (by default spaces) are added before `s` resulting in
  1847. ## right alignment. If ``s.runelen >= count``, no spaces are added and `s` is
  1848. ## returned unchanged. If you need to left align a string use the `alignLeft
  1849. ## proc <#alignLeft>`_.
  1850. runnableExamples:
  1851. assert align("abc", 4) == " abc"
  1852. assert align("a", 0) == "a"
  1853. assert align("1232", 6) == " 1232"
  1854. assert align("1232", 6, '#'.Rune) == "##1232"
  1855. assert align("Åge", 5) == " Åge"
  1856. assert align("×", 4, '_'.Rune) == "___×"
  1857. let sLen = s.runeLen
  1858. if sLen < count:
  1859. let padStr = $padding
  1860. result = newStringOfCap(padStr.len * count)
  1861. let spaces = count - sLen
  1862. for i in 0 ..< spaces: result.add padStr
  1863. result.add s
  1864. else:
  1865. result = s
  1866. proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.
  1867. noSideEffect.} =
  1868. ## Left-Aligns a unicode string `s` with `padding`, so that it has a
  1869. ## rune-length of `count`.
  1870. ##
  1871. ## `padding` characters (by default spaces) are added after `s` resulting in
  1872. ## left alignment. If ``s.runelen >= count``, no spaces are added and `s` is
  1873. ## returned unchanged. If you need to right align a string use the `align
  1874. ## proc <#align>`_.
  1875. runnableExamples:
  1876. assert alignLeft("abc", 4) == "abc "
  1877. assert alignLeft("a", 0) == "a"
  1878. assert alignLeft("1232", 6) == "1232 "
  1879. assert alignLeft("1232", 6, '#'.Rune) == "1232##"
  1880. assert alignLeft("Åge", 5) == "Åge "
  1881. assert alignLeft("×", 4, '_'.Rune) == "×___"
  1882. let sLen = s.runeLen
  1883. if sLen < count:
  1884. let padStr = $padding
  1885. result = newStringOfCap(s.len + (count - sLen) * padStr.len)
  1886. result.add s
  1887. for i in sLen ..< count:
  1888. result.add padStr
  1889. else:
  1890. result = s
  1891. when isMainModule:
  1892. proc asRune(s: static[string]): Rune =
  1893. ## Compile-time conversion proc for converting string literals to a Rune
  1894. ## value. Returns the first Rune of the specified string.
  1895. ##
  1896. ## Shortcuts code like ``"å".runeAt(0)`` to ``"å".asRune`` and returns a
  1897. ## compile-time constant.
  1898. if s.len == 0: Rune(0)
  1899. else: s.runeAt(0)
  1900. let
  1901. someString = "öÑ"
  1902. someRunes = @[runeAt(someString, 0), runeAt(someString, 2)]
  1903. compared = (someString == $someRunes)
  1904. doAssert compared == true
  1905. proc test_replacements(word: string): string =
  1906. case word
  1907. of "two":
  1908. return "2"
  1909. of "foo":
  1910. return "BAR"
  1911. of "βeta":
  1912. return "beta"
  1913. of "alpha":
  1914. return "αlpha"
  1915. else:
  1916. return "12345"
  1917. doAssert translate("two not alpha foo βeta", test_replacements) == "2 12345 αlpha BAR beta"
  1918. doAssert translate(" two not foo βeta ", test_replacements) == " 2 12345 BAR beta "
  1919. doAssert title("foo bar") == "Foo Bar"
  1920. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  1921. doAssert title("") == ""
  1922. doAssert capitalize("βeta") == "Βeta"
  1923. doAssert capitalize("foo") == "Foo"
  1924. doAssert capitalize("") == ""
  1925. doAssert swapCase("FooBar") == "fOObAR"
  1926. doAssert swapCase(" ") == " "
  1927. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  1928. doAssert swapCase("a✓B") == "A✓b"
  1929. doAssert swapCase("") == ""
  1930. doAssert isAlpha("r")
  1931. doAssert isAlpha("α")
  1932. doAssert(not isAlpha("$"))
  1933. doAssert(not isAlpha(""))
  1934. doAssert isAlpha("Βeta")
  1935. doAssert isAlpha("Args")
  1936. doAssert(not isAlpha("$Foo✓"))
  1937. doAssert isSpace("\t")
  1938. doAssert isSpace("\l")
  1939. doAssert(not isSpace("Β"))
  1940. doAssert(not isSpace("Βeta"))
  1941. doAssert isSpace("\t\l \v\r\f")
  1942. doAssert isSpace(" ")
  1943. doAssert(not isSpace(""))
  1944. doAssert(not isSpace("ΑΓc \td"))
  1945. doAssert(not isLower(' '.Rune))
  1946. doAssert(not isUpper(' '.Rune))
  1947. doAssert toUpper("Γ") == "Γ"
  1948. doAssert toUpper("b") == "B"
  1949. doAssert toUpper("α") == "Α"
  1950. doAssert toUpper("✓") == "✓"
  1951. doAssert toUpper("") == ""
  1952. doAssert toUpper("ΑΒΓ") == "ΑΒΓ"
  1953. doAssert toUpper("AAccβ") == "AACCΒ"
  1954. doAssert toUpper("A✓$β") == "A✓$Β"
  1955. doAssert toLower("a") == "a"
  1956. doAssert toLower("γ") == "γ"
  1957. doAssert toLower("Γ") == "γ"
  1958. doAssert toLower("4") == "4"
  1959. doAssert toLower("") == ""
  1960. doAssert toLower("abcdγ") == "abcdγ"
  1961. doAssert toLower("abCDΓ") == "abcdγ"
  1962. doAssert toLower("33aaΓ") == "33aaγ"
  1963. doAssert reversed("Reverse this!") == "!siht esreveR"
  1964. doAssert reversed("先秦兩漢") == "漢兩秦先"
  1965. doAssert reversed("as⃝df̅") == "f̅ds⃝a"
  1966. doAssert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  1967. doAssert len(toRunes("as⃝df̅")) == runeLen("as⃝df̅")
  1968. const test = "as⃝"
  1969. doAssert lastRune(test, test.len-1)[1] == 3
  1970. doAssert graphemeLen("è", 0) == 2
  1971. # test for rune positioning and runeSubStr()
  1972. let s = "Hänsel ««: 10,00€"
  1973. var t = ""
  1974. for c in s.utf8:
  1975. t.add c
  1976. doAssert(s == t)
  1977. doAssert(runeReverseOffset(s, 1) == (20, 18))
  1978. doAssert(runeReverseOffset(s, 19) == (-1, 18))
  1979. doAssert(runeStrAtPos(s, 0) == "H")
  1980. doAssert(runeSubStr(s, 0, 1) == "H")
  1981. doAssert(runeStrAtPos(s, 10) == ":")
  1982. doAssert(runeSubStr(s, 10, 1) == ":")
  1983. doAssert(runeStrAtPos(s, 9) == "«")
  1984. doAssert(runeSubStr(s, 9, 1) == "«")
  1985. doAssert(runeStrAtPos(s, 17) == "€")
  1986. doAssert(runeSubStr(s, 17, 1) == "€")
  1987. # echo runeStrAtPos(s, 18) # index error
  1988. doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€")
  1989. doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€")
  1990. doAssert(runeSubStr(s, 10) == ": 10,00€")
  1991. doAssert(runeSubStr(s, 18) == "")
  1992. doAssert(runeSubStr(s, 0, 10) == "Hänsel ««")
  1993. doAssert(runeSubStr(s, 12) == "10,00€")
  1994. doAssert(runeSubStr(s, -6) == "10,00€")
  1995. doAssert(runeSubStr(s, 12, 5) == "10,00")
  1996. doAssert(runeSubStr(s, 12, -1) == "10,00")
  1997. doAssert(runeSubStr(s, -6, 5) == "10,00")
  1998. doAssert(runeSubStr(s, -6, -1) == "10,00")
  1999. doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€")
  2000. doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€")
  2001. doAssert(runeSubStr(s, 0, -100) == "")
  2002. doAssert(runeSubStr(s, 100, -100) == "")
  2003. block splitTests:
  2004. let s = " this is an example "
  2005. let s2 = ":this;is;an:example;;"
  2006. let s3 = ":this×is×an:example××"
  2007. doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
  2008. doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an", "example", "", ""]
  2009. doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is", "an", "example", "", ""]
  2010. doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example "]
  2011. doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example "]
  2012. block stripTests:
  2013. doAssert(strip("") == "")
  2014. doAssert(strip(" ") == "")
  2015. doAssert(strip("y") == "y")
  2016. doAssert(strip(" foofoofoo ") == "foofoofoo")
  2017. doAssert(strip("sfoofoofoos", runes = ['s'.Rune]) == "foofoofoo")
  2018. block:
  2019. let stripTestRunes = ['b'.Rune, 'a'.Rune, 'r'.Rune]
  2020. doAssert(strip("barfoofoofoobar", runes = stripTestRunes) == "foofoofoo")
  2021. doAssert(strip("sfoofoofoos", leading = false, runes = ['s'.Rune]) == "sfoofoofoo")
  2022. doAssert(strip("sfoofoofoos", trailing = false, runes = ['s'.Rune]) == "foofoofoos")
  2023. block:
  2024. let stripTestRunes = ["«".asRune, "»".asRune]
  2025. doAssert(strip("«TEXT»", runes = stripTestRunes) == "TEXT")
  2026. doAssert(strip("copyright©", leading = false, runes = ["©".asRune]) == "copyright")
  2027. doAssert(strip("¿Question?", trailing = false, runes = ["¿".asRune]) == "Question?")
  2028. doAssert(strip("×text×", leading = false, runes = ["×".asRune]) == "×text")
  2029. doAssert(strip("×text×", trailing = false, runes = ["×".asRune]) == "text×")
  2030. block repeatTests:
  2031. doAssert repeat('c'.Rune, 5) == "ccccc"
  2032. doAssert repeat("×".asRune, 5) == "×××××"
  2033. block alignTests:
  2034. doAssert align("abc", 4) == " abc"
  2035. doAssert align("a", 0) == "a"
  2036. doAssert align("1232", 6) == " 1232"
  2037. doAssert align("1232", 6, '#'.Rune) == "##1232"
  2038. doAssert align("1232", 6, "×".asRune) == "××1232"
  2039. doAssert alignLeft("abc", 4) == "abc "
  2040. doAssert alignLeft("a", 0) == "a"
  2041. doAssert alignLeft("1232", 6) == "1232 "
  2042. doAssert alignLeft("1232", 6, '#'.Rune) == "1232##"
  2043. doAssert alignLeft("1232", 6, "×".asRune) == "1232××"