unicode.nim 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module provides support to handle the Unicode UTF-8 encoding.
  10. {.deadCodeElim: on.}
  11. include "system/inclrtl"
  12. type
  13. RuneImpl = int32 # underlying type of Rune
  14. Rune* = distinct RuneImpl ## type that can hold any Unicode character
  15. Rune16* = distinct int16 ## 16 bit Unicode character
  16. {.deprecated: [TRune: Rune, TRune16: Rune16].}
  17. proc `<=%`*(a, b: Rune): bool = return int(a) <=% int(b)
  18. proc `<%`*(a, b: Rune): bool = return int(a) <% int(b)
  19. proc `==`*(a, b: Rune): bool = return int(a) == int(b)
  20. template ones(n: untyped): untyped = ((1 shl n)-1)
  21. proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} =
  22. ## Returns the number of Unicode characters of the string ``s``
  23. var i = 0
  24. while i < len(s):
  25. if ord(s[i]) <=% 127: inc(i)
  26. elif ord(s[i]) shr 5 == 0b110: inc(i, 2)
  27. elif ord(s[i]) shr 4 == 0b1110: inc(i, 3)
  28. elif ord(s[i]) shr 3 == 0b11110: inc(i, 4)
  29. elif ord(s[i]) shr 2 == 0b111110: inc(i, 5)
  30. elif ord(s[i]) shr 1 == 0b1111110: inc(i, 6)
  31. else: inc i
  32. inc(result)
  33. proc runeLenAt*(s: string, i: Natural): int =
  34. ## Returns the number of bytes the rune starting at ``s[i]`` takes
  35. if ord(s[i]) <=% 127: result = 1
  36. elif ord(s[i]) shr 5 == 0b110: result = 2
  37. elif ord(s[i]) shr 4 == 0b1110: result = 3
  38. elif ord(s[i]) shr 3 == 0b11110: result = 4
  39. elif ord(s[i]) shr 2 == 0b111110: result = 5
  40. elif ord(s[i]) shr 1 == 0b1111110: result = 6
  41. else: result = 1
  42. const replRune = Rune(0xFFFD)
  43. template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) =
  44. ## Returns the Unicode character ``s[i]`` in ``result``. If ``doInc == true``
  45. ## ``i`` is incremented by the number of bytes that have been processed.
  46. bind ones
  47. if ord(s[i]) <=% 127:
  48. result = Rune(ord(s[i]))
  49. when doInc: inc(i)
  50. elif ord(s[i]) shr 5 == 0b110:
  51. # assert(ord(s[i+1]) shr 6 == 0b10)
  52. if i <= s.len - 2:
  53. result = Rune((ord(s[i]) and (ones(5))) shl 6 or
  54. (ord(s[i+1]) and ones(6)))
  55. when doInc: inc(i, 2)
  56. else:
  57. result = replRune
  58. when doInc: inc(i)
  59. elif ord(s[i]) shr 4 == 0b1110:
  60. # assert(ord(s[i+1]) shr 6 == 0b10)
  61. # assert(ord(s[i+2]) shr 6 == 0b10)
  62. if i <= s.len - 3:
  63. result = Rune((ord(s[i]) and ones(4)) shl 12 or
  64. (ord(s[i+1]) and ones(6)) shl 6 or
  65. (ord(s[i+2]) and ones(6)))
  66. when doInc: inc(i, 3)
  67. else:
  68. result = replRune
  69. when doInc: inc(i)
  70. elif ord(s[i]) shr 3 == 0b11110:
  71. # assert(ord(s[i+1]) shr 6 == 0b10)
  72. # assert(ord(s[i+2]) shr 6 == 0b10)
  73. # assert(ord(s[i+3]) shr 6 == 0b10)
  74. if i <= s.len - 4:
  75. result = Rune((ord(s[i]) and ones(3)) shl 18 or
  76. (ord(s[i+1]) and ones(6)) shl 12 or
  77. (ord(s[i+2]) and ones(6)) shl 6 or
  78. (ord(s[i+3]) and ones(6)))
  79. when doInc: inc(i, 4)
  80. else:
  81. result = replRune
  82. when doInc: inc(i)
  83. elif ord(s[i]) shr 2 == 0b111110:
  84. # assert(ord(s[i+1]) shr 6 == 0b10)
  85. # assert(ord(s[i+2]) shr 6 == 0b10)
  86. # assert(ord(s[i+3]) shr 6 == 0b10)
  87. # assert(ord(s[i+4]) shr 6 == 0b10)
  88. if i <= s.len - 5:
  89. result = Rune((ord(s[i]) and ones(2)) shl 24 or
  90. (ord(s[i+1]) and ones(6)) shl 18 or
  91. (ord(s[i+2]) and ones(6)) shl 12 or
  92. (ord(s[i+3]) and ones(6)) shl 6 or
  93. (ord(s[i+4]) and ones(6)))
  94. when doInc: inc(i, 5)
  95. else:
  96. result = replRune
  97. when doInc: inc(i)
  98. elif ord(s[i]) shr 1 == 0b1111110:
  99. # assert(ord(s[i+1]) shr 6 == 0b10)
  100. # assert(ord(s[i+2]) shr 6 == 0b10)
  101. # assert(ord(s[i+3]) shr 6 == 0b10)
  102. # assert(ord(s[i+4]) shr 6 == 0b10)
  103. # assert(ord(s[i+5]) shr 6 == 0b10)
  104. if i <= s.len - 6:
  105. result = Rune((ord(s[i]) and ones(1)) shl 30 or
  106. (ord(s[i+1]) and ones(6)) shl 24 or
  107. (ord(s[i+2]) and ones(6)) shl 18 or
  108. (ord(s[i+3]) and ones(6)) shl 12 or
  109. (ord(s[i+4]) and ones(6)) shl 6 or
  110. (ord(s[i+5]) and ones(6)))
  111. when doInc: inc(i, 6)
  112. else:
  113. result = replRune
  114. when doInc: inc(i)
  115. else:
  116. result = Rune(ord(s[i]))
  117. when doInc: inc(i)
  118. proc validateUtf8*(s: string): int =
  119. ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  120. ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  121. var i = 0
  122. let L = s.len
  123. while i < L:
  124. if ord(s[i]) <=% 127:
  125. inc(i)
  126. elif ord(s[i]) shr 5 == 0b110:
  127. if ord(s[i]) < 0xc2: return i # Catch overlong ascii representations.
  128. if i+1 < L and ord(s[i+1]) shr 6 == 0b10: inc(i, 2)
  129. else: return i
  130. elif ord(s[i]) shr 4 == 0b1110:
  131. if i+2 < L and ord(s[i+1]) shr 6 == 0b10 and ord(s[i+2]) shr 6 == 0b10:
  132. inc i, 3
  133. else: return i
  134. elif ord(s[i]) shr 3 == 0b11110:
  135. if i+3 < L and ord(s[i+1]) shr 6 == 0b10 and
  136. ord(s[i+2]) shr 6 == 0b10 and
  137. ord(s[i+3]) shr 6 == 0b10:
  138. inc i, 4
  139. else: return i
  140. else:
  141. return i
  142. return -1
  143. proc runeAt*(s: string, i: Natural): Rune =
  144. ## Returns the unicode character in ``s`` at byte index ``i``
  145. fastRuneAt(s, i, result, false)
  146. template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  147. ## Copies UTF-8 representation of `c` into the preallocated string `s`
  148. ## starting at position `pos`. If `doInc == true`, `pos` is incremented
  149. ## by the number of bytes that have been processed.
  150. ##
  151. ## To be the most efficient, make sure `s` is preallocated
  152. ## with an additional amount equal to the byte length of
  153. ## `c`.
  154. var i = RuneImpl(c)
  155. if i <=% 127:
  156. s.setLen(pos+1)
  157. s[pos+0] = chr(i)
  158. when doInc: inc(pos)
  159. elif i <=% 0x07FF:
  160. s.setLen(pos+2)
  161. s[pos+0] = chr((i shr 6) or 0b110_00000)
  162. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  163. when doInc: inc(pos, 2)
  164. elif i <=% 0xFFFF:
  165. s.setLen(pos+3)
  166. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  167. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  168. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  169. when doInc: inc(pos, 3)
  170. elif i <=% 0x001FFFFF:
  171. s.setLen(pos+4)
  172. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  173. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  174. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  175. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  176. when doInc: inc(pos, 4)
  177. elif i <=% 0x03FFFFFF:
  178. s.setLen(pos+5)
  179. s[pos+0] = chr(i shr 24 or 0b111110_00)
  180. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  181. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  182. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  183. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  184. when doInc: inc(pos, 5)
  185. elif i <=% 0x7FFFFFFF:
  186. s.setLen(pos+6)
  187. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  188. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  189. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  190. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  191. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  192. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
  193. when doInc: inc(pos, 6)
  194. else:
  195. discard # error, exception?
  196. proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  197. ## Converts a rune into its UTF-8 representation
  198. result = ""
  199. fastToUTF8Copy(c, result, 0, false)
  200. proc `$`*(rune: Rune): string =
  201. ## Converts a Rune to a string
  202. rune.toUTF8
  203. proc `$`*(runes: seq[Rune]): string =
  204. ## Converts a sequence of Runes to a string
  205. result = ""
  206. for rune in runes: result.add(rune.toUTF8)
  207. proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
  208. ## Returns the byte position of unicode character
  209. ## at position pos in s with an optional start byte position.
  210. ## returns the special value -1 if it runs out of the string
  211. ##
  212. ## Beware: This can lead to unoptimized code and slow execution!
  213. ## Most problems are solve more efficient by using an iterator
  214. ## or conversion to a seq of Rune.
  215. var
  216. i = 0
  217. o = start
  218. while i < pos:
  219. o += runeLenAt(s, o)
  220. if o >= s.len:
  221. return -1
  222. inc i
  223. return o
  224. proc runeAtPos*(s: string, pos: int): Rune =
  225. ## Returns the unicode character at position pos
  226. ##
  227. ## Beware: This can lead to unoptimized code and slow execution!
  228. ## Most problems are solve more efficient by using an iterator
  229. ## or conversion to a seq of Rune.
  230. fastRuneAt(s, runeOffset(s, pos), result, false)
  231. proc runeStrAtPos*(s: string, pos: Natural): string =
  232. ## Returns the unicode character at position pos as UTF8 String
  233. ##
  234. ## Beware: This can lead to unoptimized code and slow execution!
  235. ## Most problems are solve more efficient by using an iterator
  236. ## or conversion to a seq of Rune.
  237. let o = runeOffset(s, pos)
  238. s[o.. (o+runeLenAt(s, o)-1)]
  239. proc runeReverseOffset*(s: string, rev:Positive): (int, int) =
  240. ## Returns a tuple with the the byte offset of the
  241. ## unicode character at position ``rev`` in s counting
  242. ## from the end (starting with 1) and the total
  243. ## number of runes in the string. Returns a negative value
  244. ## for offset if there are to few runes in the string to
  245. ## satisfy the request.
  246. ##
  247. ## Beware: This can lead to unoptimized code and slow execution!
  248. ## Most problems are solve more efficient by using an iterator
  249. ## or conversion to a seq of Rune.
  250. var
  251. a = rev.int
  252. o = 0
  253. x = 0
  254. while o < s.len:
  255. let r = runeLenAt(s, o)
  256. o += r
  257. if a < 0:
  258. x += r
  259. dec a
  260. if a > 0:
  261. return (-a, rev.int-a)
  262. return (x, -a+rev.int)
  263. proc runeSubStr*(s: string, pos:int, len:int = int.high): string =
  264. ## Returns the UTF-8 substring starting at codepoint pos
  265. ## with len codepoints. If pos or len is negativ they count from
  266. ## the end of the string. If len is not given it means the longest
  267. ## possible string.
  268. ##
  269. ## (Needs some examples)
  270. if pos < 0:
  271. let (o, rl) = runeReverseOffset(s, -pos)
  272. if len >= rl:
  273. result = s[o.. s.len-1]
  274. elif len < 0:
  275. let e = rl + len
  276. if e < 0:
  277. result = ""
  278. else:
  279. result = s[o.. runeOffset(s, e-(rl+pos) , o)-1]
  280. else:
  281. result = s[o.. runeOffset(s, len, o)-1]
  282. else:
  283. let o = runeOffset(s, pos)
  284. if o < 0:
  285. result = ""
  286. elif len == int.high:
  287. result = s[o.. s.len-1]
  288. elif len < 0:
  289. let (e, rl) = runeReverseOffset(s, -len)
  290. discard rl
  291. if e <= 0:
  292. result = ""
  293. else:
  294. result = s[o.. e-1]
  295. else:
  296. var e = runeOffset(s, len, o)
  297. if e < 0:
  298. e = s.len
  299. result = s[o.. e-1]
  300. const
  301. alphaRanges = [
  302. 0x00d8, 0x00f6, # -
  303. 0x00f8, 0x01f5, # -
  304. 0x0250, 0x02a8, # -
  305. 0x038e, 0x03a1, # -
  306. 0x03a3, 0x03ce, # -
  307. 0x03d0, 0x03d6, # -
  308. 0x03e2, 0x03f3, # -
  309. 0x0490, 0x04c4, # -
  310. 0x0561, 0x0587, # -
  311. 0x05d0, 0x05ea, # -
  312. 0x05f0, 0x05f2, # -
  313. 0x0621, 0x063a, # -
  314. 0x0640, 0x064a, # -
  315. 0x0671, 0x06b7, # -
  316. 0x06ba, 0x06be, # -
  317. 0x06c0, 0x06ce, # -
  318. 0x06d0, 0x06d3, # -
  319. 0x0905, 0x0939, # -
  320. 0x0958, 0x0961, # -
  321. 0x0985, 0x098c, # -
  322. 0x098f, 0x0990, # -
  323. 0x0993, 0x09a8, # -
  324. 0x09aa, 0x09b0, # -
  325. 0x09b6, 0x09b9, # -
  326. 0x09dc, 0x09dd, # -
  327. 0x09df, 0x09e1, # -
  328. 0x09f0, 0x09f1, # -
  329. 0x0a05, 0x0a0a, # -
  330. 0x0a0f, 0x0a10, # -
  331. 0x0a13, 0x0a28, # -
  332. 0x0a2a, 0x0a30, # -
  333. 0x0a32, 0x0a33, # -
  334. 0x0a35, 0x0a36, # -
  335. 0x0a38, 0x0a39, # -
  336. 0x0a59, 0x0a5c, # -
  337. 0x0a85, 0x0a8b, # -
  338. 0x0a8f, 0x0a91, # -
  339. 0x0a93, 0x0aa8, # -
  340. 0x0aaa, 0x0ab0, # -
  341. 0x0ab2, 0x0ab3, # -
  342. 0x0ab5, 0x0ab9, # -
  343. 0x0b05, 0x0b0c, # -
  344. 0x0b0f, 0x0b10, # -
  345. 0x0b13, 0x0b28, # -
  346. 0x0b2a, 0x0b30, # -
  347. 0x0b32, 0x0b33, # -
  348. 0x0b36, 0x0b39, # -
  349. 0x0b5c, 0x0b5d, # -
  350. 0x0b5f, 0x0b61, # -
  351. 0x0b85, 0x0b8a, # -
  352. 0x0b8e, 0x0b90, # -
  353. 0x0b92, 0x0b95, # -
  354. 0x0b99, 0x0b9a, # -
  355. 0x0b9e, 0x0b9f, # -
  356. 0x0ba3, 0x0ba4, # -
  357. 0x0ba8, 0x0baa, # -
  358. 0x0bae, 0x0bb5, # -
  359. 0x0bb7, 0x0bb9, # -
  360. 0x0c05, 0x0c0c, # -
  361. 0x0c0e, 0x0c10, # -
  362. 0x0c12, 0x0c28, # -
  363. 0x0c2a, 0x0c33, # -
  364. 0x0c35, 0x0c39, # -
  365. 0x0c60, 0x0c61, # -
  366. 0x0c85, 0x0c8c, # -
  367. 0x0c8e, 0x0c90, # -
  368. 0x0c92, 0x0ca8, # -
  369. 0x0caa, 0x0cb3, # -
  370. 0x0cb5, 0x0cb9, # -
  371. 0x0ce0, 0x0ce1, # -
  372. 0x0d05, 0x0d0c, # -
  373. 0x0d0e, 0x0d10, # -
  374. 0x0d12, 0x0d28, # -
  375. 0x0d2a, 0x0d39, # -
  376. 0x0d60, 0x0d61, # -
  377. 0x0e01, 0x0e30, # -
  378. 0x0e32, 0x0e33, # -
  379. 0x0e40, 0x0e46, # -
  380. 0x0e5a, 0x0e5b, # -
  381. 0x0e81, 0x0e82, # -
  382. 0x0e87, 0x0e88, # -
  383. 0x0e94, 0x0e97, # -
  384. 0x0e99, 0x0e9f, # -
  385. 0x0ea1, 0x0ea3, # -
  386. 0x0eaa, 0x0eab, # -
  387. 0x0ead, 0x0eae, # -
  388. 0x0eb2, 0x0eb3, # -
  389. 0x0ec0, 0x0ec4, # -
  390. 0x0edc, 0x0edd, # -
  391. 0x0f18, 0x0f19, # -
  392. 0x0f40, 0x0f47, # -
  393. 0x0f49, 0x0f69, # -
  394. 0x10d0, 0x10f6, # -
  395. 0x1100, 0x1159, # -
  396. 0x115f, 0x11a2, # -
  397. 0x11a8, 0x11f9, # -
  398. 0x1e00, 0x1e9b, # -
  399. 0x1f50, 0x1f57, # -
  400. 0x1f80, 0x1fb4, # -
  401. 0x1fb6, 0x1fbc, # -
  402. 0x1fc2, 0x1fc4, # -
  403. 0x1fc6, 0x1fcc, # -
  404. 0x1fd0, 0x1fd3, # -
  405. 0x1fd6, 0x1fdb, # -
  406. 0x1fe0, 0x1fec, # -
  407. 0x1ff2, 0x1ff4, # -
  408. 0x1ff6, 0x1ffc, # -
  409. 0x210a, 0x2113, # -
  410. 0x2115, 0x211d, # -
  411. 0x2120, 0x2122, # -
  412. 0x212a, 0x2131, # -
  413. 0x2133, 0x2138, # -
  414. 0x3041, 0x3094, # -
  415. 0x30a1, 0x30fa, # -
  416. 0x3105, 0x312c, # -
  417. 0x3131, 0x318e, # -
  418. 0x3192, 0x319f, # -
  419. 0x3260, 0x327b, # -
  420. 0x328a, 0x32b0, # -
  421. 0x32d0, 0x32fe, # -
  422. 0x3300, 0x3357, # -
  423. 0x3371, 0x3376, # -
  424. 0x337b, 0x3394, # -
  425. 0x3399, 0x339e, # -
  426. 0x33a9, 0x33ad, # -
  427. 0x33b0, 0x33c1, # -
  428. 0x33c3, 0x33c5, # -
  429. 0x33c7, 0x33d7, # -
  430. 0x33d9, 0x33dd, # -
  431. 0x4e00, 0x9fff, # -
  432. 0xac00, 0xd7a3, # -
  433. 0xf900, 0xfb06, # -
  434. 0xfb13, 0xfb17, # -
  435. 0xfb1f, 0xfb28, # -
  436. 0xfb2a, 0xfb36, # -
  437. 0xfb38, 0xfb3c, # -
  438. 0xfb40, 0xfb41, # -
  439. 0xfb43, 0xfb44, # -
  440. 0xfb46, 0xfbb1, # -
  441. 0xfbd3, 0xfd3d, # -
  442. 0xfd50, 0xfd8f, # -
  443. 0xfd92, 0xfdc7, # -
  444. 0xfdf0, 0xfdf9, # -
  445. 0xfe70, 0xfe72, # -
  446. 0xfe76, 0xfefc, # -
  447. 0xff66, 0xff6f, # -
  448. 0xff71, 0xff9d, # -
  449. 0xffa0, 0xffbe, # -
  450. 0xffc2, 0xffc7, # -
  451. 0xffca, 0xffcf, # -
  452. 0xffd2, 0xffd7, # -
  453. 0xffda, 0xffdc] # -
  454. alphaSinglets = [
  455. 0x00aa, #
  456. 0x00b5, #
  457. 0x00ba, #
  458. 0x03da, #
  459. 0x03dc, #
  460. 0x03de, #
  461. 0x03e0, #
  462. 0x06d5, #
  463. 0x09b2, #
  464. 0x0a5e, #
  465. 0x0a8d, #
  466. 0x0ae0, #
  467. 0x0b9c, #
  468. 0x0cde, #
  469. 0x0e4f, #
  470. 0x0e84, #
  471. 0x0e8a, #
  472. 0x0e8d, #
  473. 0x0ea5, #
  474. 0x0ea7, #
  475. 0x0eb0, #
  476. 0x0ebd, #
  477. 0x1fbe, #
  478. 0x207f, #
  479. 0x20a8, #
  480. 0x2102, #
  481. 0x2107, #
  482. 0x2124, #
  483. 0x2126, #
  484. 0x2128, #
  485. 0xfb3e, #
  486. 0xfe74] #
  487. spaceRanges = [
  488. 0x0009, 0x000d, # tab and newline
  489. 0x0020, 0x0020, # space
  490. 0x0085, 0x0085, # next line
  491. 0x00a0, 0x00a0, #
  492. 0x1680, 0x1680, # Ogham space mark
  493. 0x2000, 0x200b, # en dash .. zero-width space
  494. 0x200e, 0x200f, # LTR mark .. RTL mark (pattern whitespace)
  495. 0x2028, 0x2029, # - 0x3000, 0x3000, #
  496. 0x202f, 0x202f, # narrow no-break space
  497. 0x205f, 0x205f, # medium mathematical space
  498. 0x3000, 0x3000, # ideographic space
  499. 0xfeff, 0xfeff] #
  500. toupperRanges = [
  501. 0x0061, 0x007a, 468, # a-z A-Z
  502. 0x00e0, 0x00f6, 468, # - -
  503. 0x00f8, 0x00fe, 468, # - -
  504. 0x0256, 0x0257, 295, # - -
  505. 0x0258, 0x0259, 298, # - -
  506. 0x028a, 0x028b, 283, # - -
  507. 0x03ad, 0x03af, 463, # - -
  508. 0x03b1, 0x03c1, 468, # - -
  509. 0x03c3, 0x03cb, 468, # - -
  510. 0x03cd, 0x03ce, 437, # - -
  511. 0x0430, 0x044f, 468, # - -
  512. 0x0451, 0x045c, 420, # - -
  513. 0x045e, 0x045f, 420, # - -
  514. 0x0561, 0x0586, 452, # - -
  515. 0x1f00, 0x1f07, 508, # - -
  516. 0x1f10, 0x1f15, 508, # - -
  517. 0x1f20, 0x1f27, 508, # - -
  518. 0x1f30, 0x1f37, 508, # - -
  519. 0x1f40, 0x1f45, 508, # - -
  520. 0x1f60, 0x1f67, 508, # - -
  521. 0x1f70, 0x1f71, 574, # - -
  522. 0x1f72, 0x1f75, 586, # - -
  523. 0x1f76, 0x1f77, 600, # - -
  524. 0x1f78, 0x1f79, 628, # - -
  525. 0x1f7a, 0x1f7b, 612, # - -
  526. 0x1f7c, 0x1f7d, 626, # - -
  527. 0x1f80, 0x1f87, 508, # - -
  528. 0x1f90, 0x1f97, 508, # - -
  529. 0x1fa0, 0x1fa7, 508, # - -
  530. 0x1fb0, 0x1fb1, 508, # - -
  531. 0x1fd0, 0x1fd1, 508, # - -
  532. 0x1fe0, 0x1fe1, 508, # - -
  533. 0x2170, 0x217f, 484, # - -
  534. 0x24d0, 0x24e9, 474, # - -
  535. 0xff41, 0xff5a, 468] # - -
  536. toupperSinglets = [
  537. 0x00ff, 621, #
  538. 0x0101, 499, #
  539. 0x0103, 499, #
  540. 0x0105, 499, #
  541. 0x0107, 499, #
  542. 0x0109, 499, #
  543. 0x010b, 499, #
  544. 0x010d, 499, #
  545. 0x010f, 499, #
  546. 0x0111, 499, #
  547. 0x0113, 499, #
  548. 0x0115, 499, #
  549. 0x0117, 499, #
  550. 0x0119, 499, #
  551. 0x011b, 499, #
  552. 0x011d, 499, #
  553. 0x011f, 499, #
  554. 0x0121, 499, #
  555. 0x0123, 499, #
  556. 0x0125, 499, #
  557. 0x0127, 499, #
  558. 0x0129, 499, #
  559. 0x012b, 499, #
  560. 0x012d, 499, #
  561. 0x012f, 499, #
  562. 0x0131, 268, # I
  563. 0x0133, 499, #
  564. 0x0135, 499, #
  565. 0x0137, 499, #
  566. 0x013a, 499, #
  567. 0x013c, 499, #
  568. 0x013e, 499, #
  569. 0x0140, 499, #
  570. 0x0142, 499, #
  571. 0x0144, 499, #
  572. 0x0146, 499, #
  573. 0x0148, 499, #
  574. 0x014b, 499, #
  575. 0x014d, 499, #
  576. 0x014f, 499, #
  577. 0x0151, 499, #
  578. 0x0153, 499, #
  579. 0x0155, 499, #
  580. 0x0157, 499, #
  581. 0x0159, 499, #
  582. 0x015b, 499, #
  583. 0x015d, 499, #
  584. 0x015f, 499, #
  585. 0x0161, 499, #
  586. 0x0163, 499, #
  587. 0x0165, 499, #
  588. 0x0167, 499, #
  589. 0x0169, 499, #
  590. 0x016b, 499, #
  591. 0x016d, 499, #
  592. 0x016f, 499, #
  593. 0x0171, 499, #
  594. 0x0173, 499, #
  595. 0x0175, 499, #
  596. 0x0177, 499, #
  597. 0x017a, 499, #
  598. 0x017c, 499, #
  599. 0x017e, 499, #
  600. 0x017f, 200, # S
  601. 0x0183, 499, #
  602. 0x0185, 499, #
  603. 0x0188, 499, #
  604. 0x018c, 499, #
  605. 0x0192, 499, #
  606. 0x0199, 499, #
  607. 0x01a1, 499, #
  608. 0x01a3, 499, #
  609. 0x01a5, 499, #
  610. 0x01a8, 499, #
  611. 0x01ad, 499, #
  612. 0x01b0, 499, #
  613. 0x01b4, 499, #
  614. 0x01b6, 499, #
  615. 0x01b9, 499, #
  616. 0x01bd, 499, #
  617. 0x01c5, 499, #
  618. 0x01c6, 498, #
  619. 0x01c8, 499, #
  620. 0x01c9, 498, #
  621. 0x01cb, 499, #
  622. 0x01cc, 498, #
  623. 0x01ce, 499, #
  624. 0x01d0, 499, #
  625. 0x01d2, 499, #
  626. 0x01d4, 499, #
  627. 0x01d6, 499, #
  628. 0x01d8, 499, #
  629. 0x01da, 499, #
  630. 0x01dc, 499, #
  631. 0x01df, 499, #
  632. 0x01e1, 499, #
  633. 0x01e3, 499, #
  634. 0x01e5, 499, #
  635. 0x01e7, 499, #
  636. 0x01e9, 499, #
  637. 0x01eb, 499, #
  638. 0x01ed, 499, #
  639. 0x01ef, 499, #
  640. 0x01f2, 499, #
  641. 0x01f3, 498, #
  642. 0x01f5, 499, #
  643. 0x01fb, 499, #
  644. 0x01fd, 499, #
  645. 0x01ff, 499, #
  646. 0x0201, 499, #
  647. 0x0203, 499, #
  648. 0x0205, 499, #
  649. 0x0207, 499, #
  650. 0x0209, 499, #
  651. 0x020b, 499, #
  652. 0x020d, 499, #
  653. 0x020f, 499, #
  654. 0x0211, 499, #
  655. 0x0213, 499, #
  656. 0x0215, 499, #
  657. 0x0217, 499, #
  658. 0x0253, 290, #
  659. 0x0254, 294, #
  660. 0x025b, 297, #
  661. 0x0260, 295, #
  662. 0x0263, 293, #
  663. 0x0268, 291, #
  664. 0x0269, 289, #
  665. 0x026f, 289, #
  666. 0x0272, 287, #
  667. 0x0283, 282, #
  668. 0x0288, 282, #
  669. 0x0292, 281, #
  670. 0x03ac, 462, #
  671. 0x03cc, 436, #
  672. 0x03d0, 438, #
  673. 0x03d1, 443, #
  674. 0x03d5, 453, #
  675. 0x03d6, 446, #
  676. 0x03e3, 499, #
  677. 0x03e5, 499, #
  678. 0x03e7, 499, #
  679. 0x03e9, 499, #
  680. 0x03eb, 499, #
  681. 0x03ed, 499, #
  682. 0x03ef, 499, #
  683. 0x03f0, 414, #
  684. 0x03f1, 420, #
  685. 0x0461, 499, #
  686. 0x0463, 499, #
  687. 0x0465, 499, #
  688. 0x0467, 499, #
  689. 0x0469, 499, #
  690. 0x046b, 499, #
  691. 0x046d, 499, #
  692. 0x046f, 499, #
  693. 0x0471, 499, #
  694. 0x0473, 499, #
  695. 0x0475, 499, #
  696. 0x0477, 499, #
  697. 0x0479, 499, #
  698. 0x047b, 499, #
  699. 0x047d, 499, #
  700. 0x047f, 499, #
  701. 0x0481, 499, #
  702. 0x0491, 499, #
  703. 0x0493, 499, #
  704. 0x0495, 499, #
  705. 0x0497, 499, #
  706. 0x0499, 499, #
  707. 0x049b, 499, #
  708. 0x049d, 499, #
  709. 0x049f, 499, #
  710. 0x04a1, 499, #
  711. 0x04a3, 499, #
  712. 0x04a5, 499, #
  713. 0x04a7, 499, #
  714. 0x04a9, 499, #
  715. 0x04ab, 499, #
  716. 0x04ad, 499, #
  717. 0x04af, 499, #
  718. 0x04b1, 499, #
  719. 0x04b3, 499, #
  720. 0x04b5, 499, #
  721. 0x04b7, 499, #
  722. 0x04b9, 499, #
  723. 0x04bb, 499, #
  724. 0x04bd, 499, #
  725. 0x04bf, 499, #
  726. 0x04c2, 499, #
  727. 0x04c4, 499, #
  728. 0x04c8, 499, #
  729. 0x04cc, 499, #
  730. 0x04d1, 499, #
  731. 0x04d3, 499, #
  732. 0x04d5, 499, #
  733. 0x04d7, 499, #
  734. 0x04d9, 499, #
  735. 0x04db, 499, #
  736. 0x04dd, 499, #
  737. 0x04df, 499, #
  738. 0x04e1, 499, #
  739. 0x04e3, 499, #
  740. 0x04e5, 499, #
  741. 0x04e7, 499, #
  742. 0x04e9, 499, #
  743. 0x04eb, 499, #
  744. 0x04ef, 499, #
  745. 0x04f1, 499, #
  746. 0x04f3, 499, #
  747. 0x04f5, 499, #
  748. 0x04f9, 499, #
  749. 0x1e01, 499, #
  750. 0x1e03, 499, #
  751. 0x1e05, 499, #
  752. 0x1e07, 499, #
  753. 0x1e09, 499, #
  754. 0x1e0b, 499, #
  755. 0x1e0d, 499, #
  756. 0x1e0f, 499, #
  757. 0x1e11, 499, #
  758. 0x1e13, 499, #
  759. 0x1e15, 499, #
  760. 0x1e17, 499, #
  761. 0x1e19, 499, #
  762. 0x1e1b, 499, #
  763. 0x1e1d, 499, #
  764. 0x1e1f, 499, #
  765. 0x1e21, 499, #
  766. 0x1e23, 499, #
  767. 0x1e25, 499, #
  768. 0x1e27, 499, #
  769. 0x1e29, 499, #
  770. 0x1e2b, 499, #
  771. 0x1e2d, 499, #
  772. 0x1e2f, 499, #
  773. 0x1e31, 499, #
  774. 0x1e33, 499, #
  775. 0x1e35, 499, #
  776. 0x1e37, 499, #
  777. 0x1e39, 499, #
  778. 0x1e3b, 499, #
  779. 0x1e3d, 499, #
  780. 0x1e3f, 499, #
  781. 0x1e41, 499, #
  782. 0x1e43, 499, #
  783. 0x1e45, 499, #
  784. 0x1e47, 499, #
  785. 0x1e49, 499, #
  786. 0x1e4b, 499, #
  787. 0x1e4d, 499, #
  788. 0x1e4f, 499, #
  789. 0x1e51, 499, #
  790. 0x1e53, 499, #
  791. 0x1e55, 499, #
  792. 0x1e57, 499, #
  793. 0x1e59, 499, #
  794. 0x1e5b, 499, #
  795. 0x1e5d, 499, #
  796. 0x1e5f, 499, #
  797. 0x1e61, 499, #
  798. 0x1e63, 499, #
  799. 0x1e65, 499, #
  800. 0x1e67, 499, #
  801. 0x1e69, 499, #
  802. 0x1e6b, 499, #
  803. 0x1e6d, 499, #
  804. 0x1e6f, 499, #
  805. 0x1e71, 499, #
  806. 0x1e73, 499, #
  807. 0x1e75, 499, #
  808. 0x1e77, 499, #
  809. 0x1e79, 499, #
  810. 0x1e7b, 499, #
  811. 0x1e7d, 499, #
  812. 0x1e7f, 499, #
  813. 0x1e81, 499, #
  814. 0x1e83, 499, #
  815. 0x1e85, 499, #
  816. 0x1e87, 499, #
  817. 0x1e89, 499, #
  818. 0x1e8b, 499, #
  819. 0x1e8d, 499, #
  820. 0x1e8f, 499, #
  821. 0x1e91, 499, #
  822. 0x1e93, 499, #
  823. 0x1e95, 499, #
  824. 0x1ea1, 499, #
  825. 0x1ea3, 499, #
  826. 0x1ea5, 499, #
  827. 0x1ea7, 499, #
  828. 0x1ea9, 499, #
  829. 0x1eab, 499, #
  830. 0x1ead, 499, #
  831. 0x1eaf, 499, #
  832. 0x1eb1, 499, #
  833. 0x1eb3, 499, #
  834. 0x1eb5, 499, #
  835. 0x1eb7, 499, #
  836. 0x1eb9, 499, #
  837. 0x1ebb, 499, #
  838. 0x1ebd, 499, #
  839. 0x1ebf, 499, #
  840. 0x1ec1, 499, #
  841. 0x1ec3, 499, #
  842. 0x1ec5, 499, #
  843. 0x1ec7, 499, #
  844. 0x1ec9, 499, #
  845. 0x1ecb, 499, #
  846. 0x1ecd, 499, #
  847. 0x1ecf, 499, #
  848. 0x1ed1, 499, #
  849. 0x1ed3, 499, #
  850. 0x1ed5, 499, #
  851. 0x1ed7, 499, #
  852. 0x1ed9, 499, #
  853. 0x1edb, 499, #
  854. 0x1edd, 499, #
  855. 0x1edf, 499, #
  856. 0x1ee1, 499, #
  857. 0x1ee3, 499, #
  858. 0x1ee5, 499, #
  859. 0x1ee7, 499, #
  860. 0x1ee9, 499, #
  861. 0x1eeb, 499, #
  862. 0x1eed, 499, #
  863. 0x1eef, 499, #
  864. 0x1ef1, 499, #
  865. 0x1ef3, 499, #
  866. 0x1ef5, 499, #
  867. 0x1ef7, 499, #
  868. 0x1ef9, 499, #
  869. 0x1f51, 508, #
  870. 0x1f53, 508, #
  871. 0x1f55, 508, #
  872. 0x1f57, 508, #
  873. 0x1fb3, 509, #
  874. 0x1fc3, 509, #
  875. 0x1fe5, 507, #
  876. 0x1ff3, 509] #
  # Lower-casing table for contiguous code point ranges, stored flat as
  # triples: first code point, last code point, offset + 500.
  # `toLower` adds (third field - 500) to a rune inside first..last, and
  # `isUpper` treats membership in a range as "is upper case".
  # Both look rows up with `binarySearch` (stride 3), which bisects on the
  # first field, so rows are expected to stay in ascending order.
  # NOTE(review): the trailing per-row comments look like they once showed the
  # source and target characters and lost their non-ASCII text - confirm.
  877. tolowerRanges = [
  878. 0x0041, 0x005a, 532, # A-Z a-z
  879. 0x00c0, 0x00d6, 532, # - -
  880. 0x00d8, 0x00de, 532, # - -
  881. 0x0189, 0x018a, 705, # - -
  882. 0x018e, 0x018f, 702, # - -
  883. 0x01b1, 0x01b2, 717, # - -
  884. 0x0388, 0x038a, 537, # - -
  885. 0x038e, 0x038f, 563, # - -
  886. 0x0391, 0x03a1, 532, # - -
  887. 0x03a3, 0x03ab, 532, # - -
  888. 0x0401, 0x040c, 580, # - -
  889. 0x040e, 0x040f, 580, # - -
  890. 0x0410, 0x042f, 532, # - -
  891. 0x0531, 0x0556, 548, # - -
  892. 0x10a0, 0x10c5, 548, # - -
  893. 0x1f08, 0x1f0f, 492, # - -
  894. 0x1f18, 0x1f1d, 492, # - -
  895. 0x1f28, 0x1f2f, 492, # - -
  896. 0x1f38, 0x1f3f, 492, # - -
  897. 0x1f48, 0x1f4d, 492, # - -
  898. 0x1f68, 0x1f6f, 492, # - -
  899. 0x1f88, 0x1f8f, 492, # - -
  900. 0x1f98, 0x1f9f, 492, # - -
  901. 0x1fa8, 0x1faf, 492, # - -
  902. 0x1fb8, 0x1fb9, 492, # - -
  903. 0x1fba, 0x1fbb, 426, # - -
  904. 0x1fc8, 0x1fcb, 414, # - -
  905. 0x1fd8, 0x1fd9, 492, # - -
  906. 0x1fda, 0x1fdb, 400, # - -
  907. 0x1fe8, 0x1fe9, 492, # - -
  908. 0x1fea, 0x1feb, 388, # - -
  909. 0x1ff8, 0x1ff9, 372, # - -
  910. 0x1ffa, 0x1ffb, 374, # - -
  911. 0x2160, 0x216f, 516, # - -
  912. 0x24b6, 0x24cf, 526, # - -
  913. 0xff21, 0xff3a, 532] # - -
  # Lower-casing table for isolated code points, stored flat as pairs:
  # code point, offset + 500.
  # `toLower` adds (second field - 500) to a rune that equals the first field,
  # and `isUpper` treats an exact match as "is upper case".
  # Both look rows up with `binarySearch` (stride 2), which bisects on the
  # first field, so rows are expected to stay in ascending order.
  914. tolowerSinglets = [
  915. 0x0100, 501, #
  916. 0x0102, 501, #
  917. 0x0104, 501, #
  918. 0x0106, 501, #
  919. 0x0108, 501, #
  920. 0x010a, 501, #
  921. 0x010c, 501, #
  922. 0x010e, 501, #
  923. 0x0110, 501, #
  924. 0x0112, 501, #
  925. 0x0114, 501, #
  926. 0x0116, 501, #
  927. 0x0118, 501, #
  928. 0x011a, 501, #
  929. 0x011c, 501, #
  930. 0x011e, 501, #
  931. 0x0120, 501, #
  932. 0x0122, 501, #
  933. 0x0124, 501, #
  934. 0x0126, 501, #
  935. 0x0128, 501, #
  936. 0x012a, 501, #
  937. 0x012c, 501, #
  938. 0x012e, 501, #
  939. 0x0130, 301, # i
  940. 0x0132, 501, #
  941. 0x0134, 501, #
  942. 0x0136, 501, #
  943. 0x0139, 501, #
  944. 0x013b, 501, #
  945. 0x013d, 501, #
  946. 0x013f, 501, #
  947. 0x0141, 501, #
  948. 0x0143, 501, #
  949. 0x0145, 501, #
  950. 0x0147, 501, #
  951. 0x014a, 501, #
  952. 0x014c, 501, #
  953. 0x014e, 501, #
  954. 0x0150, 501, #
  955. 0x0152, 501, #
  956. 0x0154, 501, #
  957. 0x0156, 501, #
  958. 0x0158, 501, #
  959. 0x015a, 501, #
  960. 0x015c, 501, #
  961. 0x015e, 501, #
  962. 0x0160, 501, #
  963. 0x0162, 501, #
  964. 0x0164, 501, #
  965. 0x0166, 501, #
  966. 0x0168, 501, #
  967. 0x016a, 501, #
  968. 0x016c, 501, #
  969. 0x016e, 501, #
  970. 0x0170, 501, #
  971. 0x0172, 501, #
  972. 0x0174, 501, #
  973. 0x0176, 501, #
  974. 0x0178, 379, #
  975. 0x0179, 501, #
  976. 0x017b, 501, #
  977. 0x017d, 501, #
  978. 0x0181, 710, #
  979. 0x0182, 501, #
  980. 0x0184, 501, #
  981. 0x0186, 706, #
  982. 0x0187, 501, #
  983. 0x018b, 501, #
  984. 0x0190, 703, #
  985. 0x0191, 501, #
  986. 0x0193, 705, #
  987. 0x0194, 707, #
  988. 0x0196, 711, #
  989. 0x0197, 709, #
  990. 0x0198, 501, #
  991. 0x019c, 711, #
  992. 0x019d, 713, #
  993. 0x01a0, 501, #
  994. 0x01a2, 501, #
  995. 0x01a4, 501, #
  996. 0x01a7, 501, #
  997. 0x01a9, 718, #
  998. 0x01ac, 501, #
  999. 0x01ae, 718, #
  1000. 0x01af, 501, #
  1001. 0x01b3, 501, #
  1002. 0x01b5, 501, #
  1003. 0x01b7, 719, #
  1004. 0x01b8, 501, #
  1005. 0x01bc, 501, #
  1006. 0x01c4, 502, #
  1007. 0x01c5, 501, #
  1008. 0x01c7, 502, #
  1009. 0x01c8, 501, #
  1010. 0x01ca, 502, #
  1011. 0x01cb, 501, #
  1012. 0x01cd, 501, #
  1013. 0x01cf, 501, #
  1014. 0x01d1, 501, #
  1015. 0x01d3, 501, #
  1016. 0x01d5, 501, #
  1017. 0x01d7, 501, #
  1018. 0x01d9, 501, #
  1019. 0x01db, 501, #
  1020. 0x01de, 501, #
  1021. 0x01e0, 501, #
  1022. 0x01e2, 501, #
  1023. 0x01e4, 501, #
  1024. 0x01e6, 501, #
  1025. 0x01e8, 501, #
  1026. 0x01ea, 501, #
  1027. 0x01ec, 501, #
  1028. 0x01ee, 501, #
  1029. 0x01f1, 502, #
  1030. 0x01f2, 501, #
  1031. 0x01f4, 501, #
  1032. 0x01fa, 501, #
  1033. 0x01fc, 501, #
  1034. 0x01fe, 501, #
  1035. 0x0200, 501, #
  1036. 0x0202, 501, #
  1037. 0x0204, 501, #
  1038. 0x0206, 501, #
  1039. 0x0208, 501, #
  1040. 0x020a, 501, #
  1041. 0x020c, 501, #
  1042. 0x020e, 501, #
  1043. 0x0210, 501, #
  1044. 0x0212, 501, #
  1045. 0x0214, 501, #
  1046. 0x0216, 501, #
  1047. 0x0386, 538, #
  1048. 0x038c, 564, #
  1049. 0x03e2, 501, #
  1050. 0x03e4, 501, #
  1051. 0x03e6, 501, #
  1052. 0x03e8, 501, #
  1053. 0x03ea, 501, #
  1054. 0x03ec, 501, #
  1055. 0x03ee, 501, #
  1056. 0x0460, 501, #
  1057. 0x0462, 501, #
  1058. 0x0464, 501, #
  1059. 0x0466, 501, #
  1060. 0x0468, 501, #
  1061. 0x046a, 501, #
  1062. 0x046c, 501, #
  1063. 0x046e, 501, #
  1064. 0x0470, 501, #
  1065. 0x0472, 501, #
  1066. 0x0474, 501, #
  1067. 0x0476, 501, #
  1068. 0x0478, 501, #
  1069. 0x047a, 501, #
  1070. 0x047c, 501, #
  1071. 0x047e, 501, #
  1072. 0x0480, 501, #
  1073. 0x0490, 501, #
  1074. 0x0492, 501, #
  1075. 0x0494, 501, #
  1076. 0x0496, 501, #
  1077. 0x0498, 501, #
  1078. 0x049a, 501, #
  1079. 0x049c, 501, #
  1080. 0x049e, 501, #
  1081. 0x04a0, 501, #
  1082. 0x04a2, 501, #
  1083. 0x04a4, 501, #
  1084. 0x04a6, 501, #
  1085. 0x04a8, 501, #
  1086. 0x04aa, 501, #
  1087. 0x04ac, 501, #
  1088. 0x04ae, 501, #
  1089. 0x04b0, 501, #
  1090. 0x04b2, 501, #
  1091. 0x04b4, 501, #
  1092. 0x04b6, 501, #
  1093. 0x04b8, 501, #
  1094. 0x04ba, 501, #
  1095. 0x04bc, 501, #
  1096. 0x04be, 501, #
  1097. 0x04c1, 501, #
  1098. 0x04c3, 501, #
  1099. 0x04c7, 501, #
  1100. 0x04cb, 501, #
  1101. 0x04d0, 501, #
  1102. 0x04d2, 501, #
  1103. 0x04d4, 501, #
  1104. 0x04d6, 501, #
  1105. 0x04d8, 501, #
  1106. 0x04da, 501, #
  1107. 0x04dc, 501, #
  1108. 0x04de, 501, #
  1109. 0x04e0, 501, #
  1110. 0x04e2, 501, #
  1111. 0x04e4, 501, #
  1112. 0x04e6, 501, #
  1113. 0x04e8, 501, #
  1114. 0x04ea, 501, #
  1115. 0x04ee, 501, #
  1116. 0x04f0, 501, #
  1117. 0x04f2, 501, #
  1118. 0x04f4, 501, #
  1119. 0x04f8, 501, #
  1120. 0x1e00, 501, #
  1121. 0x1e02, 501, #
  1122. 0x1e04, 501, #
  1123. 0x1e06, 501, #
  1124. 0x1e08, 501, #
  1125. 0x1e0a, 501, #
  1126. 0x1e0c, 501, #
  1127. 0x1e0e, 501, #
  1128. 0x1e10, 501, #
  1129. 0x1e12, 501, #
  1130. 0x1e14, 501, #
  1131. 0x1e16, 501, #
  1132. 0x1e18, 501, #
  1133. 0x1e1a, 501, #
  1134. 0x1e1c, 501, #
  1135. 0x1e1e, 501, #
  1136. 0x1e20, 501, #
  1137. 0x1e22, 501, #
  1138. 0x1e24, 501, #
  1139. 0x1e26, 501, #
  1140. 0x1e28, 501, #
  1141. 0x1e2a, 501, #
  1142. 0x1e2c, 501, #
  1143. 0x1e2e, 501, #
  1144. 0x1e30, 501, #
  1145. 0x1e32, 501, #
  1146. 0x1e34, 501, #
  1147. 0x1e36, 501, #
  1148. 0x1e38, 501, #
  1149. 0x1e3a, 501, #
  1150. 0x1e3c, 501, #
  1151. 0x1e3e, 501, #
  1152. 0x1e40, 501, #
  1153. 0x1e42, 501, #
  1154. 0x1e44, 501, #
  1155. 0x1e46, 501, #
  1156. 0x1e48, 501, #
  1157. 0x1e4a, 501, #
  1158. 0x1e4c, 501, #
  1159. 0x1e4e, 501, #
  1160. 0x1e50, 501, #
  1161. 0x1e52, 501, #
  1162. 0x1e54, 501, #
  1163. 0x1e56, 501, #
  1164. 0x1e58, 501, #
  1165. 0x1e5a, 501, #
  1166. 0x1e5c, 501, #
  1167. 0x1e5e, 501, #
  1168. 0x1e60, 501, #
  1169. 0x1e62, 501, #
  1170. 0x1e64, 501, #
  1171. 0x1e66, 501, #
  1172. 0x1e68, 501, #
  1173. 0x1e6a, 501, #
  1174. 0x1e6c, 501, #
  1175. 0x1e6e, 501, #
  1176. 0x1e70, 501, #
  1177. 0x1e72, 501, #
  1178. 0x1e74, 501, #
  1179. 0x1e76, 501, #
  1180. 0x1e78, 501, #
  1181. 0x1e7a, 501, #
  1182. 0x1e7c, 501, #
  1183. 0x1e7e, 501, #
  1184. 0x1e80, 501, #
  1185. 0x1e82, 501, #
  1186. 0x1e84, 501, #
  1187. 0x1e86, 501, #
  1188. 0x1e88, 501, #
  1189. 0x1e8a, 501, #
  1190. 0x1e8c, 501, #
  1191. 0x1e8e, 501, #
  1192. 0x1e90, 501, #
  1193. 0x1e92, 501, #
  1194. 0x1e94, 501, #
  1195. 0x1ea0, 501, #
  1196. 0x1ea2, 501, #
  1197. 0x1ea4, 501, #
  1198. 0x1ea6, 501, #
  1199. 0x1ea8, 501, #
  1200. 0x1eaa, 501, #
  1201. 0x1eac, 501, #
  1202. 0x1eae, 501, #
  1203. 0x1eb0, 501, #
  1204. 0x1eb2, 501, #
  1205. 0x1eb4, 501, #
  1206. 0x1eb6, 501, #
  1207. 0x1eb8, 501, #
  1208. 0x1eba, 501, #
  1209. 0x1ebc, 501, #
  1210. 0x1ebe, 501, #
  1211. 0x1ec0, 501, #
  1212. 0x1ec2, 501, #
  1213. 0x1ec4, 501, #
  1214. 0x1ec6, 501, #
  1215. 0x1ec8, 501, #
  1216. 0x1eca, 501, #
  1217. 0x1ecc, 501, #
  1218. 0x1ece, 501, #
  1219. 0x1ed0, 501, #
  1220. 0x1ed2, 501, #
  1221. 0x1ed4, 501, #
  1222. 0x1ed6, 501, #
  1223. 0x1ed8, 501, #
  1224. 0x1eda, 501, #
  1225. 0x1edc, 501, #
  1226. 0x1ede, 501, #
  1227. 0x1ee0, 501, #
  1228. 0x1ee2, 501, #
  1229. 0x1ee4, 501, #
  1230. 0x1ee6, 501, #
  1231. 0x1ee8, 501, #
  1232. 0x1eea, 501, #
  1233. 0x1eec, 501, #
  1234. 0x1eee, 501, #
  1235. 0x1ef0, 501, #
  1236. 0x1ef2, 501, #
  1237. 0x1ef4, 501, #
  1238. 0x1ef6, 501, #
  1239. 0x1ef8, 501, #
  1240. 0x1f59, 492, #
  1241. 0x1f5b, 492, #
  1242. 0x1f5d, 492, #
  1243. 0x1f5f, 492, #
  1244. 0x1fbc, 491, #
  1245. 0x1fcc, 491, #
  1246. 0x1fec, 493, #
  1247. 0x1ffc, 491] #
  # Title-casing table for isolated code points, stored flat as pairs:
  # code point, offset + 500.
  # `toTitle` adds (second field - 500) to a rune that equals the first field;
  # runes without a row are returned unchanged by `toTitle`.
  # Looked up with `binarySearch` (stride 2), which bisects on the first
  # field, so rows are expected to stay in ascending order.
  1248. toTitleSinglets = [
  1249. 0x01c4, 501, #
  1250. 0x01c6, 499, #
  1251. 0x01c7, 501, #
  1252. 0x01c9, 499, #
  1253. 0x01ca, 501, #
  1254. 0x01cc, 499, #
  1255. 0x01f1, 501, #
  1256. 0x01f3, 499] #
  proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
    ## Bisects the flat table `tab`, viewed as `len` records of `stride` ints
    ## each, comparing `c` against the first int of a record.
    ##
    ## Returns the index in `tab` of the record the bisection settles on when
    ## that record's first int is ``<= c``; returns -1 when `len` is 0 or `c`
    ## is smaller than that record's first int. The bisection presumes the
    ## records are ordered ascending by their first int.
    var
      count = len   # records still under consideration
      base = 0      # index in `tab` of the first such record
    while count > 1:
      let half = count div 2
      let probe = base + half * stride
      if c < tab[probe]:
        # the answer lies strictly before `probe`: keep the lower part
        count = half
      else:
        # `probe` is still a candidate: keep it and everything after it
        base = probe
        count -= half
    result = -1
    if count != 0 and c >= tab[base]:
      result = base
  proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
    ## Maps ``c`` to its lower-case form. A rune that has no entry in the
    ## lower-casing tables is returned unchanged.
    ## If possible, prefer ``toLower`` over ``toUpper``.
    let code = RuneImpl(c)
    result = c
    let rangeAt = binarySearch(code, tolowerRanges, len(tolowerRanges) div 3, 3)
    if rangeAt >= 0 and tolowerRanges[rangeAt] <= code and
        code <= tolowerRanges[rangeAt+1]:
      # third field of a range record is the offset, biased by 500
      result = Rune(code + tolowerRanges[rangeAt+2] - 500)
    else:
      let singleAt = binarySearch(code, tolowerSinglets,
                                  len(tolowerSinglets) div 2, 2)
      if singleAt >= 0 and code == tolowerSinglets[singleAt]:
        # second field of a singlet record is the offset, biased by 500
        result = Rune(code + tolowerSinglets[singleAt+1] - 500)
  proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
    ## Maps ``c`` to its upper-case form. A rune that has no entry in the
    ## upper-casing tables is returned unchanged.
    ## If possible, prefer ``toLower`` over ``toUpper``.
    let code = RuneImpl(c)
    result = c
    let rangeAt = binarySearch(code, toupperRanges, len(toupperRanges) div 3, 3)
    if rangeAt >= 0 and toupperRanges[rangeAt] <= code and
        code <= toupperRanges[rangeAt+1]:
      # third field of a range record is the offset, biased by 500
      result = Rune(code + toupperRanges[rangeAt+2] - 500)
    else:
      let singleAt = binarySearch(code, toupperSinglets,
                                  len(toupperSinglets) div 2, 2)
      if singleAt >= 0 and code == toupperSinglets[singleAt]:
        # second field of a singlet record is the offset, biased by 500
        result = Rune(code + toupperSinglets[singleAt+1] - 500)
  proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
    ## Maps ``c`` to its title-case form. Only runes listed in the title-case
    ## singlet table are changed; every other rune is returned as is.
    let code = RuneImpl(c)
    let at = binarySearch(code, toTitleSinglets, len(toTitleSinglets) div 2, 2)
    if at >= 0 and code == toTitleSinglets[at]:
      # second field of a record is the offset, biased by 500
      result = Rune(code + toTitleSinglets[at+1] - 500)
    else:
      result = c
  proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is a lower case Unicode character.
    ## If possible, prefer ``isLower`` over ``isUpper``.
    let code = RuneImpl(c)
    # The to-upper tables are used on purpose: a rune is reported as lower
    # case exactly when one of those tables has an entry for it.
    let rangeAt = binarySearch(code, toupperRanges, len(toupperRanges) div 3, 3)
    if rangeAt >= 0 and toupperRanges[rangeAt] <= code and
        code <= toupperRanges[rangeAt+1]:
      result = true
    else:
      let singleAt = binarySearch(code, toupperSinglets,
                                  len(toupperSinglets) div 2, 2)
      result = singleAt >= 0 and code == toupperSinglets[singleAt]
  proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is an upper case Unicode character.
    ## If possible, prefer ``isLower`` over ``isUpper``.
    let code = RuneImpl(c)
    # The to-lower tables are used on purpose: a rune is reported as upper
    # case exactly when one of those tables has an entry for it.
    let rangeAt = binarySearch(code, tolowerRanges, len(tolowerRanges) div 3, 3)
    if rangeAt >= 0 and tolowerRanges[rangeAt] <= code and
        code <= tolowerRanges[rangeAt+1]:
      result = true
    else:
      let singleAt = binarySearch(code, tolowerSinglets,
                                  len(tolowerSinglets) div 2, 2)
      result = singleAt >= 0 and code == tolowerSinglets[singleAt]
  proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is an *alpha* Unicode character (i.e., a letter).
    ##
    ## Cased letters are recognised through ``isUpper``/``isLower``; the
    ## remaining letters are looked up in the alpha range and singlet tables.
    result = isUpper(c) or isLower(c)
    if not result:
      let code = RuneImpl(c)
      let rangeAt = binarySearch(code, alphaRanges, len(alphaRanges) div 2, 2)
      if rangeAt >= 0 and alphaRanges[rangeAt] <= code and
          code <= alphaRanges[rangeAt+1]:
        result = true
      else:
        # singlets are stored one int per record, hence stride 1
        let singleAt = binarySearch(code, alphaSinglets, len(alphaSinglets), 1)
        result = singleAt >= 0 and code == alphaSinglets[singleAt]
  proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` is a Unicode titlecase character, i.e. a rune
    ## that both ``isUpper`` and ``isLower`` accept.
    result = isUpper(c) and isLower(c)
  proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` falls inside one of the ranges of the Unicode
    ## whitespace table.
    let code = RuneImpl(c)
    let at = binarySearch(code, spaceRanges, len(spaceRanges) div 2, 2)
    result = at >= 0 and spaceRanges[at] <= code and code <= spaceRanges[at+1]
  proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
    ## Returns true iff ``c`` lies in one of the five code point ranges this
    ## module treats as Unicode combining characters.
    case RuneImpl(c)
    of 0x0300..0x036f,
       0x1ab0..0x1aff,
       0x1dc0..0x1dff,
       0x20d0..0x20ff,
       0xfe20..0xfe2f:
      result = true
    else:
      # everything below 0x0300 (all of ASCII included) ends up here
      result = false
  template runeCheck(s, runeProc) =
    ## Shared body of the string predicates (isLower, isUpper, ...): sets
    ## `result` to true iff `s` is non-empty and `runeProc` holds for every
    ## rune in it. Scanning stops at the first rune that fails.
    result = len(s) > 0
    var
      pos = 0
      current: Rune
    while result and pos < len(s):
      fastRuneAt(s, pos, current, doInc=true)
      # `result` is known to be true here, so the verdict is just the predicate
      result = runeProc(current)
  proc isUpper*(s: string): bool {.noSideEffect, procvar,
                                   rtl, extern: "nuc$1Str".} =
    ## Returns true iff `s` is non-empty and consists solely of upper case
    ## unicode characters.
    runeCheck(s, isUpper)
  proc isLower*(s: string): bool {.noSideEffect, procvar,
                                   rtl, extern: "nuc$1Str".} =
    ## Returns true iff `s` is non-empty and consists solely of lower case
    ## unicode characters.
    runeCheck(s, isLower)
  proc isAlpha*(s: string): bool {.noSideEffect, procvar,
                                   rtl, extern: "nuc$1Str".} =
    ## Returns true iff `s` is non-empty and consists solely of alphabetic
    ## unicode characters.
    runeCheck(s, isAlpha)
  proc isSpace*(s: string): bool {.noSideEffect, procvar,
                                   rtl, extern: "nuc$1Str".} =
    ## Returns true iff `s` is non-empty and consists solely of whitespace
    ## unicode characters.
    runeCheck(s, isWhiteSpace)
  template convertRune(s, runeProc) =
    ## Sets `result` to a copy of `s` in which every rune has been passed
    ## through `runeProc`.
    ##
    ## The converted runes are appended one after another instead of being
    ## written back at the byte offset they were read from. A mapping may
    ## change the UTF-8 length of a rune (the lower-casing table maps U+0130,
    ## two bytes, to 'i', one byte); writing at the source offset would then
    ## leave a gap or clobber the following rune.
    result = newStringOfCap(len(s))
    var
      i = 0
      rune: Rune
    while i < len(s):
      fastRuneAt(s, i, rune, doInc=true)
      result.add($runeProc(rune))
  proc toUpper*(s: string): string {.noSideEffect, procvar,
                                     rtl, extern: "nuc$1Str".} =
    ## Returns a copy of `s` in which each unicode character has been
    ## converted to upper case.
    convertRune(s, toUpper)
  proc toLower*(s: string): string {.noSideEffect, procvar,
                                     rtl, extern: "nuc$1Str".} =
    ## Returns a copy of `s` in which each unicode character has been
    ## converted to lower case.
    convertRune(s, toLower)
  proc swapCase*(s: string): string {.noSideEffect, procvar,
                                      rtl, extern: "nuc$1".} =
    ## Swaps the case of unicode characters in `s`
    ##
    ## Returns a new string such that the cases of all unicode characters
    ## are swapped if possible. Characters that are neither upper nor lower
    ## case are copied unchanged.
    var
      i = 0
      rune: Rune
    result = newStringOfCap(len(s))
    while i < len(s):
      fastRuneAt(s, i, rune)
      if rune.isUpper():
        rune = rune.toLower()
      elif rune.isLower():
        rune = rune.toUpper()
      # Append rather than write at the source byte offset: the swapped rune
      # may need a different number of UTF-8 bytes than the one it replaces
      # (e.g. U+0130, two bytes, lower-cases to 'i', one byte).
      result.add($rune)
  proc capitalize*(s: string): string {.noSideEffect, procvar,
                                        rtl, extern: "nuc$1".} =
    ## Returns `s` with its first unicode character converted to upper case.
    ## An empty string is returned as is.
    result = s
    if len(s) > 0:
      var
        head: Rune
        restStart = 0
      # decode the first rune; `restStart` ends up just past it
      fastRuneAt(s, restStart, head, doInc=true)
      result = $toUpper(head) & substr(s, restStart)
  proc translate*(s: string, replacements: proc(key: string): string): string {.
    rtl, extern: "nuc$1".} =
    ## Translates words in a string using the `replacements` proc to substitute
    ## words inside `s` with their replacements
    ##
    ## `replacements` is any proc that takes a word and returns
    ## a new word to fill its place. Words are maximal runs of
    ## non-whitespace runes; whitespace runes are copied through unchanged.

    # Reserve room for a result as long as the input. A shorter result needs
    # no further allocation; a longer one may need just a single regrowth.
    result = newStringOfCap(s.len)
    var
      index = 0
      wordStart = 0
      inWord = false
      rune: Rune

    while index < len(s):
      let runeStart = index
      fastRuneAt(s, index, rune)
      if rune.isWhiteSpace():
        if inWord:
          # whitespace closes the current word: emit its replacement first
          result.add(replacements(s[wordStart ..< runeStart]))
          inWord = false
        result.add($rune)
      elif not inWord:
        # first non-space rune after whitespace: remember where the word began
        inWord = true
        wordStart = runeStart

    if inWord and wordStart < len(s):
      # the input ended inside a word: translate that trailing word too
      result.add(replacements(s[wordStart .. ^1]))
  proc title*(s: string): string {.noSideEffect, procvar,
                                   rtl, extern: "nuc$1".} =
    ## Converts `s` to a unicode title.
    ##
    ## Returns a new string such that the first character
    ## in each word inside `s` is capitalized. Words are separated by
    ## unicode whitespace; all other characters are copied unchanged.
    var
      i = 0
      rune: Rune
    result = newStringOfCap(len(s))
    var firstRune = true
    while i < len(s):
      fastRuneAt(s, i, rune)
      if not rune.isWhiteSpace() and firstRune:
        rune = rune.toUpper()
        firstRune = false
      elif rune.isWhiteSpace():
        firstRune = true
      # Append rather than write at the source byte offset: an upper-cased
      # rune may need a different number of UTF-8 bytes than the original,
      # which would otherwise leave a gap or overwrite the next rune.
      result.add($rune)
  proc isTitle*(s: string): bool {.noSideEffect, procvar,
                                   rtl, extern: "nuc$1Str".} =
    ## Checks whether or not `s` is a unicode title.
    ##
    ## Returns true if `s` holds at least one character and the first
    ## character of every whitespace-separated word is upper case.
    result = s.len() > 0
    var
      pos = 0
      current: Rune
      atWordStart = true
    while result and pos < len(s):
      fastRuneAt(s, pos, current, doInc=true)
      if current.isWhiteSpace():
        # the next non-space rune starts a new word
        atWordStart = true
      elif atWordStart:
        result = current.isUpper()
        atWordStart = false
  iterator runes*(s: string): Rune =
    ## Yields the unicode characters of ``s`` one by one, front to back,
    ## as runes.
    var
      pos = 0
      current: Rune
    while pos < len(s):
      # decodes the rune at `pos` and advances `pos` past it
      fastRuneAt(s, pos, current, true)
      yield current
  iterator utf8*(s: string): string =
    ## Yields the unicode characters of ``s`` one by one, front to back,
    ## each as the substring holding its UTF-8 bytes.
    var start = 0
    while start < s.len:
      let width = runeLenAt(s, start)
      let stop = start + width - 1   # index of the character's last byte
      yield s[start .. stop]
      start = stop + 1
  proc toRunes*(s: string): seq[Rune] =
    ## Collects the runes of ``s``, in order, into a new sequence.
    result = @[]
    for decoded in runes(s):
      add(result, decoded)
  proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} =
    ## Compares two UTF-8 strings and ignores the case. Returns:
    ##
    ## | 0 iff a == b
    ## | < 0 iff a < b
    ## | > 0 iff a > b
    ##
    ## Runes are compared pairwise after lower-casing. If one string is a
    ## case-insensitive prefix of the other, the shorter one sorts first.
    var i = 0
    var j = 0
    var ar, br: Rune
    while i < a.len and j < b.len:
      # slow path:
      fastRuneAt(a, i, ar)
      fastRuneAt(b, j, br)
      result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br))
      if result != 0: return
    # Every compared pair matched, so only the unconsumed tails decide.
    # Comparing the total byte lengths instead would be wrong: two runes that
    # are equal ignoring case can have different UTF-8 lengths (U+0130 takes
    # two bytes, its lower-case form 'i' one), which made equal strings
    # compare as different.
    result = (a.len - i) - (b.len - j)
  proc reversed*(s: string): string =
    ## Returns the reverse of ``s``, interpreting it as Unicode characters.
    ## Unicode combining characters are correctly interpreted as well:
    ##
    ## .. code-block:: nim
    ##
    ##   assert reversed("Reverse this!") == "!siht esreveR"
    ##   assert reversed("先秦兩漢") == "漢兩秦先"
    ##   assert reversed("as⃝df̅") == "f̅ds⃝a"
    ##   assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
    var
      i = 0
      lastI = 0
      newPos = len(s) - 1   # next byte of `result` to fill, moving backwards
      # Index of the last source byte already copied. It starts at -1
      # ("nothing copied yet") so that byte 0 belongs to the first block even
      # when the string opens with a combining rune; starting at 0 silently
      # dropped that first byte in this case.
      blockPos = -1
      r: Rune

    template reverseUntil(pos) =
      # Copies the pending block s[blockPos+1 .. pos-1] to the current end of
      # the unfilled part of `result`, keeping the bytes inside the block in
      # their original order.
      var j = pos - 1
      while j > blockPos:
        result[newPos] = s[j]
        dec j
        dec newPos
      blockPos = pos - 1

    result = newString(len(s))

    while i < len(s):
      lastI = i
      fastRuneAt(s, i, r, true)
      # A non-combining rune starts a new block, so flush the block before it.
      if not isCombining(r):
        reverseUntil(lastI)

    # flush the final block
    reverseUntil(len(s))
  proc graphemeLen*(s: string; i: Natural): Natural =
    ## The number of bytes belonging to 's[i]' including following combining
    ## characters. Yields 0 when `i` is not a valid index into `s`.
    var cursor = i.int
    if cursor >= s.len:
      return
    var baseRune, follower: Rune
    fastRuneAt(s, cursor, baseRune, true)
    result = cursor - i
    while cursor < s.len:
      fastRuneAt(s, cursor, follower, true)
      if isCombining(follower):
        # the combining rune belongs to this grapheme: extend the length
        result = cursor - i
      else:
        break
  proc lastRune*(s: string; last: int): (Rune, int) =
    ## Decodes the last rune in 's[0..last]'. Returns the rune and its length
    ## in bytes.
    ##
    ## `last` must be a valid index into `s`.
    if s[last] <= chr(127):
      result = (Rune(s[last]), 1)
    else:
      var L = 0
      # Walk back over continuation bytes (bit pattern 10xxxxxx) to the lead
      # byte. The scan stops at index 0 at the latest: with `>= 0` a slice made
      # up only of continuation bytes drove `last-L` to -1 and the decode
      # below then read `s[-1]`.
      while last-L > 0 and ord(s[last-L]) shr 6 == 0b10: inc(L)
      var r: Rune
      fastRuneAt(s, last-L, r, false)
      result = (r, L+1)
  when isMainModule:
    # Self-test, run only when this module is compiled as the main program.

    # --- rune sequence round trip ---
    let
      someString = "öÑ"
      someRunes = @[runeAt(someString, 0), runeAt(someString, 2)]
      compared = (someString == $someRunes)
    doAssert compared == true

    # --- translate ---
    proc test_replacements(word: string): string =
      case word
      of "two":
        return "2"
      of "foo":
        return "BAR"
      of "βeta":
        return "beta"
      of "alpha":
        return "αlpha"
      else:
        return "12345"

    doAssert translate("two not alpha foo βeta", test_replacements) == "2 12345 αlpha BAR beta"
    doAssert translate(" two not foo βeta ", test_replacements) == " 2 12345 BAR beta "

    # --- title / capitalize / isTitle ---
    doAssert title("foo bar") == "Foo Bar"
    doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
    doAssert title("") == ""

    doAssert capitalize("βeta") == "Βeta"
    doAssert capitalize("foo") == "Foo"
    doAssert capitalize("") == ""

    doAssert isTitle("Foo")
    doAssert(not isTitle("Foo bar"))
    doAssert(not isTitle("αlpha Βeta"))
    doAssert(isTitle("Αlpha Βeta Γamma"))
    doAssert(not isTitle("fFoo"))

    # --- swapCase ---
    doAssert swapCase("FooBar") == "fOObAR"
    doAssert swapCase(" ") == " "
    doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
    doAssert swapCase("a✓B") == "A✓b"
    doAssert swapCase("") == ""

    # --- string predicates ---
    doAssert isAlpha("r")
    doAssert isAlpha("α")
    doAssert(not isAlpha("$"))
    doAssert(not isAlpha(""))

    doAssert isAlpha("Βeta")
    doAssert isAlpha("Args")
    doAssert(not isAlpha("$Foo✓"))

    doAssert isSpace("\t")
    doAssert isSpace("\l")
    doAssert(not isSpace("Β"))
    doAssert(not isSpace("Βeta"))

    doAssert isSpace("\t\l \v\r\f")
    doAssert isSpace(" ")
    doAssert(not isSpace(""))
    doAssert(not isSpace("ΑΓc \td"))

    doAssert isLower("a")
    doAssert isLower("γ")
    doAssert(not isLower("Γ"))
    doAssert(not isLower("4"))
    doAssert(not isLower(""))

    doAssert isLower("abcdγ")
    doAssert(not isLower("abCDΓ"))
    doAssert(not isLower("33aaΓ"))

    doAssert isUpper("Γ")
    doAssert(not isUpper("b"))
    doAssert(not isUpper("α"))
    doAssert(not isUpper("✓"))
    doAssert(not isUpper(""))

    doAssert isUpper("ΑΒΓ")
    doAssert(not isUpper("AAccβ"))
    doAssert(not isUpper("A#$β"))

    # --- string case conversion ---
    doAssert toUpper("Γ") == "Γ"
    doAssert toUpper("b") == "B"
    doAssert toUpper("α") == "Α"
    doAssert toUpper("✓") == "✓"
    doAssert toUpper("") == ""

    doAssert toUpper("ΑΒΓ") == "ΑΒΓ"
    doAssert toUpper("AAccβ") == "AACCΒ"
    doAssert toUpper("A✓$β") == "A✓$Β"

    doAssert toLower("a") == "a"
    doAssert toLower("γ") == "γ"
    doAssert toLower("Γ") == "γ"
    doAssert toLower("4") == "4"
    doAssert toLower("") == ""

    doAssert toLower("abcdγ") == "abcdγ"
    doAssert toLower("abCDΓ") == "abcdγ"
    doAssert toLower("33aaΓ") == "33aaγ"

    # --- reversed / rune helpers ---
    doAssert reversed("Reverse this!") == "!siht esreveR"
    doAssert reversed("先秦兩漢") == "漢兩秦先"
    doAssert reversed("as⃝df̅") == "f̅ds⃝a"
    doAssert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
    doAssert len(toRunes("as⃝df̅")) == runeLen("as⃝df̅")
    const test = "as⃝"
    doAssert lastRune(test, test.len-1)[1] == 3
    doAssert graphemeLen("è", 0) == 2

    # test for rune positioning and runeSubStr()
    # The literal needs TWO separators after "Hänsel": the assertions below
    # expect 18 runes, ":" at rune 10, "1" at rune 12 and the final rune at
    # byte offset 20. With a single space every position-based check fails.
    # NOTE(review): two plain spaces are assumed for the separators - confirm.
    let s = "Hänsel  ««: 10,00€"

    var t = ""
    for c in s.utf8:
      t.add c

    doAssert(s == t)

    doAssert(runeReverseOffset(s, 1) == (20, 18))
    doAssert(runeReverseOffset(s, 19) == (-1, 18))

    doAssert(runeStrAtPos(s, 0) == "H")
    doAssert(runeSubStr(s, 0, 1) == "H")
    doAssert(runeStrAtPos(s, 10) == ":")
    doAssert(runeSubStr(s, 10, 1) == ":")
    doAssert(runeStrAtPos(s, 9) == "«")
    doAssert(runeSubStr(s, 9, 1) == "«")
    doAssert(runeStrAtPos(s, 17) == "€")
    doAssert(runeSubStr(s, 17, 1) == "€")
    # echo runeStrAtPos(s, 18) # index error

    doAssert(runeSubStr(s, 0) == "Hänsel  ««: 10,00€")
    doAssert(runeSubStr(s, -18) == "Hänsel  ««: 10,00€")
    doAssert(runeSubStr(s, 10) == ": 10,00€")
    doAssert(runeSubStr(s, 18) == "")
    doAssert(runeSubStr(s, 0, 10) == "Hänsel  ««")

    doAssert(runeSubStr(s, 12) == "10,00€")
    doAssert(runeSubStr(s, -6) == "10,00€")

    doAssert(runeSubStr(s, 12, 5) == "10,00")
    doAssert(runeSubStr(s, 12, -1) == "10,00")
    doAssert(runeSubStr(s, -6, 5) == "10,00")
    doAssert(runeSubStr(s, -6, -1) == "10,00")

    doAssert(runeSubStr(s, 0, 100) == "Hänsel  ««: 10,00€")
    doAssert(runeSubStr(s, -100, 100) == "Hänsel  ««: 10,00€")
    doAssert(runeSubStr(s, 0, -100) == "")
    doAssert(runeSubStr(s, 100, -100) == "")