unicode.nim 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474
#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module provides support to handle the Unicode UTF-8 encoding.
##
## There are no specialized ``insert``, ``delete``, ``add`` and ``contains``
## procedures for ``seq[Rune]`` in this module because the generic variants
## of these procedures in the system module already work with it.
##
## The current version is compatible with Unicode v12.0.0.
##
## **See also:**
## * `strutils module <strutils.html>`_
## * `unidecode module <unidecode.html>`_
## * `encodings module <encodings.html>`_
  21. {.deadCodeElim: on.} # dce option deprecated
  22. include "system/inclrtl"
  23. type
  24. RuneImpl = int32 # underlying type of Rune
  25. Rune* = distinct RuneImpl ## \
  26. ## Type that can hold a single Unicode code point.
  27. ##
  28. ## A Rune may be composed with other Runes to a character on the screen.
  29. template ones(n: untyped): untyped = ((1 shl n)-1)
  30. proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} =
  31. ## Returns the number of runes of the string ``s``.
  32. runnableExamples:
  33. let a = "añyóng"
  34. doAssert a.runeLen == 6
  35. ## note: a.len == 8
  36. var i = 0
  37. while i < len(s):
  38. if uint(s[i]) <= 127: inc(i)
  39. elif uint(s[i]) shr 5 == 0b110: inc(i, 2)
  40. elif uint(s[i]) shr 4 == 0b1110: inc(i, 3)
  41. elif uint(s[i]) shr 3 == 0b11110: inc(i, 4)
  42. elif uint(s[i]) shr 2 == 0b111110: inc(i, 5)
  43. elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6)
  44. else: inc i
  45. inc(result)
  46. proc runeLenAt*(s: string, i: Natural): int =
  47. ## Returns the number of bytes the rune starting at ``s[i]`` takes.
  48. ##
  49. ## See also:
  50. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  51. runnableExamples:
  52. let a = "añyóng"
  53. doAssert a.runeLenAt(0) == 1
  54. doAssert a.runeLenAt(1) == 2
  55. if uint(s[i]) <= 127: result = 1
  56. elif uint(s[i]) shr 5 == 0b110: result = 2
  57. elif uint(s[i]) shr 4 == 0b1110: result = 3
  58. elif uint(s[i]) shr 3 == 0b11110: result = 4
  59. elif uint(s[i]) shr 2 == 0b111110: result = 5
  60. elif uint(s[i]) shr 1 == 0b1111110: result = 6
  61. else: result = 1
  62. const replRune = Rune(0xFFFD)
  63. template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) =
  64. ## Returns the rune ``s[i]`` in ``result``.
  65. ##
  66. ## If ``doInc == true`` (default), ``i`` is incremented by the number
  67. ## of bytes that have been processed.
  68. bind ones
  69. if uint(s[i]) <= 127:
  70. result = Rune(uint(s[i]))
  71. when doInc: inc(i)
  72. elif uint(s[i]) shr 5 == 0b110:
  73. # assert(uint(s[i+1]) shr 6 == 0b10)
  74. if i <= s.len - 2:
  75. result = Rune((uint(s[i]) and (ones(5))) shl 6 or
  76. (uint(s[i+1]) and ones(6)))
  77. when doInc: inc(i, 2)
  78. else:
  79. result = replRune
  80. when doInc: inc(i)
  81. elif uint(s[i]) shr 4 == 0b1110:
  82. # assert(uint(s[i+1]) shr 6 == 0b10)
  83. # assert(uint(s[i+2]) shr 6 == 0b10)
  84. if i <= s.len - 3:
  85. result = Rune((uint(s[i]) and ones(4)) shl 12 or
  86. (uint(s[i+1]) and ones(6)) shl 6 or
  87. (uint(s[i+2]) and ones(6)))
  88. when doInc: inc(i, 3)
  89. else:
  90. result = replRune
  91. when doInc: inc(i)
  92. elif uint(s[i]) shr 3 == 0b11110:
  93. # assert(uint(s[i+1]) shr 6 == 0b10)
  94. # assert(uint(s[i+2]) shr 6 == 0b10)
  95. # assert(uint(s[i+3]) shr 6 == 0b10)
  96. if i <= s.len - 4:
  97. result = Rune((uint(s[i]) and ones(3)) shl 18 or
  98. (uint(s[i+1]) and ones(6)) shl 12 or
  99. (uint(s[i+2]) and ones(6)) shl 6 or
  100. (uint(s[i+3]) and ones(6)))
  101. when doInc: inc(i, 4)
  102. else:
  103. result = replRune
  104. when doInc: inc(i)
  105. elif uint(s[i]) shr 2 == 0b111110:
  106. # assert(uint(s[i+1]) shr 6 == 0b10)
  107. # assert(uint(s[i+2]) shr 6 == 0b10)
  108. # assert(uint(s[i+3]) shr 6 == 0b10)
  109. # assert(uint(s[i+4]) shr 6 == 0b10)
  110. if i <= s.len - 5:
  111. result = Rune((uint(s[i]) and ones(2)) shl 24 or
  112. (uint(s[i+1]) and ones(6)) shl 18 or
  113. (uint(s[i+2]) and ones(6)) shl 12 or
  114. (uint(s[i+3]) and ones(6)) shl 6 or
  115. (uint(s[i+4]) and ones(6)))
  116. when doInc: inc(i, 5)
  117. else:
  118. result = replRune
  119. when doInc: inc(i)
  120. elif uint(s[i]) shr 1 == 0b1111110:
  121. # assert(uint(s[i+1]) shr 6 == 0b10)
  122. # assert(uint(s[i+2]) shr 6 == 0b10)
  123. # assert(uint(s[i+3]) shr 6 == 0b10)
  124. # assert(uint(s[i+4]) shr 6 == 0b10)
  125. # assert(uint(s[i+5]) shr 6 == 0b10)
  126. if i <= s.len - 6:
  127. result = Rune((uint(s[i]) and ones(1)) shl 30 or
  128. (uint(s[i+1]) and ones(6)) shl 24 or
  129. (uint(s[i+2]) and ones(6)) shl 18 or
  130. (uint(s[i+3]) and ones(6)) shl 12 or
  131. (uint(s[i+4]) and ones(6)) shl 6 or
  132. (uint(s[i+5]) and ones(6)))
  133. when doInc: inc(i, 6)
  134. else:
  135. result = replRune
  136. when doInc: inc(i)
  137. else:
  138. result = Rune(uint(s[i]))
  139. when doInc: inc(i)
  140. proc runeAt*(s: string, i: Natural): Rune =
  141. ## Returns the rune in ``s`` at **byte index** ``i``.
  142. ##
  143. ## See also:
  144. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  145. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  146. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  147. runnableExamples:
  148. let a = "añyóng"
  149. doAssert a.runeAt(1) == "ñ".runeAt(0)
  150. doAssert a.runeAt(2) == "ñ".runeAt(1)
  151. doAssert a.runeAt(3) == "y".runeAt(0)
  152. fastRuneAt(s, i, result, false)
  153. proc validateUtf8*(s: string): int =
  154. ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  155. ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  156. ##
  157. ## See also:
  158. ## * `toUTF8 proc <#toUTF8,Rune>`_
  159. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  160. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  161. var i = 0
  162. let L = s.len
  163. while i < L:
  164. if uint(s[i]) <= 127:
  165. inc(i)
  166. elif uint(s[i]) shr 5 == 0b110:
  167. if uint(s[i]) < 0xc2: return i # Catch overlong ascii representations.
  168. if i+1 < L and uint(s[i+1]) shr 6 == 0b10: inc(i, 2)
  169. else: return i
  170. elif uint(s[i]) shr 4 == 0b1110:
  171. if i+2 < L and uint(s[i+1]) shr 6 == 0b10 and uint(s[i+2]) shr 6 == 0b10:
  172. inc i, 3
  173. else: return i
  174. elif uint(s[i]) shr 3 == 0b11110:
  175. if i+3 < L and uint(s[i+1]) shr 6 == 0b10 and
  176. uint(s[i+2]) shr 6 == 0b10 and
  177. uint(s[i+3]) shr 6 == 0b10:
  178. inc i, 4
  179. else: return i
  180. else:
  181. return i
  182. return -1
  183. template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  184. ## Copies UTF-8 representation of ``c`` into the preallocated string ``s``
  185. ## starting at position ``pos``.
  186. ##
  187. ## If ``doInc == true`` (default), ``pos`` is incremented
  188. ## by the number of bytes that have been processed.
  189. ##
  190. ## To be the most efficient, make sure ``s`` is preallocated
  191. ## with an additional amount equal to the byte length of ``c``.
  192. ##
  193. ## See also:
  194. ## * `validateUtf8 proc <#validateUtf8,string>`_
  195. ## * `toUTF8 proc <#toUTF8,Rune>`_
  196. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  197. var i = RuneImpl(c)
  198. if i <=% 127:
  199. s.setLen(pos+1)
  200. s[pos+0] = chr(i)
  201. when doInc: inc(pos)
  202. elif i <=% 0x07FF:
  203. s.setLen(pos+2)
  204. s[pos+0] = chr((i shr 6) or 0b110_00000)
  205. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  206. when doInc: inc(pos, 2)
  207. elif i <=% 0xFFFF:
  208. s.setLen(pos+3)
  209. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  210. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  211. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  212. when doInc: inc(pos, 3)
  213. elif i <=% 0x001FFFFF:
  214. s.setLen(pos+4)
  215. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  216. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  217. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  218. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  219. when doInc: inc(pos, 4)
  220. elif i <=% 0x03FFFFFF:
  221. s.setLen(pos+5)
  222. s[pos+0] = chr(i shr 24 or 0b111110_00)
  223. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  224. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  225. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  226. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  227. when doInc: inc(pos, 5)
  228. elif i <=% 0x7FFFFFFF:
  229. s.setLen(pos+6)
  230. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  231. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  232. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  233. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  234. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  235. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
  236. when doInc: inc(pos, 6)
  237. else:
  238. discard # error, exception?
  239. proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  240. ## Converts a rune into its UTF-8 representation.
  241. ##
  242. ## See also:
  243. ## * `validateUtf8 proc <#validateUtf8,string>`_
  244. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  245. ## * `utf8 iterator <#utf8.i,string>`_
  246. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  247. runnableExamples:
  248. let a = "añyóng"
  249. doAssert a.runeAt(1).toUTF8 == "ñ"
  250. result = ""
  251. fastToUTF8Copy(c, result, 0, false)
  252. proc add*(s: var string; c: Rune) =
  253. ## Adds a rune ``c`` to a string ``s``.
  254. runnableExamples:
  255. var s = "abc"
  256. let c = "ä".runeAt(0)
  257. s.add(c)
  258. doAssert s == "abcä"
  259. let pos = s.len
  260. fastToUTF8Copy(c, s, pos, false)
  261. proc `$`*(rune: Rune): string =
  262. ## An alias for `toUTF8 <#toUTF8,Rune>`_.
  263. ##
  264. ## See also:
  265. ## * `validateUtf8 proc <#validateUtf8,string>`_
  266. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  267. rune.toUTF8
  268. proc `$`*(runes: seq[Rune]): string =
  269. ## Converts a sequence of Runes to a string.
  270. ##
  271. ## See also:
  272. ## * `toRunes <#toRunes,string>`_ for a reverse operation
  273. runnableExamples:
  274. let
  275. someString = "öÑ"
  276. someRunes = toRunes(someString)
  277. doAssert $someRunes == someString
  278. result = ""
  279. for rune in runes:
  280. result.add rune
  281. proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int =
  282. ## Returns the byte position of rune
  283. ## at position ``pos`` in ``s`` with an optional start byte position.
  284. ## Returns the special value -1 if it runs out of the string.
  285. ##
  286. ## **Beware:** This can lead to unoptimized code and slow execution!
  287. ## Most problems can be solved more efficiently by using an iterator
  288. ## or conversion to a seq of Rune.
  289. ##
  290. ## See also:
  291. ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  292. runnableExamples:
  293. let a = "añyóng"
  294. doAssert a.runeOffset(1) == 1
  295. doAssert a.runeOffset(3) == 4
  296. doAssert a.runeOffset(4) == 6
  297. var
  298. i = 0
  299. o = start
  300. while i < pos:
  301. o += runeLenAt(s, o)
  302. if o >= s.len:
  303. return -1
  304. inc i
  305. return o
  306. proc runeReverseOffset*(s: string, rev: Positive): (int, int) =
  307. ## Returns a tuple with the byte offset of the
  308. ## rune at position ``rev`` in ``s``, counting
  309. ## from the end (starting with 1) and the total
  310. ## number of runes in the string.
  311. ##
  312. ## Returns a negative value for offset if there are to few runes in
  313. ## the string to satisfy the request.
  314. ##
  315. ## **Beware:** This can lead to unoptimized code and slow execution!
  316. ## Most problems can be solved more efficiently by using an iterator
  317. ## or conversion to a seq of Rune.
  318. ##
  319. ## See also:
  320. ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  321. var
  322. a = rev.int
  323. o = 0
  324. x = 0
  325. while o < s.len:
  326. let r = runeLenAt(s, o)
  327. o += r
  328. if a < 0:
  329. x += r
  330. dec a
  331. if a > 0:
  332. return (-a, rev.int-a)
  333. return (x, -a+rev.int)
  334. proc runeAtPos*(s: string, pos: int): Rune =
  335. ## Returns the rune at position ``pos``.
  336. ##
  337. ## **Beware:** This can lead to unoptimized code and slow execution!
  338. ## Most problems can be solved more efficiently by using an iterator
  339. ## or conversion to a seq of Rune.
  340. ##
  341. ## See also:
  342. ## * `runeAt proc <#runeAt,string,Natural>`_
  343. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  344. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  345. fastRuneAt(s, runeOffset(s, pos), result, false)
  346. proc runeStrAtPos*(s: string, pos: Natural): string =
  347. ## Returns the rune at position ``pos`` as UTF8 String.
  348. ##
  349. ## **Beware:** This can lead to unoptimized code and slow execution!
  350. ## Most problems can be solved more efficiently by using an iterator
  351. ## or conversion to a seq of Rune.
  352. ##
  353. ## See also:
  354. ## * `runeAt proc <#runeAt,string,Natural>`_
  355. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  356. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  357. let o = runeOffset(s, pos)
  358. s[o .. (o+runeLenAt(s, o)-1)]
  359. proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
  360. ## Returns the UTF-8 substring starting at code point ``pos``
  361. ## with ``len`` code points.
  362. ##
  363. ## If ``pos`` or ``len`` is negative they count from
  364. ## the end of the string. If ``len`` is not given it means the longest
  365. ## possible string.
  366. runnableExamples:
  367. let s = "Hänsel ««: 10,00€"
  368. doAssert(runeSubStr(s, 0, 2) == "Hä")
  369. doAssert(runeSubStr(s, 10, 1) == ":")
  370. doAssert(runeSubStr(s, -6) == "10,00€")
  371. doAssert(runeSubStr(s, 10) == ": 10,00€")
  372. doAssert(runeSubStr(s, 12, 5) == "10,00")
  373. doAssert(runeSubStr(s, -6, 3) == "10,")
  374. if pos < 0:
  375. let (o, rl) = runeReverseOffset(s, -pos)
  376. if len >= rl:
  377. result = s.substr(o, s.len-1)
  378. elif len < 0:
  379. let e = rl + len
  380. if e < 0:
  381. result = ""
  382. else:
  383. result = s.substr(o, runeOffset(s, e-(rl+pos), o)-1)
  384. else:
  385. result = s.substr(o, runeOffset(s, len, o)-1)
  386. else:
  387. let o = runeOffset(s, pos)
  388. if o < 0:
  389. result = ""
  390. elif len == int.high:
  391. result = s.substr(o, s.len-1)
  392. elif len < 0:
  393. let (e, rl) = runeReverseOffset(s, -len)
  394. discard rl
  395. if e <= 0:
  396. result = ""
  397. else:
  398. result = s.substr(o, e-1)
  399. else:
  400. var e = runeOffset(s, len, o)
  401. if e < 0:
  402. e = s.len
  403. result = s.substr(o, e-1)
  404. proc `<=%`*(a, b: Rune): bool =
  405. ## Checks if code point of `a` is smaller or equal to code point of `b`.
  406. runnableExamples:
  407. let
  408. a = "ú".runeAt(0)
  409. b = "ü".runeAt(0)
  410. doAssert a <=% b
  411. return int(a) <=% int(b)
  412. proc `<%`*(a, b: Rune): bool =
  413. ## Checks if code point of `a` is smaller than code point of `b`.
  414. runnableExamples:
  415. let
  416. a = "ú".runeAt(0)
  417. b = "ü".runeAt(0)
  418. doAssert a <% b
  419. return int(a) <% int(b)
  420. proc `==`*(a, b: Rune): bool =
  421. ## Checks if two runes are equal.
  422. return int(a) == int(b)
  423. include "includes/unicode_ranges"
  424. proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
  425. var n = len
  426. var t = 0
  427. while n > 1:
  428. var m = n div 2
  429. var p = t + m*stride
  430. if c >= tab[p]:
  431. t = p
  432. n = n-m
  433. else:
  434. n = m
  435. if n != 0 and c >= tab[t]:
  436. return t
  437. return -1
  438. proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
  439. ## Converts ``c`` into lower case. This works for any rune.
  440. ##
  441. ## If possible, prefer ``toLower`` over ``toUpper``.
  442. ##
  443. ## See also:
  444. ## * `toUpper proc <#toUpper,Rune>`_
  445. ## * `toTitle proc <#toTitle,Rune>`_
  446. ## * `isLower proc <#isLower,Rune>`_
  447. var c = RuneImpl(c)
  448. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  449. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  450. return Rune(c + toLowerRanges[p+2] - 500)
  451. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  452. if p >= 0 and c == toLowerSinglets[p]:
  453. return Rune(c + toLowerSinglets[p+1] - 500)
  454. return Rune(c)
  455. proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
  456. ## Converts ``c`` into upper case. This works for any rune.
  457. ##
  458. ## If possible, prefer ``toLower`` over ``toUpper``.
  459. ##
  460. ## See also:
  461. ## * `toLower proc <#toLower,Rune>`_
  462. ## * `toTitle proc <#toTitle,Rune>`_
  463. ## * `isUpper proc <#isUpper,Rune>`_
  464. var c = RuneImpl(c)
  465. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  466. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  467. return Rune(c + toUpperRanges[p+2] - 500)
  468. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  469. if p >= 0 and c == toUpperSinglets[p]:
  470. return Rune(c + toUpperSinglets[p+1] - 500)
  471. return Rune(c)
  472. proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1", procvar.} =
  473. ## Converts ``c`` to title case.
  474. ##
  475. ## See also:
  476. ## * `toLower proc <#toLower,Rune>`_
  477. ## * `toUpper proc <#toUpper,Rune>`_
  478. ## * `isTitle proc <#isTitle,Rune>`_
  479. var c = RuneImpl(c)
  480. var p = binarySearch(c, toTitleSinglets, len(toTitleSinglets) div 2, 2)
  481. if p >= 0 and c == toTitleSinglets[p]:
  482. return Rune(c + toTitleSinglets[p+1] - 500)
  483. return Rune(c)
  484. proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
  485. ## Returns true if ``c`` is a lower case rune.
  486. ##
  487. ## If possible, prefer ``isLower`` over ``isUpper``.
  488. ##
  489. ## See also:
  490. ## * `toLower proc <#toLower,Rune>`_
  491. ## * `isUpper proc <#isUpper,Rune>`_
  492. ## * `isTitle proc <#isTitle,Rune>`_
  493. var c = RuneImpl(c)
  494. # Note: toUpperRanges is correct here!
  495. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  496. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  497. return true
  498. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  499. if p >= 0 and c == toUpperSinglets[p]:
  500. return true
  501. proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
  502. ## Returns true if ``c`` is a upper case rune.
  503. ##
  504. ## If possible, prefer ``isLower`` over ``isUpper``.
  505. ##
  506. ## See also:
  507. ## * `toUpper proc <#toUpper,Rune>`_
  508. ## * `isLower proc <#isLower,Rune>`_
  509. ## * `isTitle proc <#isTitle,Rune>`_
  510. ## * `isAlpha proc <#isAlpha,Rune>`_
  511. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  512. var c = RuneImpl(c)
  513. # Note: toLowerRanges is correct here!
  514. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  515. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  516. return true
  517. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  518. if p >= 0 and c == toLowerSinglets[p]:
  519. return true
  520. proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
  521. ## Returns true if ``c`` is an *alpha* rune (i.e., a letter).
  522. ##
  523. ## See also:
  524. ## * `isLower proc <#isLower,Rune>`_
  525. ## * `isTitle proc <#isTitle,Rune>`_
  526. ## * `isAlpha proc <#isAlpha,Rune>`_
  527. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  528. ## * `isCombining proc <#isCombining,Rune>`_
  529. if isUpper(c) or isLower(c):
  530. return true
  531. var c = RuneImpl(c)
  532. var p = binarySearch(c, alphaRanges, len(alphaRanges) div 2, 2)
  533. if p >= 0 and c >= alphaRanges[p] and c <= alphaRanges[p+1]:
  534. return true
  535. p = binarySearch(c, alphaSinglets, len(alphaSinglets), 1)
  536. if p >= 0 and c == alphaSinglets[p]:
  537. return true
  538. proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
  539. ## Returns true if ``c`` is a Unicode titlecase code point.
  540. ##
  541. ## See also:
  542. ## * `toTitle proc <#toTitle,Rune>`_
  543. ## * `isLower proc <#isLower,Rune>`_
  544. ## * `isUpper proc <#isUpper,Rune>`_
  545. ## * `isAlpha proc <#isAlpha,Rune>`_
  546. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  547. return isUpper(c) and isLower(c)
  548. proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
  549. ## Returns true if ``c`` is a Unicode whitespace code point.
  550. ##
  551. ## See also:
  552. ## * `isLower proc <#isLower,Rune>`_
  553. ## * `isUpper proc <#isUpper,Rune>`_
  554. ## * `isTitle proc <#isTitle,Rune>`_
  555. ## * `isAlpha proc <#isAlpha,Rune>`_
  556. var c = RuneImpl(c)
  557. var p = binarySearch(c, spaceRanges, len(spaceRanges) div 2, 2)
  558. if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]:
  559. return true
  560. proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1", procvar.} =
  561. ## Returns true if ``c`` is a Unicode combining code unit.
  562. ##
  563. ## See also:
  564. ## * `isLower proc <#isLower,Rune>`_
  565. ## * `isUpper proc <#isUpper,Rune>`_
  566. ## * `isTitle proc <#isTitle,Rune>`_
  567. ## * `isAlpha proc <#isAlpha,Rune>`_
  568. var c = RuneImpl(c)
  569. # Optimized to return false immediately for ASCII
  570. return c >= 0x0300 and (c <= 0x036f or
  571. (c >= 0x1ab0 and c <= 0x1aff) or
  572. (c >= 0x1dc0 and c <= 0x1dff) or
  573. (c >= 0x20d0 and c <= 0x20ff) or
  574. (c >= 0xfe20 and c <= 0xfe2f))
  575. template runeCheck(s, runeProc) =
  576. ## Common code for isAlpha and isSpace.
  577. result = if len(s) == 0: false else: true
  578. var
  579. i = 0
  580. rune: Rune
  581. while i < len(s) and result:
  582. fastRuneAt(s, i, rune, doInc = true)
  583. result = runeProc(rune) and result
  584. proc isAlpha*(s: string): bool {.noSideEffect, procvar,
  585. rtl, extern: "nuc$1Str".} =
  586. ## Returns true if ``s`` contains all alphabetic runes.
  587. runnableExamples:
  588. let a = "añyóng"
  589. doAssert a.isAlpha
  590. runeCheck(s, isAlpha)
  591. proc isSpace*(s: string): bool {.noSideEffect, procvar,
  592. rtl, extern: "nuc$1Str".} =
  593. ## Returns true if ``s`` contains all whitespace runes.
  594. runnableExamples:
  595. let a = "\t\l \v\r\f"
  596. doAssert a.isSpace
  597. runeCheck(s, isWhiteSpace)
  598. template convertRune(s, runeProc) =
  599. ## Convert runes in ``s`` using ``runeProc`` as the converter.
  600. result = newString(len(s))
  601. var
  602. i = 0
  603. resultIndex = 0
  604. rune: Rune
  605. while i < len(s):
  606. fastRuneAt(s, i, rune, doInc = true)
  607. rune = runeProc(rune)
  608. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  609. proc toUpper*(s: string): string {.noSideEffect, procvar,
  610. rtl, extern: "nuc$1Str".} =
  611. ## Converts ``s`` into upper-case runes.
  612. runnableExamples:
  613. doAssert toUpper("abγ") == "ABΓ"
  614. convertRune(s, toUpper)
  615. proc toLower*(s: string): string {.noSideEffect, procvar,
  616. rtl, extern: "nuc$1Str".} =
  617. ## Converts ``s`` into lower-case runes.
  618. runnableExamples:
  619. doAssert toLower("ABΓ") == "abγ"
  620. convertRune(s, toLower)
  621. proc swapCase*(s: string): string {.noSideEffect, procvar,
  622. rtl, extern: "nuc$1".} =
  623. ## Swaps the case of runes in ``s``.
  624. ##
  625. ## Returns a new string such that the cases of all runes
  626. ## are swapped if possible.
  627. runnableExamples:
  628. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  629. var
  630. i = 0
  631. resultIndex = 0
  632. rune: Rune
  633. result = newString(len(s))
  634. while i < len(s):
  635. fastRuneAt(s, i, rune)
  636. if rune.isUpper():
  637. rune = rune.toLower()
  638. elif rune.isLower():
  639. rune = rune.toUpper()
  640. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  641. proc capitalize*(s: string): string {.noSideEffect, procvar,
  642. rtl, extern: "nuc$1".} =
  643. ## Converts the first character of ``s`` into an upper-case rune.
  644. runnableExamples:
  645. doAssert capitalize("βeta") == "Βeta"
  646. if len(s) == 0:
  647. return s
  648. var
  649. rune: Rune
  650. i = 0
  651. fastRuneAt(s, i, rune, doInc = true)
  652. result = $toUpper(rune) & substr(s, i)
  653. proc translate*(s: string, replacements: proc(key: string): string): string {.
  654. rtl, extern: "nuc$1".} =
  655. ## Translates words in a string using the ``replacements`` proc to substitute
  656. ## words inside ``s`` with their replacements.
  657. ##
  658. ## ``replacements`` is any proc that takes a word and returns
  659. ## a new word to fill it's place.
  660. runnableExamples:
  661. proc wordToNumber(s: string): string =
  662. case s
  663. of "one": "1"
  664. of "two": "2"
  665. else: s
  666. let a = "one two three four"
  667. doAssert a.translate(wordToNumber) == "1 2 three four"
  668. # Allocate memory for the new string based on the old one.
  669. # If the new string length is less than the old, no allocations
  670. # will be needed. If the new string length is greater than the
  671. # old, then maybe only one allocation is needed
  672. result = newStringOfCap(s.len)
  673. var
  674. index = 0
  675. lastIndex = 0
  676. wordStart = 0
  677. inWord = false
  678. rune: Rune
  679. while index < len(s):
  680. lastIndex = index
  681. fastRuneAt(s, index, rune)
  682. let whiteSpace = rune.isWhiteSpace()
  683. if whiteSpace and inWord:
  684. # If we've reached the end of a word
  685. let word = s[wordStart ..< lastIndex]
  686. result.add(replacements(word))
  687. result.add($rune)
  688. inWord = false
  689. elif not whiteSpace and not inWord:
  690. # If we've hit a non space character and
  691. # are not currently in a word, track
  692. # the starting index of the word
  693. inWord = true
  694. wordStart = lastIndex
  695. elif whiteSpace:
  696. result.add($rune)
  697. if wordStart < len(s) and inWord:
  698. # Get the trailing word at the end
  699. let word = s[wordStart .. ^1]
  700. result.add(replacements(word))
  701. proc title*(s: string): string {.noSideEffect, procvar,
  702. rtl, extern: "nuc$1".} =
  703. ## Converts ``s`` to a unicode title.
  704. ##
  705. ## Returns a new string such that the first character
  706. ## in each word inside ``s`` is capitalized.
  707. runnableExamples:
  708. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  709. var
  710. i = 0
  711. resultIndex = 0
  712. rune: Rune
  713. result = newString(len(s))
  714. var firstRune = true
  715. while i < len(s):
  716. fastRuneAt(s, i, rune)
  717. if not rune.isWhiteSpace() and firstRune:
  718. rune = rune.toUpper()
  719. firstRune = false
  720. elif rune.isWhiteSpace():
  721. firstRune = true
  722. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  723. iterator runes*(s: string): Rune =
  724. ## Iterates over any rune of the string ``s`` returning runes.
  725. var
  726. i = 0
  727. result: Rune
  728. while i < len(s):
  729. fastRuneAt(s, i, result, true)
  730. yield result
  731. iterator utf8*(s: string): string =
  732. ## Iterates over any rune of the string ``s`` returning utf8 values.
  733. ##
  734. ## See also:
  735. ## * `validateUtf8 proc <#validateUtf8,string>`_
  736. ## * `toUTF8 proc <#toUTF8,Rune>`_
  737. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  738. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  739. var o = 0
  740. while o < s.len:
  741. let n = runeLenAt(s, o)
  742. yield s[o .. (o+n-1)]
  743. o += n
  744. proc toRunes*(s: string): seq[Rune] =
  745. ## Obtains a sequence containing the Runes in ``s``.
  746. ##
  747. ## See also:
  748. ## * `$ proc <#$,seq[T][Rune]>`_ for a reverse operation
  749. runnableExamples:
  750. let a = toRunes("aáä")
  751. doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]
  752. result = newSeq[Rune]()
  753. for r in s.runes:
  754. result.add(r)
  755. proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1", procvar.} =
  756. ## Compares two UTF-8 strings and ignores the case. Returns:
  757. ##
  758. ## | 0 if a == b
  759. ## | < 0 if a < b
  760. ## | > 0 if a > b
  761. var i = 0
  762. var j = 0
  763. var ar, br: Rune
  764. while i < a.len and j < b.len:
  765. # slow path:
  766. fastRuneAt(a, i, ar)
  767. fastRuneAt(b, j, br)
  768. result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br))
  769. if result != 0: return
  770. result = a.len - b.len
  771. proc reversed*(s: string): string =
  772. ## Returns the reverse of ``s``, interpreting it as runes.
  773. ##
  774. ## Unicode combining characters are correctly interpreted as well.
  775. runnableExamples:
  776. assert reversed("Reverse this!") == "!siht esreveR"
  777. assert reversed("先秦兩漢") == "漢兩秦先"
  778. assert reversed("as⃝df̅") == "f̅ds⃝a"
  779. assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  780. var
  781. i = 0
  782. lastI = 0
  783. newPos = len(s) - 1
  784. blockPos = 0
  785. r: Rune
  786. template reverseUntil(pos) =
  787. var j = pos - 1
  788. while j > blockPos:
  789. result[newPos] = s[j]
  790. dec j
  791. dec newPos
  792. blockPos = pos - 1
  793. result = newString(len(s))
  794. while i < len(s):
  795. lastI = i
  796. fastRuneAt(s, i, r, true)
  797. if not isCombining(r):
  798. reverseUntil(lastI)
  799. reverseUntil(len(s))
  800. proc graphemeLen*(s: string; i: Natural): Natural =
  801. ## The number of bytes belonging to byte index ``s[i]``,
  802. ## including following combining code unit.
  803. runnableExamples:
  804. let a = "añyóng"
  805. doAssert a.graphemeLen(1) == 2 ## ñ
  806. doAssert a.graphemeLen(2) == 1
  807. doAssert a.graphemeLen(4) == 2 ## ó
  808. var j = i.int
  809. var r, r2: Rune
  810. if j < s.len:
  811. fastRuneAt(s, j, r, true)
  812. result = j-i
  813. while j < s.len:
  814. fastRuneAt(s, j, r2, true)
  815. if not isCombining(r2): break
  816. result = j-i
  817. proc lastRune*(s: string; last: int): (Rune, int) =
  818. ## Length of the last rune in ``s[0..last]``. Returns the rune and its length
  819. ## in bytes.
  820. if s[last] <= chr(127):
  821. result = (Rune(s[last]), 1)
  822. else:
  823. var L = 0
  824. while last-L >= 0 and uint(s[last-L]) shr 6 == 0b10: inc(L)
  825. var r: Rune
  826. fastRuneAt(s, last-L, r, false)
  827. result = (r, L+1)
  828. proc size*(r: Rune): int {.noSideEffect.} =
  829. ## Returns the number of bytes the rune ``r`` takes.
  830. runnableExamples:
  831. let a = toRunes "aá"
  832. doAssert size(a[0]) == 1
  833. doAssert size(a[1]) == 2
  834. let v = r.uint32
  835. if v <= 0x007F'u32: result = 1
  836. elif v <= 0x07FF'u32: result = 2
  837. elif v <= 0xFFFF'u32: result = 3
  838. elif v <= 0x1FFFFF'u32: result = 4
  839. elif v <= 0x3FFFFFF'u32: result = 5
  840. elif v <= 0x7FFFFFFF'u32: result = 6
  841. else: result = 1
  842. # --------- Private templates for different split separators -----------
  843. proc stringHasSep(s: string, index: int, seps: openArray[Rune]): bool =
  844. var rune: Rune
  845. fastRuneAt(s, index, rune, false)
  846. return seps.contains(rune)
  847. proc stringHasSep(s: string, index: int, sep: Rune): bool =
  848. var rune: Rune
  849. fastRuneAt(s, index, rune, false)
  850. return sep == rune
  851. template splitCommon(s, sep, maxsplit: untyped, sepLen: int = -1) =
  852. ## Common code for split procedures.
  853. var
  854. last = 0
  855. splits = maxsplit
  856. if len(s) > 0:
  857. while last <= len(s):
  858. var first = last
  859. while last < len(s) and not stringHasSep(s, last, sep):
  860. when sep is Rune:
  861. inc(last, sepLen)
  862. else:
  863. inc(last, runeLenAt(s, last))
  864. if splits == 0: last = len(s)
  865. yield s[first .. (last - 1)]
  866. if splits == 0: break
  867. dec(splits)
  868. when sep is Rune:
  869. inc(last, sepLen)
  870. else:
  871. inc(last, if last < len(s): runeLenAt(s, last) else: 1)
  872. iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces,
  873. maxsplit: int = -1): string =
  874. ## Splits the unicode string ``s`` into substrings using a group of separators.
  875. ##
  876. ## Substrings are separated by a substring containing only ``seps``.
  877. ##
  878. ## .. code-block:: nim
  879. ## for word in split("this\lis an\texample"):
  880. ## writeLine(stdout, word)
  881. ##
  882. ## ...generates this output:
  883. ##
  884. ## .. code-block::
  885. ## "this"
  886. ## "is"
  887. ## "an"
  888. ## "example"
  889. ##
  890. ## And the following code:
  891. ##
  892. ## .. code-block:: nim
  893. ## for word in split("this:is;an$example", {';', ':', '$'}):
  894. ## writeLine(stdout, word)
  895. ##
  896. ## ...produces the same output as the first example. The code:
  897. ##
  898. ## .. code-block:: nim
  899. ## let date = "2012-11-20T22:08:08.398990"
  900. ## let separators = {' ', '-', ':', 'T'}
  901. ## for number in split(date, separators):
  902. ## writeLine(stdout, number)
  903. ##
  904. ## ...results in:
  905. ##
  906. ## .. code-block::
  907. ## "2012"
  908. ## "11"
  909. ## "20"
  910. ## "22"
  911. ## "08"
  912. ## "08.398990"
  913. ##
  914. splitCommon(s, seps, maxsplit)
  915. iterator splitWhitespace*(s: string): string =
  916. ## Splits a unicode string at whitespace runes.
  917. splitCommon(s, unicodeSpaces, -1)
  918. template accResult(iter: untyped) =
  919. result = @[]
  920. for x in iter: add(result, x)
  921. proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
  922. rtl, extern: "ncuSplitWhitespace".} =
  923. ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_
  924. ## iterator, but is a proc that returns a sequence of substrings.
  925. accResult(splitWhitespace(s))
  926. iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
  927. ## Splits the unicode string ``s`` into substrings using a single separator.
  928. ##
  929. ## Substrings are separated by the rune ``sep``.
  930. ## The code:
  931. ##
  932. ## .. code-block:: nim
  933. ## for word in split(";;this;is;an;;example;;;", ';'):
  934. ## writeLine(stdout, word)
  935. ##
  936. ## Results in:
  937. ##
  938. ## .. code-block::
  939. ## ""
  940. ## ""
  941. ## "this"
  942. ## "is"
  943. ## "an"
  944. ## ""
  945. ## "example"
  946. ## ""
  947. ## ""
  948. ## ""
  949. ##
  950. splitCommon(s, sep, maxsplit, sep.size)
  951. proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
  952. seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} =
  953. ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_,
  954. ## but is a proc that returns a sequence of substrings.
  955. accResult(split(s, seps, maxsplit))
  956. proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
  957. rtl, extern: "nucSplitRune".} =
  958. ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc
  959. ## that returns a sequence of substrings.
  960. accResult(split(s, sep, maxsplit))
  961. proc strip*(s: string, leading = true, trailing = true,
  962. runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
  963. rtl, extern: "nucStrip".} =
  964. ## Strips leading or trailing ``runes`` from ``s`` and returns
  965. ## the resulting string.
  966. ##
  967. ## If ``leading`` is true (default), leading ``runes`` are stripped.
  968. ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
  969. ## If both are false, the string is returned unchanged.
  970. runnableExamples:
  971. let a = "\táñyóng "
  972. doAssert a.strip == "áñyóng"
  973. doAssert a.strip(leading = false) == "\táñyóng"
  974. doAssert a.strip(trailing = false) == "áñyóng "
  975. var
  976. sI = 0 ## starting index into string ``s``
  977. eI = len(s) - 1 ## ending index into ``s``, where the last ``Rune`` starts
  978. if leading:
  979. var
  980. i = 0
  981. xI: int ## value of ``sI`` at the beginning of the iteration
  982. rune: Rune
  983. while i < len(s):
  984. xI = i
  985. fastRuneAt(s, i, rune)
  986. sI = i # Assume to start from next rune
  987. if not runes.contains(rune):
  988. sI = xI # Go back to where the current rune starts
  989. break
  990. if trailing:
  991. var
  992. i = eI
  993. xI: int
  994. rune: Rune
  995. while i >= 0:
  996. xI = i
  997. fastRuneAt(s, xI, rune)
  998. var yI = i - 1
  999. while yI >= 0:
  1000. var
  1001. yIend = yI
  1002. pRune: Rune
  1003. fastRuneAt(s, yIend, pRune)
  1004. if yIend < xI: break
  1005. i = yI
  1006. rune = pRune
  1007. dec(yI)
  1008. if not runes.contains(rune):
  1009. eI = xI - 1
  1010. break
  1011. dec(i)
  1012. let newLen = eI - sI + 1
  1013. result = newStringOfCap(newLen)
  1014. if newLen > 0:
  1015. result.add s[sI .. eI]
  1016. proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
  1017. rtl, extern: "nucRepeatRune".} =
  1018. ## Returns a string of ``count`` Runes ``c``.
  1019. ##
  1020. ## The returned string will have a rune-length of ``count``.
  1021. runnableExamples:
  1022. let a = "ñ".runeAt(0)
  1023. doAssert a.repeat(5) == "ñññññ"
  1024. let s = $c
  1025. result = newStringOfCap(count * s.len)
  1026. for i in 0 ..< count:
  1027. result.add s
  1028. proc align*(s: string, count: Natural, padding = ' '.Rune): string {.
  1029. noSideEffect, rtl, extern: "nucAlignString".} =
  1030. ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
  1031. ## of ``count``.
  1032. ##
  1033. ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
  1034. ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  1035. ## returned unchanged. If you need to left align a string use the `alignLeft
  1036. ## proc <#alignLeft,string,Natural>`_.
  1037. runnableExamples:
  1038. assert align("abc", 4) == " abc"
  1039. assert align("a", 0) == "a"
  1040. assert align("1232", 6) == " 1232"
  1041. assert align("1232", 6, '#'.Rune) == "##1232"
  1042. assert align("Åge", 5) == " Åge"
  1043. assert align("×", 4, '_'.Rune) == "___×"
  1044. let sLen = s.runeLen
  1045. if sLen < count:
  1046. let padStr = $padding
  1047. result = newStringOfCap(padStr.len * count)
  1048. let spaces = count - sLen
  1049. for i in 0 ..< spaces: result.add padStr
  1050. result.add s
  1051. else:
  1052. result = s
  1053. proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.
  1054. noSideEffect.} =
  1055. ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a
  1056. ## rune-length of ``count``.
  1057. ##
  1058. ## ``padding`` characters (by default spaces) are added after ``s`` resulting in
  1059. ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  1060. ## returned unchanged. If you need to right align a string use the `align
  1061. ## proc <#align,string,Natural>`_.
  1062. runnableExamples:
  1063. assert alignLeft("abc", 4) == "abc "
  1064. assert alignLeft("a", 0) == "a"
  1065. assert alignLeft("1232", 6) == "1232 "
  1066. assert alignLeft("1232", 6, '#'.Rune) == "1232##"
  1067. assert alignLeft("Åge", 5) == "Åge "
  1068. assert alignLeft("×", 4, '_'.Rune) == "×___"
  1069. let sLen = s.runeLen
  1070. if sLen < count:
  1071. let padStr = $padding
  1072. result = newStringOfCap(s.len + (count - sLen) * padStr.len)
  1073. result.add s
  1074. for i in sLen ..< count:
  1075. result.add padStr
  1076. else:
  1077. result = s
  1078. # -----------------------------------------------------------------------------
  1079. # deprecated
  1080. template runeCaseCheck(s, runeProc, skipNonAlpha) =
  1081. ## Common code for rune.isLower and rune.isUpper.
  1082. if len(s) == 0: return false
  1083. var
  1084. i = 0
  1085. rune: Rune
  1086. hasAtleastOneAlphaRune = false
  1087. while i < len(s):
  1088. fastRuneAt(s, i, rune, doInc = true)
  1089. if skipNonAlpha:
  1090. var runeIsAlpha = isAlpha(rune)
  1091. if not hasAtleastOneAlphaRune:
  1092. hasAtleastOneAlphaRune = runeIsAlpha
  1093. if runeIsAlpha and (not runeProc(rune)):
  1094. return false
  1095. else:
  1096. if not runeProc(rune):
  1097. return false
  1098. return if skipNonAlpha: hasAtleastOneAlphaRune else: true
  1099. proc isLower*(s: string, skipNonAlpha: bool): bool {.
  1100. deprecated: "Deprecated since version 0.20 since its semantics are unclear".} =
  1101. ## **Deprecated since version 0.20 since its semantics are unclear**
  1102. ##
  1103. ## Checks whether ``s`` is lower case.
  1104. ##
  1105. ## If ``skipNonAlpha`` is true, returns true if all alphabetical
  1106. ## runes in ``s`` are lower case. Returns false if none of the
  1107. ## runes in ``s`` are alphabetical.
  1108. ##
  1109. ## If ``skipNonAlpha`` is false, returns true only if all runes in
  1110. ## ``s`` are alphabetical and lower case.
  1111. ##
  1112. ## For either value of ``skipNonAlpha``, returns false if ``s`` is
  1113. ## an empty string.
  1114. runeCaseCheck(s, isLower, skipNonAlpha)
  1115. proc isUpper*(s: string, skipNonAlpha: bool): bool {.
  1116. deprecated: "Deprecated since version 0.20 since its semantics are unclear".} =
  1117. ## **Deprecated since version 0.20 since its semantics are unclear**
  1118. ##
  1119. ## Checks whether ``s`` is upper case.
  1120. ##
  1121. ## If ``skipNonAlpha`` is true, returns true if all alphabetical
  1122. ## runes in ``s`` are upper case. Returns false if none of the
  1123. ## runes in ``s`` are alphabetical.
  1124. ##
  1125. ## If ``skipNonAlpha`` is false, returns true only if all runes in
  1126. ## ``s`` are alphabetical and upper case.
  1127. ##
  1128. ## For either value of ``skipNonAlpha``, returns false if ``s`` is
  1129. ## an empty string.
  1130. runeCaseCheck(s, isUpper, skipNonAlpha)
  1131. proc isTitle*(s: string): bool {.noSideEffect, procvar, rtl, extern: "nuc$1Str",
  1132. deprecated: "Deprecated since version 0.20 since its semantics are unclear".} =
  1133. ## **Deprecated since version 0.20 since its semantics are unclear**
  1134. ##
  1135. ## Checks whether or not ``s`` is a unicode title.
  1136. ##
  1137. ## Returns true if the first character in each word inside ``s``
  1138. ## are upper case and there is at least one character in ``s``.
  1139. if s.len == 0:
  1140. return false
  1141. result = true
  1142. var
  1143. i = 0
  1144. rune: Rune
  1145. var firstRune = true
  1146. while i < len(s) and result:
  1147. fastRuneAt(s, i, rune, doInc = true)
  1148. if not rune.isWhiteSpace() and firstRune:
  1149. result = rune.isUpper() and result
  1150. firstRune = false
  1151. elif rune.isWhiteSpace():
  1152. firstRune = true
  1153. when isMainModule:
  1154. proc asRune(s: static[string]): Rune =
  1155. ## Compile-time conversion proc for converting string literals to a Rune
  1156. ## value. Returns the first Rune of the specified string.
  1157. ##
  1158. ## Shortcuts code like ``"å".runeAt(0)`` to ``"å".asRune`` and returns a
  1159. ## compile-time constant.
  1160. if s.len == 0: Rune(0)
  1161. else: s.runeAt(0)
  1162. let
  1163. someString = "öÑ"
  1164. someRunes = toRunes(someString)
  1165. compared = (someString == $someRunes)
  1166. doAssert compared == true
  1167. proc testReplacements(word: string): string =
  1168. case word
  1169. of "two":
  1170. return "2"
  1171. of "foo":
  1172. return "BAR"
  1173. of "βeta":
  1174. return "beta"
  1175. of "alpha":
  1176. return "αlpha"
  1177. else:
  1178. return "12345"
  1179. doAssert translate("two not alpha foo βeta", testReplacements) == "2 12345 αlpha BAR beta"
  1180. doAssert translate(" two not foo βeta ", testReplacements) == " 2 12345 BAR beta "
  1181. doAssert title("foo bar") == "Foo Bar"
  1182. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  1183. doAssert title("") == ""
  1184. doAssert capitalize("βeta") == "Βeta"
  1185. doAssert capitalize("foo") == "Foo"
  1186. doAssert capitalize("") == ""
  1187. doAssert swapCase("FooBar") == "fOObAR"
  1188. doAssert swapCase(" ") == " "
  1189. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  1190. doAssert swapCase("a✓B") == "A✓b"
  1191. doAssert swapCase("Јамогујестистаклоитоминештети") == "јАМОГУЈЕСТИСТАКЛОИТОМИНЕШТЕТИ"
  1192. doAssert swapCase("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") == "ὝΑΛΟΝΦΑΓΕῖΝΔΎΝΑΜΑΙΤΟῦΤΟΟὔΜΕΒΛΆΠΤΕΙ"
  1193. doAssert swapCase("Կրնամապակիուտեևինծիանհանգիստչըներ") == "կՐՆԱՄԱՊԱԿԻՈՒՏԵևԻՆԾԻԱՆՀԱՆԳԻՍՏՉԸՆԵՐ"
  1194. doAssert swapCase("") == ""
  1195. doAssert isAlpha("r")
  1196. doAssert isAlpha("α")
  1197. doAssert isAlpha("ϙ")
  1198. doAssert isAlpha("ஶ")
  1199. doAssert(not isAlpha("$"))
  1200. doAssert(not isAlpha(""))
  1201. doAssert isAlpha("Βeta")
  1202. doAssert isAlpha("Args")
  1203. doAssert isAlpha("𐌼𐌰𐌲𐌲𐌻𐌴𐍃𐍄𐌰𐌽")
  1204. doAssert isAlpha("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει")
  1205. doAssert isAlpha("Јамогујестистаклоитоминештети")
  1206. doAssert isAlpha("Կրնամապակիուտեևինծիանհանգիստչըներ")
  1207. doAssert(not isAlpha("$Foo✓"))
  1208. doAssert(not isAlpha("⠙⠕⠑⠎⠝⠞"))
  1209. doAssert isSpace("\t")
  1210. doAssert isSpace("\l")
  1211. doAssert(not isSpace("Β"))
  1212. doAssert(not isSpace("Βeta"))
  1213. doAssert isSpace("\t\l \v\r\f")
  1214. doAssert isSpace(" ")
  1215. doAssert(not isSpace(""))
  1216. doAssert(not isSpace("ΑΓc \td"))
  1217. doAssert(not isLower(' '.Rune))
  1218. doAssert(not isUpper(' '.Rune))
  1219. doAssert toUpper("Γ") == "Γ"
  1220. doAssert toUpper("b") == "B"
  1221. doAssert toUpper("α") == "Α"
  1222. doAssert toUpper("✓") == "✓"
  1223. doAssert toUpper("ϙ") == "Ϙ"
  1224. doAssert toUpper("") == ""
  1225. doAssert toUpper("ΑΒΓ") == "ΑΒΓ"
  1226. doAssert toUpper("AAccβ") == "AACCΒ"
  1227. doAssert toUpper("A✓$β") == "A✓$Β"
  1228. doAssert toLower("a") == "a"
  1229. doAssert toLower("γ") == "γ"
  1230. doAssert toLower("Γ") == "γ"
  1231. doAssert toLower("4") == "4"
  1232. doAssert toLower("Ϙ") == "ϙ"
  1233. doAssert toLower("") == ""
  1234. doAssert toLower("abcdγ") == "abcdγ"
  1235. doAssert toLower("abCDΓ") == "abcdγ"
  1236. doAssert toLower("33aaΓ") == "33aaγ"
  1237. doAssert reversed("Reverse this!") == "!siht esreveR"
  1238. doAssert reversed("先秦兩漢") == "漢兩秦先"
  1239. doAssert reversed("as⃝df̅") == "f̅ds⃝a"
  1240. doAssert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  1241. doAssert reversed("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") == "ιετπάλβεμὔοοτῦοτιαμανύδνῖεγαϕνολαὕ"
  1242. doAssert reversed("Јамогујестистаклоитоминештети") == "итетшенимотиолкатситсејугомаЈ"
  1243. doAssert reversed("Կրնամապակիուտեևինծիանհանգիստչըներ") == "րենըչտսիգնահնաիծնիևետւոիկապամանրԿ"
  1244. doAssert len(toRunes("as⃝df̅")) == runeLen("as⃝df̅")
  1245. const test = "as⃝"
  1246. doAssert lastRune(test, test.len-1)[1] == 3
  1247. doAssert graphemeLen("è", 0) == 2
  1248. # test for rune positioning and runeSubStr()
  1249. let s = "Hänsel ««: 10,00€"
  1250. var t = ""
  1251. for c in s.utf8:
  1252. t.add c
  1253. doAssert(s == t)
  1254. doAssert(runeReverseOffset(s, 1) == (20, 18))
  1255. doAssert(runeReverseOffset(s, 19) == (-1, 18))
  1256. doAssert(runeStrAtPos(s, 0) == "H")
  1257. doAssert(runeSubStr(s, 0, 1) == "H")
  1258. doAssert(runeStrAtPos(s, 10) == ":")
  1259. doAssert(runeSubStr(s, 10, 1) == ":")
  1260. doAssert(runeStrAtPos(s, 9) == "«")
  1261. doAssert(runeSubStr(s, 9, 1) == "«")
  1262. doAssert(runeStrAtPos(s, 17) == "€")
  1263. doAssert(runeSubStr(s, 17, 1) == "€")
  1264. # echo runeStrAtPos(s, 18) # index error
  1265. doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€")
  1266. doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€")
  1267. doAssert(runeSubStr(s, 10) == ": 10,00€")
  1268. doAssert(runeSubStr(s, 18) == "")
  1269. doAssert(runeSubStr(s, 0, 10) == "Hänsel ««")
  1270. doAssert(runeSubStr(s, 12) == "10,00€")
  1271. doAssert(runeSubStr(s, -6) == "10,00€")
  1272. doAssert(runeSubStr(s, 12, 5) == "10,00")
  1273. doAssert(runeSubStr(s, 12, -1) == "10,00")
  1274. doAssert(runeSubStr(s, -6, 5) == "10,00")
  1275. doAssert(runeSubStr(s, -6, -1) == "10,00")
  1276. doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€")
  1277. doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€")
  1278. doAssert(runeSubStr(s, 0, -100) == "")
  1279. doAssert(runeSubStr(s, 100, -100) == "")
  1280. block splitTests:
  1281. let s = " this is an example "
  1282. let s2 = ":this;is;an:example;;"
  1283. let s3 = ":this×is×an:example××"
  1284. doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
  1285. doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an",
  1286. "example", "", ""]
  1287. doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is",
  1288. "an", "example", "", ""]
  1289. doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example "]
  1290. doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example "]
  1291. block stripTests:
  1292. doAssert(strip("") == "")
  1293. doAssert(strip(" ") == "")
  1294. doAssert(strip("y") == "y")
  1295. doAssert(strip(" foofoofoo ") == "foofoofoo")
  1296. doAssert(strip("sfoofoofoos", runes = ['s'.Rune]) == "foofoofoo")
  1297. block:
  1298. let stripTestRunes = ['b'.Rune, 'a'.Rune, 'r'.Rune]
  1299. doAssert(strip("barfoofoofoobar", runes = stripTestRunes) == "foofoofoo")
  1300. doAssert(strip("sfoofoofoos", leading = false, runes = ['s'.Rune]) == "sfoofoofoo")
  1301. doAssert(strip("sfoofoofoos", trailing = false, runes = ['s'.Rune]) == "foofoofoos")
  1302. block:
  1303. let stripTestRunes = ["«".asRune, "»".asRune]
  1304. doAssert(strip("«TEXT»", runes = stripTestRunes) == "TEXT")
  1305. doAssert(strip("copyright©", leading = false, runes = ["©".asRune]) == "copyright")
  1306. doAssert(strip("¿Question?", trailing = false, runes = ["¿".asRune]) == "Question?")
  1307. doAssert(strip("×text×", leading = false, runes = ["×".asRune]) == "×text")
  1308. doAssert(strip("×text×", trailing = false, runes = ["×".asRune]) == "text×")
  1309. block repeatTests:
  1310. doAssert repeat('c'.Rune, 5) == "ccccc"
  1311. doAssert repeat("×".asRune, 5) == "×××××"
  1312. block alignTests:
  1313. doAssert align("abc", 4) == " abc"
  1314. doAssert align("a", 0) == "a"
  1315. doAssert align("1232", 6) == " 1232"
  1316. doAssert align("1232", 6, '#'.Rune) == "##1232"
  1317. doAssert align("1232", 6, "×".asRune) == "××1232"
  1318. doAssert alignLeft("abc", 4) == "abc "
  1319. doAssert alignLeft("a", 0) == "a"
  1320. doAssert alignLeft("1232", 6) == "1232 "
  1321. doAssert alignLeft("1232", 6, '#'.Rune) == "1232##"
  1322. doAssert alignLeft("1232", 6, "×".asRune) == "1232××"
  1323. block differentSizes:
  1324. # upper and lower variants have different number of bytes
  1325. doAssert toLower("AẞC") == "aßc"
  1326. doAssert toLower("ȺẞCD") == "ⱥßcd"
  1327. doAssert toUpper("ⱥbc") == "ȺBC"
  1328. doAssert toUpper("rsⱦuv") == "RSȾUV"
  1329. doAssert swapCase("ⱥbCd") == "ȺBcD"
  1330. doAssert swapCase("XyꟆaB") == "xYᶎAb"
  1331. doAssert swapCase("aᵹcᲈd") == "AꝽCꙊD"