unicode.nim 50 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module provides support to handle the Unicode UTF-8 encoding.
  10. ##
  11. ## There are no specialized ``insert``, ``delete``, ``add`` and ``contains``
  12. ## procedures for ``seq[Rune]`` in this module because the generic variants
  13. ## of these procedures in the system module already work with it.
  14. ##
  15. ## The current version is compatible with Unicode v12.0.0.
  16. ##
  17. ## **See also:**
  18. ## * `strutils module <strutils.html>`_
  19. ## * `unidecode module <unidecode.html>`_
  20. ## * `encodings module <encodings.html>`_
  21. include "system/inclrtl"
  22. import std/strbasics
  23. template toOa(s: string): auto = s.toOpenArray(0, s.high)
  24. proc substr(s: openArray[char] , first, last: int): string =
  25. # Copied substr from system
  26. let first = max(first, 0)
  27. let L = max(min(last, high(s)) - first + 1, 0)
  28. result = newString(L)
  29. for i in 0 .. L-1:
  30. result[i] = s[i+first]
  31. type
  32. RuneImpl = int32 # underlying type of Rune
  33. Rune* = distinct RuneImpl ## \
  34. ## Type that can hold a single Unicode code point.
  35. ##
  36. ## A Rune may be composed with other Runes to a character on the screen.
  37. ## `RuneImpl` is the underlying type used to store Runes, currently `int32`.
  38. template ones(n: untyped): untyped = ((1 shl n)-1)
  39. proc runeLen*(s: openArray[char]): int {.rtl, extern: "nuc$1".} =
  40. ## Returns the number of runes of the string ``s``.
  41. runnableExamples:
  42. let a = "añyóng"
  43. doAssert a.runeLen == 6
  44. ## note: a.len == 8
  45. result = 0
  46. var i = 0
  47. while i < len(s):
  48. if uint(s[i]) <= 127: inc(i)
  49. elif uint(s[i]) shr 5 == 0b110: inc(i, 2)
  50. elif uint(s[i]) shr 4 == 0b1110: inc(i, 3)
  51. elif uint(s[i]) shr 3 == 0b11110: inc(i, 4)
  52. elif uint(s[i]) shr 2 == 0b111110: inc(i, 5)
  53. elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6)
  54. else: inc i
  55. inc(result)
  56. proc runeLenAt*(s: openArray[char], i: Natural): int =
  57. ## Returns the number of bytes the rune starting at ``s[i]`` takes.
  58. ##
  59. ## See also:
  60. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  61. runnableExamples:
  62. let a = "añyóng"
  63. doAssert a.runeLenAt(0) == 1
  64. doAssert a.runeLenAt(1) == 2
  65. if uint(s[i]) <= 127: result = 1
  66. elif uint(s[i]) shr 5 == 0b110: result = 2
  67. elif uint(s[i]) shr 4 == 0b1110: result = 3
  68. elif uint(s[i]) shr 3 == 0b11110: result = 4
  69. elif uint(s[i]) shr 2 == 0b111110: result = 5
  70. elif uint(s[i]) shr 1 == 0b1111110: result = 6
  71. else: result = 1
  72. const replRune = Rune(0xFFFD)
  73. template fastRuneAt*(s: openArray[char] or string, i: int, result: untyped, doInc = true) =
  74. ## Returns the rune ``s[i]`` in ``result``.
  75. ##
  76. ## If ``doInc == true`` (default), ``i`` is incremented by the number
  77. ## of bytes that have been processed.
  78. bind ones
  79. if uint(s[i]) <= 127:
  80. result = Rune(uint(s[i]))
  81. when doInc: inc(i)
  82. elif uint(s[i]) shr 5 == 0b110:
  83. # assert(uint(s[i+1]) shr 6 == 0b10)
  84. if i <= s.len - 2:
  85. result = Rune((uint(s[i]) and (ones(5))) shl 6 or
  86. (uint(s[i+1]) and ones(6)))
  87. when doInc: inc(i, 2)
  88. else:
  89. result = replRune
  90. when doInc: inc(i)
  91. elif uint(s[i]) shr 4 == 0b1110:
  92. # assert(uint(s[i+1]) shr 6 == 0b10)
  93. # assert(uint(s[i+2]) shr 6 == 0b10)
  94. if i <= s.len - 3:
  95. result = Rune((uint(s[i]) and ones(4)) shl 12 or
  96. (uint(s[i+1]) and ones(6)) shl 6 or
  97. (uint(s[i+2]) and ones(6)))
  98. when doInc: inc(i, 3)
  99. else:
  100. result = replRune
  101. when doInc: inc(i)
  102. elif uint(s[i]) shr 3 == 0b11110:
  103. # assert(uint(s[i+1]) shr 6 == 0b10)
  104. # assert(uint(s[i+2]) shr 6 == 0b10)
  105. # assert(uint(s[i+3]) shr 6 == 0b10)
  106. if i <= s.len - 4:
  107. result = Rune((uint(s[i]) and ones(3)) shl 18 or
  108. (uint(s[i+1]) and ones(6)) shl 12 or
  109. (uint(s[i+2]) and ones(6)) shl 6 or
  110. (uint(s[i+3]) and ones(6)))
  111. when doInc: inc(i, 4)
  112. else:
  113. result = replRune
  114. when doInc: inc(i)
  115. elif uint(s[i]) shr 2 == 0b111110:
  116. # assert(uint(s[i+1]) shr 6 == 0b10)
  117. # assert(uint(s[i+2]) shr 6 == 0b10)
  118. # assert(uint(s[i+3]) shr 6 == 0b10)
  119. # assert(uint(s[i+4]) shr 6 == 0b10)
  120. if i <= s.len - 5:
  121. result = Rune((uint(s[i]) and ones(2)) shl 24 or
  122. (uint(s[i+1]) and ones(6)) shl 18 or
  123. (uint(s[i+2]) and ones(6)) shl 12 or
  124. (uint(s[i+3]) and ones(6)) shl 6 or
  125. (uint(s[i+4]) and ones(6)))
  126. when doInc: inc(i, 5)
  127. else:
  128. result = replRune
  129. when doInc: inc(i)
  130. elif uint(s[i]) shr 1 == 0b1111110:
  131. # assert(uint(s[i+1]) shr 6 == 0b10)
  132. # assert(uint(s[i+2]) shr 6 == 0b10)
  133. # assert(uint(s[i+3]) shr 6 == 0b10)
  134. # assert(uint(s[i+4]) shr 6 == 0b10)
  135. # assert(uint(s[i+5]) shr 6 == 0b10)
  136. if i <= s.len - 6:
  137. result = Rune((uint(s[i]) and ones(1)) shl 30 or
  138. (uint(s[i+1]) and ones(6)) shl 24 or
  139. (uint(s[i+2]) and ones(6)) shl 18 or
  140. (uint(s[i+3]) and ones(6)) shl 12 or
  141. (uint(s[i+4]) and ones(6)) shl 6 or
  142. (uint(s[i+5]) and ones(6)))
  143. when doInc: inc(i, 6)
  144. else:
  145. result = replRune
  146. when doInc: inc(i)
  147. else:
  148. result = Rune(uint(s[i]))
  149. when doInc: inc(i)
  150. proc runeAt*(s: openArray[char], i: Natural): Rune =
  151. ## Returns the rune in ``s`` at **byte index** ``i``.
  152. ##
  153. ## See also:
  154. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  155. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  156. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  157. runnableExamples:
  158. let a = "añyóng"
  159. doAssert a.runeAt(1) == "ñ".runeAt(0)
  160. doAssert a.runeAt(2) == "ñ".runeAt(1)
  161. doAssert a.runeAt(3) == "y".runeAt(0)
  162. fastRuneAt(s, i, result, false)
  163. proc validateUtf8*(s: openArray[char]): int =
  164. ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  165. ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  166. ##
  167. ## See also:
  168. ## * `toUTF8 proc <#toUTF8,Rune>`_
  169. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  170. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  171. var i = 0
  172. let L = s.len
  173. while i < L:
  174. if uint(s[i]) <= 127:
  175. inc(i)
  176. elif uint(s[i]) shr 5 == 0b110:
  177. if uint(s[i]) < 0xc2: return i # Catch overlong ascii representations.
  178. if i+1 < L and uint(s[i+1]) shr 6 == 0b10: inc(i, 2)
  179. else: return i
  180. elif uint(s[i]) shr 4 == 0b1110:
  181. if i+2 < L and uint(s[i+1]) shr 6 == 0b10 and uint(s[i+2]) shr 6 == 0b10:
  182. inc i, 3
  183. else: return i
  184. elif uint(s[i]) shr 3 == 0b11110:
  185. if i+3 < L and uint(s[i+1]) shr 6 == 0b10 and
  186. uint(s[i+2]) shr 6 == 0b10 and
  187. uint(s[i+3]) shr 6 == 0b10:
  188. inc i, 4
  189. else: return i
  190. else:
  191. return i
  192. return -1
  193. template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  194. ## Copies UTF-8 representation of ``c`` into the preallocated string ``s``
  195. ## starting at position ``pos``.
  196. ##
  197. ## If ``doInc == true`` (default), ``pos`` is incremented
  198. ## by the number of bytes that have been processed.
  199. ##
  200. ## To be the most efficient, make sure ``s`` is preallocated
  201. ## with an additional amount equal to the byte length of ``c``.
  202. ##
  203. ## See also:
  204. ## * `validateUtf8 proc <#validateUtf8,string>`_
  205. ## * `toUTF8 proc <#toUTF8,Rune>`_
  206. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  207. var i = RuneImpl(c)
  208. if i <=% 127:
  209. s.setLen(pos+1)
  210. s[pos+0] = chr(i)
  211. when doInc: inc(pos)
  212. elif i <=% 0x07FF:
  213. s.setLen(pos+2)
  214. s[pos+0] = chr((i shr 6) or 0b110_00000)
  215. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  216. when doInc: inc(pos, 2)
  217. elif i <=% 0xFFFF:
  218. s.setLen(pos+3)
  219. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  220. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  221. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  222. when doInc: inc(pos, 3)
  223. elif i <=% 0x001FFFFF:
  224. s.setLen(pos+4)
  225. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  226. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  227. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  228. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  229. when doInc: inc(pos, 4)
  230. elif i <=% 0x03FFFFFF:
  231. s.setLen(pos+5)
  232. s[pos+0] = chr(i shr 24 or 0b111110_00)
  233. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  234. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  235. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  236. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  237. when doInc: inc(pos, 5)
  238. elif i <=% 0x7FFFFFFF:
  239. s.setLen(pos+6)
  240. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  241. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  242. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  243. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  244. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  245. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
  246. when doInc: inc(pos, 6)
  247. else:
  248. discard # error, exception?
  249. proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  250. ## Converts a rune into its UTF-8 representation.
  251. ##
  252. ## See also:
  253. ## * `validateUtf8 proc <#validateUtf8,string>`_
  254. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  255. ## * `utf8 iterator <#utf8.i,string>`_
  256. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  257. runnableExamples:
  258. let a = "añyóng"
  259. doAssert a.runeAt(1).toUTF8 == "ñ"
  260. result = ""
  261. fastToUTF8Copy(c, result, 0, false)
  262. proc add*(s: var string; c: Rune) =
  263. ## Adds a rune ``c`` to a string ``s``.
  264. runnableExamples:
  265. var s = "abc"
  266. let c = "ä".runeAt(0)
  267. s.add(c)
  268. doAssert s == "abcä"
  269. let pos = s.len
  270. fastToUTF8Copy(c, s, pos, false)
  271. proc `$`*(rune: Rune): string =
  272. ## An alias for `toUTF8 <#toUTF8,Rune>`_.
  273. ##
  274. ## See also:
  275. ## * `validateUtf8 proc <#validateUtf8,string>`_
  276. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  277. rune.toUTF8
  278. proc `$`*(runes: seq[Rune]): string =
  279. ## Converts a sequence of Runes to a string.
  280. ##
  281. ## See also:
  282. ## * `toRunes <#toRunes,string>`_ for a reverse operation
  283. runnableExamples:
  284. let
  285. someString = "öÑ"
  286. someRunes = toRunes(someString)
  287. doAssert $someRunes == someString
  288. result = ""
  289. for rune in runes:
  290. result.add rune
  291. proc runeOffset*(s: openArray[char], pos: Natural, start: Natural = 0): int =
  292. ## Returns the byte position of rune
  293. ## at position ``pos`` in ``s`` with an optional start byte position.
  294. ## Returns the special value -1 if it runs out of the string.
  295. ##
  296. ## **Beware:** This can lead to unoptimized code and slow execution!
  297. ## Most problems can be solved more efficiently by using an iterator
  298. ## or conversion to a seq of Rune.
  299. ##
  300. ## See also:
  301. ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  302. runnableExamples:
  303. let a = "añyóng"
  304. doAssert a.runeOffset(1) == 1
  305. doAssert a.runeOffset(3) == 4
  306. doAssert a.runeOffset(4) == 6
  307. var
  308. i = 0
  309. o = start
  310. while i < pos:
  311. o += runeLenAt(s, o)
  312. if o >= s.len:
  313. return -1
  314. inc i
  315. return o
  316. proc runeReverseOffset*(s: openArray[char], rev: Positive): (int, int) =
  317. ## Returns a tuple with the byte offset of the
  318. ## rune at position ``rev`` in ``s``, counting
  319. ## from the end (starting with 1) and the total
  320. ## number of runes in the string.
  321. ##
  322. ## Returns a negative value for offset if there are too few runes in
  323. ## the string to satisfy the request.
  324. ##
  325. ## **Beware:** This can lead to unoptimized code and slow execution!
  326. ## Most problems can be solved more efficiently by using an iterator
  327. ## or conversion to a seq of Rune.
  328. ##
  329. ## See also:
  330. ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  331. var
  332. a = rev.int
  333. o = 0
  334. x = 0
  335. let times = 2*rev.int-s.runeLen # transformed from rev.int - a < s.runeLen - rev.int
  336. while o < s.len:
  337. let r = runeLenAt(s, o)
  338. o += r
  339. if a > times:
  340. x += r
  341. dec a
  342. result = if a > 0: (-a, rev.int-a) else: (x, -a+rev.int)
  343. proc runeAtPos*(s: openArray[char], pos: int): Rune =
  344. ## Returns the rune at position ``pos``.
  345. ##
  346. ## **Beware:** This can lead to unoptimized code and slow execution!
  347. ## Most problems can be solved more efficiently by using an iterator
  348. ## or conversion to a seq of Rune.
  349. ##
  350. ## See also:
  351. ## * `runeAt proc <#runeAt,string,Natural>`_
  352. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  353. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  354. fastRuneAt(s, runeOffset(s, pos), result, false)
  355. proc runeStrAtPos*(s: openArray[char], pos: Natural): string =
  356. ## Returns the rune at position ``pos`` as UTF8 String.
  357. ##
  358. ## **Beware:** This can lead to unoptimized code and slow execution!
  359. ## Most problems can be solved more efficiently by using an iterator
  360. ## or conversion to a seq of Rune.
  361. ##
  362. ## See also:
  363. ## * `runeAt proc <#runeAt,string,Natural>`_
  364. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  365. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  366. let o = runeOffset(s, pos)
  367. substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1)))
  368. proc runeSubStr*(s: openArray[char], pos: int, len: int = int.high): string =
  369. ## Returns the UTF-8 substring starting at code point ``pos``
  370. ## with ``len`` code points.
  371. ##
  372. ## If ``pos`` or ``len`` is negative they count from
  373. ## the end of the string. If ``len`` is not given it means the longest
  374. ## possible string.
  375. runnableExamples:
  376. let s = "Hänsel ««: 10,00€"
  377. doAssert(runeSubStr(s, 0, 2) == "Hä")
  378. doAssert(runeSubStr(s, 10, 1) == ":")
  379. doAssert(runeSubStr(s, -6) == "10,00€")
  380. doAssert(runeSubStr(s, 10) == ": 10,00€")
  381. doAssert(runeSubStr(s, 12, 5) == "10,00")
  382. doAssert(runeSubStr(s, -6, 3) == "10,")
  383. if pos < 0:
  384. let (o, rl) = runeReverseOffset(s, -pos)
  385. if len >= rl:
  386. result = s.substr(o, s.high)
  387. elif len < 0:
  388. let e = rl + len
  389. if e < 0:
  390. result = ""
  391. else:
  392. result = s.substr(o, runeOffset(s, e-(rl+pos), o)-1)
  393. else:
  394. result = s.substr(o, runeOffset(s, len, o)-1)
  395. else:
  396. let o = runeOffset(s, pos)
  397. if o < 0:
  398. result = ""
  399. elif len == int.high:
  400. result = s.substr(o, s.len-1)
  401. elif len < 0:
  402. let (e, rl) = runeReverseOffset(s, -len)
  403. discard rl
  404. if e <= 0:
  405. result = ""
  406. else:
  407. result = s.substr(o, e-1)
  408. else:
  409. var e = runeOffset(s, len, o)
  410. if e < 0:
  411. e = s.len
  412. result = s.substr(o, e-1)
  413. proc `<=%`*(a, b: Rune): bool =
  414. ## Checks if code point of `a` is smaller or equal to code point of `b`.
  415. runnableExamples:
  416. let
  417. a = "ú".runeAt(0)
  418. b = "ü".runeAt(0)
  419. doAssert a <=% b
  420. return int(a) <=% int(b)
  421. proc `<%`*(a, b: Rune): bool =
  422. ## Checks if code point of `a` is smaller than code point of `b`.
  423. runnableExamples:
  424. let
  425. a = "ú".runeAt(0)
  426. b = "ü".runeAt(0)
  427. doAssert a <% b
  428. return int(a) <% int(b)
  429. proc `==`*(a, b: Rune): bool =
  430. ## Checks if two runes are equal.
  431. return int(a) == int(b)
  432. include "includes/unicode_ranges"
  433. proc binarySearch(c: RuneImpl, tab: openArray[int32], len, stride: int): int =
  434. var n = len
  435. var t = 0
  436. while n > 1:
  437. var m = n div 2
  438. var p = t + m*stride
  439. if c >= tab[p]:
  440. t = p
  441. n = n-m
  442. else:
  443. n = m
  444. if n != 0 and c >= tab[t]:
  445. return t
  446. return -1
  447. proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  448. ## Converts ``c`` into lower case. This works for any rune.
  449. ##
  450. ## If possible, prefer ``toLower`` over ``toUpper``.
  451. ##
  452. ## See also:
  453. ## * `toUpper proc <#toUpper,Rune>`_
  454. ## * `toTitle proc <#toTitle,Rune>`_
  455. ## * `isLower proc <#isLower,Rune>`_
  456. var c = RuneImpl(c)
  457. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  458. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  459. return Rune(c + toLowerRanges[p+2] - 500)
  460. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  461. if p >= 0 and c == toLowerSinglets[p]:
  462. return Rune(c + toLowerSinglets[p+1] - 500)
  463. return Rune(c)
  464. proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  465. ## Converts ``c`` into upper case. This works for any rune.
  466. ##
  467. ## If possible, prefer ``toLower`` over ``toUpper``.
  468. ##
  469. ## See also:
  470. ## * `toLower proc <#toLower,Rune>`_
  471. ## * `toTitle proc <#toTitle,Rune>`_
  472. ## * `isUpper proc <#isUpper,Rune>`_
  473. var c = RuneImpl(c)
  474. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  475. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  476. return Rune(c + toUpperRanges[p+2] - 500)
  477. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  478. if p >= 0 and c == toUpperSinglets[p]:
  479. return Rune(c + toUpperSinglets[p+1] - 500)
  480. return Rune(c)
  481. proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  482. ## Converts ``c`` to title case.
  483. ##
  484. ## See also:
  485. ## * `toLower proc <#toLower,Rune>`_
  486. ## * `toUpper proc <#toUpper,Rune>`_
  487. ## * `isTitle proc <#isTitle,Rune>`_
  488. var c = RuneImpl(c)
  489. var p = binarySearch(c, toTitleSinglets, len(toTitleSinglets) div 2, 2)
  490. if p >= 0 and c == toTitleSinglets[p]:
  491. return Rune(c + toTitleSinglets[p+1] - 500)
  492. return Rune(c)
  493. proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  494. ## Returns true if ``c`` is a lower case rune.
  495. ##
  496. ## If possible, prefer ``isLower`` over ``isUpper``.
  497. ##
  498. ## See also:
  499. ## * `toLower proc <#toLower,Rune>`_
  500. ## * `isUpper proc <#isUpper,Rune>`_
  501. ## * `isTitle proc <#isTitle,Rune>`_
  502. var c = RuneImpl(c)
  503. # Note: toUpperRanges is correct here!
  504. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  505. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  506. return true
  507. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  508. if p >= 0 and c == toUpperSinglets[p]:
  509. return true
  510. else:
  511. return false
  512. proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  513. ## Returns true if ``c`` is a upper case rune.
  514. ##
  515. ## If possible, prefer ``isLower`` over ``isUpper``.
  516. ##
  517. ## See also:
  518. ## * `toUpper proc <#toUpper,Rune>`_
  519. ## * `isLower proc <#isLower,Rune>`_
  520. ## * `isTitle proc <#isTitle,Rune>`_
  521. ## * `isAlpha proc <#isAlpha,Rune>`_
  522. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  523. var c = RuneImpl(c)
  524. # Note: toLowerRanges is correct here!
  525. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  526. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  527. return true
  528. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  529. if p >= 0 and c == toLowerSinglets[p]:
  530. return true
  531. else:
  532. return false
  533. proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  534. ## Returns true if ``c`` is an *alpha* rune (i.e., a letter).
  535. ##
  536. ## See also:
  537. ## * `isLower proc <#isLower,Rune>`_
  538. ## * `isTitle proc <#isTitle,Rune>`_
  539. ## * `isAlpha proc <#isAlpha,Rune>`_
  540. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  541. ## * `isCombining proc <#isCombining,Rune>`_
  542. if isUpper(c) or isLower(c):
  543. return true
  544. var c = RuneImpl(c)
  545. var p = binarySearch(c, alphaRanges, len(alphaRanges) div 2, 2)
  546. if p >= 0 and c >= alphaRanges[p] and c <= alphaRanges[p+1]:
  547. return true
  548. p = binarySearch(c, alphaSinglets, len(alphaSinglets), 1)
  549. if p >= 0 and c == alphaSinglets[p]:
  550. return true
  551. else:
  552. return false
  553. proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  554. ## Returns true if ``c`` is a Unicode titlecase code point.
  555. ##
  556. ## See also:
  557. ## * `toTitle proc <#toTitle,Rune>`_
  558. ## * `isLower proc <#isLower,Rune>`_
  559. ## * `isUpper proc <#isUpper,Rune>`_
  560. ## * `isAlpha proc <#isAlpha,Rune>`_
  561. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  562. return isUpper(c) and isLower(c)
  563. proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  564. ## Returns true if ``c`` is a Unicode whitespace code point.
  565. ##
  566. ## See also:
  567. ## * `isLower proc <#isLower,Rune>`_
  568. ## * `isUpper proc <#isUpper,Rune>`_
  569. ## * `isTitle proc <#isTitle,Rune>`_
  570. ## * `isAlpha proc <#isAlpha,Rune>`_
  571. var c = RuneImpl(c)
  572. var p = binarySearch(c, spaceRanges, len(spaceRanges) div 2, 2)
  573. if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]:
  574. return true
  575. else:
  576. return false
  577. proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  578. ## Returns true if ``c`` is a Unicode combining code unit.
  579. ##
  580. ## See also:
  581. ## * `isLower proc <#isLower,Rune>`_
  582. ## * `isUpper proc <#isUpper,Rune>`_
  583. ## * `isTitle proc <#isTitle,Rune>`_
  584. ## * `isAlpha proc <#isAlpha,Rune>`_
  585. var c = RuneImpl(c)
  586. # Optimized to return false immediately for ASCII
  587. return c >= 0x0300 and (c <= 0x036f or
  588. (c >= 0x1ab0 and c <= 0x1aff) or
  589. (c >= 0x1dc0 and c <= 0x1dff) or
  590. (c >= 0x20d0 and c <= 0x20ff) or
  591. (c >= 0xfe20 and c <= 0xfe2f))
  592. template runeCheck(s, runeProc) =
  593. ## Common code for isAlpha and isSpace.
  594. result = if len(s) == 0: false else: true
  595. var
  596. i = 0
  597. rune: Rune
  598. while i < len(s) and result:
  599. fastRuneAt(s, i, rune, doInc = true)
  600. result = runeProc(rune) and result
  601. proc isAlpha*(s: openArray[char]): bool {.noSideEffect,
  602. rtl, extern: "nuc$1Str".} =
  603. ## Returns true if ``s`` contains all alphabetic runes.
  604. runnableExamples:
  605. let a = "añyóng"
  606. doAssert a.isAlpha
  607. runeCheck(s, isAlpha)
  608. proc isSpace*(s: openArray[char]): bool {.noSideEffect,
  609. rtl, extern: "nuc$1Str".} =
  610. ## Returns true if ``s`` contains all whitespace runes.
  611. runnableExamples:
  612. let a = "\t\l \v\r\f"
  613. doAssert a.isSpace
  614. runeCheck(s, isWhiteSpace)
  615. template convertRune(s, runeProc) =
  616. ## Convert runes in ``s`` using ``runeProc`` as the converter.
  617. result = newString(len(s))
  618. var
  619. i = 0
  620. resultIndex = 0
  621. rune: Rune
  622. while i < len(s):
  623. fastRuneAt(s, i, rune, doInc = true)
  624. rune = runeProc(rune)
  625. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  626. proc toUpper*(s: openArray[char]): string {.noSideEffect,
  627. rtl, extern: "nuc$1Str".} =
  628. ## Converts ``s`` into upper-case runes.
  629. runnableExamples:
  630. doAssert toUpper("abγ") == "ABΓ"
  631. convertRune(s, toUpper)
  632. proc toLower*(s: openArray[char]): string {.noSideEffect,
  633. rtl, extern: "nuc$1Str".} =
  634. ## Converts ``s`` into lower-case runes.
  635. runnableExamples:
  636. doAssert toLower("ABΓ") == "abγ"
  637. convertRune(s, toLower)
  638. proc swapCase*(s: openArray[char]): string {.noSideEffect,
  639. rtl, extern: "nuc$1".} =
  640. ## Swaps the case of runes in ``s``.
  641. ##
  642. ## Returns a new string such that the cases of all runes
  643. ## are swapped if possible.
  644. runnableExamples:
  645. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  646. var
  647. i = 0
  648. resultIndex = 0
  649. rune: Rune
  650. result = newString(len(s))
  651. while i < len(s):
  652. fastRuneAt(s, i, rune)
  653. if rune.isUpper():
  654. rune = rune.toLower()
  655. elif rune.isLower():
  656. rune = rune.toUpper()
  657. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  658. proc capitalize*(s: openArray[char]): string {.noSideEffect,
  659. rtl, extern: "nuc$1".} =
  660. ## Converts the first character of ``s`` into an upper-case rune.
  661. runnableExamples:
  662. doAssert capitalize("βeta") == "Βeta"
  663. if len(s) == 0:
  664. return ""
  665. var
  666. rune: Rune
  667. i = 0
  668. fastRuneAt(s, i, rune, doInc = true)
  669. result = $toUpper(rune) & substr(s.toOpenArray(i, s.high))
  670. when not defined(nimHasEffectsOf):
  671. {.pragma: effectsOf.}
  672. proc translate*(s: openArray[char], replacements: proc(key: string): string): string {.
  673. rtl, extern: "nuc$1", effectsOf: replacements.} =
  674. ## Translates words in a string using the ``replacements`` proc to substitute
  675. ## words inside ``s`` with their replacements.
  676. ##
  677. ## ``replacements`` is any proc that takes a word and returns
  678. ## a new word to fill it's place.
  679. runnableExamples:
  680. proc wordToNumber(s: string): string =
  681. case s
  682. of "one": "1"
  683. of "two": "2"
  684. else: s
  685. let a = "one two three four"
  686. doAssert a.translate(wordToNumber) == "1 2 three four"
  687. # Allocate memory for the new string based on the old one.
  688. # If the new string length is less than the old, no allocations
  689. # will be needed. If the new string length is greater than the
  690. # old, then maybe only one allocation is needed
  691. result = newStringOfCap(s.len)
  692. var
  693. index = 0
  694. lastIndex = 0
  695. wordStart = 0
  696. inWord = false
  697. rune: Rune
  698. while index < len(s):
  699. lastIndex = index
  700. fastRuneAt(s, index, rune)
  701. let whiteSpace = rune.isWhiteSpace()
  702. if whiteSpace and inWord:
  703. # If we've reached the end of a word
  704. let word = substr(s.toOpenArray(wordStart, lastIndex - 1))
  705. result.add(replacements(word))
  706. result.add($rune)
  707. inWord = false
  708. elif not whiteSpace and not inWord:
  709. # If we've hit a non space character and
  710. # are not currently in a word, track
  711. # the starting index of the word
  712. inWord = true
  713. wordStart = lastIndex
  714. elif whiteSpace:
  715. result.add($rune)
  716. if wordStart < len(s) and inWord:
  717. # Get the trailing word at the end
  718. let word = substr(s.toOpenArray(wordStart, s.high))
  719. result.add(replacements(word))
  720. proc title*(s: openArray[char]): string {.noSideEffect,
  721. rtl, extern: "nuc$1".} =
  722. ## Converts ``s`` to a unicode title.
  723. ##
  724. ## Returns a new string such that the first character
  725. ## in each word inside ``s`` is capitalized.
  726. runnableExamples:
  727. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  728. var
  729. i = 0
  730. resultIndex = 0
  731. rune: Rune
  732. result = newString(len(s))
  733. var firstRune = true
  734. while i < len(s):
  735. fastRuneAt(s, i, rune)
  736. if not rune.isWhiteSpace() and firstRune:
  737. rune = rune.toUpper()
  738. firstRune = false
  739. elif rune.isWhiteSpace():
  740. firstRune = true
  741. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  742. iterator runes*(s: openArray[char]): Rune =
  743. ## Iterates over any rune of the string ``s`` returning runes.
  744. var
  745. i = 0
  746. result: Rune
  747. while i < len(s):
  748. fastRuneAt(s, i, result, true)
  749. yield result
  750. iterator utf8*(s: openArray[char]): string =
  751. ## Iterates over any rune of the string ``s`` returning utf8 values.
  752. ##
  753. ## See also:
  754. ## * `validateUtf8 proc <#validateUtf8,string>`_
  755. ## * `toUTF8 proc <#toUTF8,Rune>`_
  756. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  757. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  758. var o = 0
  759. while o < s.len:
  760. let n = runeLenAt(s, o)
  761. yield substr(s.toOpenArray(o, (o+n-1)))
  762. o += n
  763. proc toRunes*(s: openArray[char]): seq[Rune] =
  764. ## Obtains a sequence containing the Runes in ``s``.
  765. ##
  766. ## See also:
  767. ## * `$ proc <#$,Rune>`_ for a reverse operation
  768. runnableExamples:
  769. let a = toRunes("aáä")
  770. doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]
  771. result = newSeq[Rune]()
  772. for r in s.runes:
  773. result.add(r)
  774. proc cmpRunesIgnoreCase*(a, b: openArray[char]): int {.rtl, extern: "nuc$1".} =
  775. ## Compares two UTF-8 strings and ignores the case. Returns:
  776. ##
  777. ## | `0` if a == b
  778. ## | `< 0` if a < b
  779. ## | `> 0` if a > b
  780. var i = 0
  781. var j = 0
  782. var ar, br: Rune
  783. while i < a.len and j < b.len:
  784. # slow path:
  785. fastRuneAt(a, i, ar)
  786. fastRuneAt(b, j, br)
  787. when sizeof(int) < 4:
  788. const lo = low(int).int32
  789. const hi = high(int).int32
  790. result = clamp(RuneImpl(toLower(ar)) - RuneImpl(toLower(br)), lo, hi).int
  791. else:
  792. result = RuneImpl(toLower(ar)) - RuneImpl(toLower(br))
  793. if result != 0: return
  794. result = a.len - b.len
  795. proc reversed*(s: openArray[char]): string =
  796. ## Returns the reverse of ``s``, interpreting it as runes.
  797. ##
  798. ## Unicode combining characters are correctly interpreted as well.
  799. runnableExamples:
  800. assert reversed("Reverse this!") == "!siht esreveR"
  801. assert reversed("先秦兩漢") == "漢兩秦先"
  802. assert reversed("as⃝df̅") == "f̅ds⃝a"
  803. assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  804. var
  805. i = 0
  806. lastI = 0
  807. newPos = len(s) - 1
  808. blockPos = 0
  809. r: Rune
  810. template reverseUntil(pos) =
  811. var j = pos - 1
  812. while j > blockPos:
  813. result[newPos] = s[j]
  814. dec j
  815. dec newPos
  816. blockPos = pos - 1
  817. result = newString(len(s))
  818. while i < len(s):
  819. lastI = i
  820. fastRuneAt(s, i, r, true)
  821. if not isCombining(r):
  822. reverseUntil(lastI)
  823. reverseUntil(len(s))
  824. proc graphemeLen*(s: openArray[char]; i: Natural): Natural =
  825. ## The number of bytes belonging to byte index ``s[i]``,
  826. ## including following combining code units.
  827. runnableExamples:
  828. let a = "añyóng"
  829. doAssert a.graphemeLen(1) == 2 ## ñ
  830. doAssert a.graphemeLen(2) == 1
  831. doAssert a.graphemeLen(4) == 2 ## ó
  832. result = 0
  833. var j = i.int
  834. var r, r2: Rune
  835. if j < s.len:
  836. fastRuneAt(s, j, r, true)
  837. result = j-i
  838. while j < s.len:
  839. fastRuneAt(s, j, r2, true)
  840. if not isCombining(r2): break
  841. result = j-i
  842. proc lastRune*(s: openArray[char]; last: int): (Rune, int) =
  843. ## Length of the last rune in ``s[0..last]``. Returns the rune and its length
  844. ## in bytes.
  845. if s[last] <= chr(127):
  846. result = (Rune(s[last]), 1)
  847. else:
  848. var L = 0
  849. while last-L >= 0 and uint(s[last-L]) shr 6 == 0b10: inc(L)
  850. var r: Rune
  851. fastRuneAt(s, last-L, r, false)
  852. result = (r, L+1)
  853. proc size*(r: Rune): int {.noSideEffect.} =
  854. ## Returns the number of bytes the rune ``r`` takes.
  855. runnableExamples:
  856. let a = toRunes "aá"
  857. doAssert size(a[0]) == 1
  858. doAssert size(a[1]) == 2
  859. let v = r.uint32
  860. if v <= 0x007F'u32: result = 1
  861. elif v <= 0x07FF'u32: result = 2
  862. elif v <= 0xFFFF'u32: result = 3
  863. elif v <= 0x1FFFFF'u32: result = 4
  864. elif v <= 0x3FFFFFF'u32: result = 5
  865. elif v <= 0x7FFFFFFF'u32: result = 6
  866. else: result = 1
  867. # --------- Private templates for different split separators -----------
  868. proc stringHasSep(s: openArray[char], index: int, seps: openArray[Rune]): bool =
  869. var rune: Rune
  870. fastRuneAt(s, index, rune, false)
  871. return seps.contains(rune)
  872. proc stringHasSep(s: openArray[char], index: int, sep: Rune): bool =
  873. var rune: Rune
  874. fastRuneAt(s, index, rune, false)
  875. return sep == rune
  876. template splitCommon(s, sep, maxsplit: untyped) =
  877. ## Common code for split procedures.
  878. let
  879. sLen = len(s)
  880. var
  881. last = 0
  882. splits = maxsplit
  883. if sLen > 0:
  884. while last <= sLen:
  885. var first = last
  886. while last < sLen and not stringHasSep(s, last, sep):
  887. inc(last, runeLenAt(s, last))
  888. if splits == 0: last = sLen
  889. yield substr(s.toOpenArray(first, (last - 1)))
  890. if splits == 0: break
  891. dec(splits)
  892. inc(last, if last < sLen: runeLenAt(s, last) else: 1)
  893. iterator split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces,
  894. maxsplit: int = -1): string =
  895. ## Splits the unicode string ``s`` into substrings using a group of separators.
  896. ##
  897. ## Substrings are separated by a substring containing only ``seps``.
  898. runnableExamples:
  899. import std/sequtils
  900. assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
  901. @["hÃllo", "this", "is", "an", "example", "是"]
  902. # And the following code splits the same string using a sequence of Runes.
  903. assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
  904. @["añyóng", "hÃllo", "是", "example"]
  905. # example with a `Rune` separator and unused one `;`:
  906. assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]
  907. # Another example that splits a string containing a date.
  908. let date = "2012-11-20T22:08:08.398990"
  909. assert toSeq(split(date, " -:T".toRunes)) ==
  910. @["2012", "11", "20", "22", "08", "08.398990"]
  911. splitCommon(s, seps, maxsplit)
  912. iterator splitWhitespace*(s: openArray[char]): string =
  913. ## Splits a unicode string at whitespace runes.
  914. splitCommon(s, unicodeSpaces, -1)
  915. template accResult(iter: untyped) =
  916. result = @[]
  917. for x in iter: add(result, x)
  918. proc splitWhitespace*(s: openArray[char]): seq[string] {.noSideEffect,
  919. rtl, extern: "ncuSplitWhitespace".} =
  920. ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_
  921. ## iterator, but is a proc that returns a sequence of substrings.
  922. accResult(splitWhitespace(s))
  923. iterator split*(s: openArray[char], sep: Rune, maxsplit: int = -1): string =
  924. ## Splits the unicode string ``s`` into substrings using a single separator.
  925. ## Substrings are separated by the rune ``sep``.
  926. runnableExamples:
  927. import std/sequtils
  928. assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
  929. @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"]
  930. splitCommon(s, sep, maxsplit)
  931. proc split*(s: openArray[char], seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
  932. seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} =
  933. ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_,
  934. ## but is a proc that returns a sequence of substrings.
  935. accResult(split(s, seps, maxsplit))
  936. proc split*(s: openArray[char], sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
  937. rtl, extern: "nucSplitRune".} =
  938. ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc
  939. ## that returns a sequence of substrings.
  940. accResult(split(s, sep, maxsplit))
  941. proc strip*(s: openArray[char], leading = true, trailing = true,
  942. runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
  943. rtl, extern: "nucStrip".} =
  944. ## Strips leading or trailing ``runes`` from ``s`` and returns
  945. ## the resulting string.
  946. ##
  947. ## If ``leading`` is true (default), leading ``runes`` are stripped.
  948. ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
  949. ## If both are false, the string is returned unchanged.
  950. runnableExamples:
  951. let a = "\táñyóng "
  952. doAssert a.strip == "áñyóng"
  953. doAssert a.strip(leading = false) == "\táñyóng"
  954. doAssert a.strip(trailing = false) == "áñyóng "
  955. var
  956. sI = 0 ## starting index into string ``s``
  957. eI = len(s) - 1 ## ending index into ``s``, where the last ``Rune`` starts
  958. if leading:
  959. var
  960. i = 0
  961. xI: int ## value of ``sI`` at the beginning of the iteration
  962. rune: Rune
  963. while i < len(s):
  964. xI = i
  965. fastRuneAt(s, i, rune)
  966. sI = i # Assume to start from next rune
  967. if not runes.contains(rune):
  968. sI = xI # Go back to where the current rune starts
  969. break
  970. if trailing:
  971. var
  972. i = eI
  973. xI: int
  974. rune: Rune
  975. while i >= 0:
  976. xI = i
  977. fastRuneAt(s, xI, rune)
  978. var yI = i - 1
  979. while yI >= 0:
  980. var
  981. yIend = yI
  982. pRune: Rune
  983. fastRuneAt(s, yIend, pRune)
  984. if yIend < xI: break
  985. i = yI
  986. rune = pRune
  987. dec(yI)
  988. if not runes.contains(rune):
  989. eI = xI - 1
  990. break
  991. dec(i)
  992. let newLen = eI - sI + 1
  993. result = newStringOfCap(newLen)
  994. if newLen > 0:
  995. result.add substr(s.toOpenArray(sI, eI))
  996. proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
  997. rtl, extern: "nucRepeatRune".} =
  998. ## Returns a string of ``count`` Runes ``c``.
  999. ##
  1000. ## The returned string will have a rune-length of ``count``.
  1001. runnableExamples:
  1002. let a = "ñ".runeAt(0)
  1003. doAssert a.repeat(5) == "ñññññ"
  1004. let s = $c
  1005. result = newStringOfCap(count * s.len)
  1006. for i in 0 ..< count:
  1007. result.add s
  1008. proc align*(s: openArray[char], count: Natural, padding = ' '.Rune): string {.
  1009. noSideEffect, rtl, extern: "nucAlignString".} =
  1010. ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
  1011. ## of ``count``.
  1012. ##
  1013. ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
  1014. ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  1015. ## returned unchanged. If you need to left align a string use the `alignLeft
  1016. ## proc <#alignLeft,string,Natural>`_.
  1017. runnableExamples:
  1018. assert align("abc", 4) == " abc"
  1019. assert align("a", 0) == "a"
  1020. assert align("1232", 6) == " 1232"
  1021. assert align("1232", 6, '#'.Rune) == "##1232"
  1022. assert align("Åge", 5) == " Åge"
  1023. assert align("×", 4, '_'.Rune) == "___×"
  1024. let sLen = s.runeLen
  1025. if sLen < count:
  1026. let padStr = $padding
  1027. result = newStringOfCap(padStr.len * count)
  1028. let spaces = count - sLen
  1029. for i in 0 ..< spaces: result.add padStr
  1030. result.add s
  1031. else:
  1032. result = s.substr
  1033. proc alignLeft*(s: openArray[char], count: Natural, padding = ' '.Rune): string {.
  1034. noSideEffect.} =
  1035. ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a
  1036. ## rune-length of ``count``.
  1037. ##
  1038. ## ``padding`` characters (by default spaces) are added after ``s`` resulting in
  1039. ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  1040. ## returned unchanged. If you need to right align a string use the `align
  1041. ## proc <#align,string,Natural>`_.
  1042. runnableExamples:
  1043. assert alignLeft("abc", 4) == "abc "
  1044. assert alignLeft("a", 0) == "a"
  1045. assert alignLeft("1232", 6) == "1232 "
  1046. assert alignLeft("1232", 6, '#'.Rune) == "1232##"
  1047. assert alignLeft("Åge", 5) == "Åge "
  1048. assert alignLeft("×", 4, '_'.Rune) == "×___"
  1049. let sLen = s.runeLen
  1050. if sLen < count:
  1051. let padStr = $padding
  1052. result = newStringOfCap(s.len + (count - sLen) * padStr.len)
  1053. result.add s
  1054. for i in sLen ..< count:
  1055. result.add padStr
  1056. else:
  1057. result = s.substr
  1058. proc runeLen*(s: string): int {.inline.} =
  1059. ## Returns the number of runes of the string ``s``.
  1060. runnableExamples:
  1061. let a = "añyóng"
  1062. doAssert a.runeLen == 6
  1063. ## note: a.len == 8
  1064. runeLen(toOa(s))
  1065. proc runeLenAt*(s: string, i: Natural): int {.inline.} =
  1066. ## Returns the number of bytes the rune starting at ``s[i]`` takes.
  1067. ##
  1068. ## See also:
  1069. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  1070. runnableExamples:
  1071. let a = "añyóng"
  1072. doAssert a.runeLenAt(0) == 1
  1073. doAssert a.runeLenAt(1) == 2
  1074. runeLenAt(toOa(s), i)
  1075. proc runeAt*(s: string, i: Natural): Rune {.inline.} =
  1076. ## Returns the rune in ``s`` at **byte index** ``i``.
  1077. ##
  1078. ## See also:
  1079. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  1080. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  1081. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  1082. runnableExamples:
  1083. let a = "añyóng"
  1084. doAssert a.runeAt(1) == "ñ".runeAt(0)
  1085. doAssert a.runeAt(2) == "ñ".runeAt(1)
  1086. doAssert a.runeAt(3) == "y".runeAt(0)
  1087. fastRuneAt(s, i, result, false)
  1088. proc validateUtf8*(s: string): int {.inline.} =
  1089. ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  1090. ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  1091. ##
  1092. ## See also:
  1093. ## * `toUTF8 proc <#toUTF8,Rune>`_
  1094. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  1095. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  1096. validateUtf8(toOa(s))
  1097. proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int {.inline.} =
  1098. ## Returns the byte position of rune
  1099. ## at position ``pos`` in ``s`` with an optional start byte position.
  1100. ## Returns the special value -1 if it runs out of the string.
  1101. ##
  1102. ## **Beware:** This can lead to unoptimized code and slow execution!
  1103. ## Most problems can be solved more efficiently by using an iterator
  1104. ## or conversion to a seq of Rune.
  1105. ##
  1106. ## See also:
  1107. ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  1108. runnableExamples:
  1109. let a = "añyóng"
  1110. doAssert a.runeOffset(1) == 1
  1111. doAssert a.runeOffset(3) == 4
  1112. doAssert a.runeOffset(4) == 6
  1113. runeOffset(toOa(s), pos, start)
  1114. proc runeReverseOffset*(s: string, rev: Positive): (int, int) {.inline.} =
  1115. ## Returns a tuple with the byte offset of the
  1116. ## rune at position ``rev`` in ``s``, counting
  1117. ## from the end (starting with 1) and the total
  1118. ## number of runes in the string.
  1119. ##
  1120. ## Returns a negative value for offset if there are too few runes in
  1121. ## the string to satisfy the request.
  1122. ##
  1123. ## **Beware:** This can lead to unoptimized code and slow execution!
  1124. ## Most problems can be solved more efficiently by using an iterator
  1125. ## or conversion to a seq of Rune.
  1126. ##
  1127. ## See also:
  1128. ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  1129. runeReverseOffset(toOa(s), rev)
  1130. proc runeAtPos*(s: string, pos: int): Rune {.inline.} =
  1131. ## Returns the rune at position ``pos``.
  1132. ##
  1133. ## **Beware:** This can lead to unoptimized code and slow execution!
  1134. ## Most problems can be solved more efficiently by using an iterator
  1135. ## or conversion to a seq of Rune.
  1136. ##
  1137. ## See also:
  1138. ## * `runeAt proc <#runeAt,string,Natural>`_
  1139. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  1140. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  1141. fastRuneAt(toOa(s), runeOffset(s, pos), result, false)
  1142. proc runeStrAtPos*(s: string, pos: Natural): string {.inline.} =
  1143. ## Returns the rune at position ``pos`` as UTF8 String.
  1144. ##
  1145. ## **Beware:** This can lead to unoptimized code and slow execution!
  1146. ## Most problems can be solved more efficiently by using an iterator
  1147. ## or conversion to a seq of Rune.
  1148. ##
  1149. ## See also:
  1150. ## * `runeAt proc <#runeAt,string,Natural>`_
  1151. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  1152. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  1153. let o = runeOffset(s, pos)
  1154. substr(s.toOpenArray(o, (o+runeLenAt(s, o)-1)))
  1155. proc runeSubStr*(s: string, pos: int, len: int = int.high): string {.inline.} =
  1156. ## Returns the UTF-8 substring starting at code point ``pos``
  1157. ## with ``len`` code points.
  1158. ##
  1159. ## If ``pos`` or ``len`` is negative they count from
  1160. ## the end of the string. If ``len`` is not given it means the longest
  1161. ## possible string.
  1162. runnableExamples:
  1163. let s = "Hänsel ««: 10,00€"
  1164. doAssert(runeSubStr(s, 0, 2) == "Hä")
  1165. doAssert(runeSubStr(s, 10, 1) == ":")
  1166. doAssert(runeSubStr(s, -6) == "10,00€")
  1167. doAssert(runeSubStr(s, 10) == ": 10,00€")
  1168. doAssert(runeSubStr(s, 12, 5) == "10,00")
  1169. doAssert(runeSubStr(s, -6, 3) == "10,")
  1170. runeSubStr(toOa(s), pos, len)
  1171. proc isAlpha*(s: string): bool {.noSideEffect, inline.} =
  1172. ## Returns true if ``s`` contains all alphabetic runes.
  1173. runnableExamples:
  1174. let a = "añyóng"
  1175. doAssert a.isAlpha
  1176. isAlpha(toOa(s))
  1177. proc isSpace*(s: string): bool {.noSideEffect, inline.} =
  1178. ## Returns true if ``s`` contains all whitespace runes.
  1179. runnableExamples:
  1180. let a = "\t\l \v\r\f"
  1181. doAssert a.isSpace
  1182. isSpace(toOa(s))
  1183. proc toUpper*(s: string): string {.noSideEffect, inline.} =
  1184. ## Converts ``s`` into upper-case runes.
  1185. runnableExamples:
  1186. doAssert toUpper("abγ") == "ABΓ"
  1187. toUpper(toOa(s))
  1188. proc toLower*(s: string): string {.noSideEffect, inline.} =
  1189. ## Converts ``s`` into lower-case runes.
  1190. runnableExamples:
  1191. doAssert toLower("ABΓ") == "abγ"
  1192. toLower(toOa(s))
  1193. proc swapCase*(s: string): string {.noSideEffect, inline.} =
  1194. ## Swaps the case of runes in ``s``.
  1195. ##
  1196. ## Returns a new string such that the cases of all runes
  1197. ## are swapped if possible.
  1198. runnableExamples:
  1199. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  1200. swapCase(toOa(s))
  1201. proc capitalize*(s: string): string {.noSideEffect.} =
  1202. ## Converts the first character of ``s`` into an upper-case rune.
  1203. runnableExamples:
  1204. doAssert capitalize("βeta") == "Βeta"
  1205. capitalize(toOa(s))
  1206. proc translate*(s: string, replacements: proc(key: string): string): string {.effectsOf: replacements, inline.} =
  1207. ## Translates words in a string using the ``replacements`` proc to substitute
  1208. ## words inside ``s`` with their replacements.
  1209. ##
  1210. ## ``replacements`` is any proc that takes a word and returns
  1211. ## a new word to fill it's place.
  1212. runnableExamples:
  1213. proc wordToNumber(s: string): string =
  1214. case s
  1215. of "one": "1"
  1216. of "two": "2"
  1217. else: s
  1218. let a = "one two three four"
  1219. doAssert a.translate(wordToNumber) == "1 2 three four"
  1220. translate(toOa(s), replacements)
  1221. proc title*(s: string): string {.noSideEffect, inline.} =
  1222. ## Converts ``s`` to a unicode title.
  1223. ##
  1224. ## Returns a new string such that the first character
  1225. ## in each word inside ``s`` is capitalized.
  1226. runnableExamples:
  1227. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  1228. title(toOa(s))
  1229. iterator runes*(s: string): Rune =
  1230. ## Iterates over any rune of the string ``s`` returning runes.
  1231. for rune in runes(toOa(s)):
  1232. yield rune
  1233. iterator utf8*(s: string): string =
  1234. ## Iterates over any rune of the string ``s`` returning utf8 values.
  1235. ##
  1236. ## See also:
  1237. ## * `validateUtf8 proc <#validateUtf8,string>`_
  1238. ## * `toUTF8 proc <#toUTF8,Rune>`_
  1239. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  1240. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  1241. for str in utf8(toOa(s)):
  1242. yield str
  1243. proc toRunes*(s: string): seq[Rune] {.inline.} =
  1244. ## Obtains a sequence containing the Runes in ``s``.
  1245. ##
  1246. ## See also:
  1247. ## * `$ proc <#$,Rune>`_ for a reverse operation
  1248. runnableExamples:
  1249. let a = toRunes("aáä")
  1250. doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]
  1251. toRunes(toOa(s))
  1252. proc cmpRunesIgnoreCase*(a, b: string): int {.inline.} =
  1253. ## Compares two UTF-8 strings and ignores the case. Returns:
  1254. ##
  1255. ## | `0` if a == b
  1256. ## | `< 0` if a < b
  1257. ## | `> 0` if a > b
  1258. cmpRunesIgnoreCase(a.toOa(), b.toOa())
  1259. proc reversed*(s: string): string {.inline.} =
  1260. ## Returns the reverse of ``s``, interpreting it as runes.
  1261. ##
  1262. ## Unicode combining characters are correctly interpreted as well.
  1263. runnableExamples:
  1264. assert reversed("Reverse this!") == "!siht esreveR"
  1265. assert reversed("先秦兩漢") == "漢兩秦先"
  1266. assert reversed("as⃝df̅") == "f̅ds⃝a"
  1267. assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  1268. reversed(toOa(s))
  1269. proc graphemeLen*(s: string; i: Natural): Natural {.inline.} =
  1270. ## The number of bytes belonging to byte index ``s[i]``,
  1271. ## including following combining code unit.
  1272. runnableExamples:
  1273. let a = "añyóng"
  1274. doAssert a.graphemeLen(1) == 2 ## ñ
  1275. doAssert a.graphemeLen(2) == 1
  1276. doAssert a.graphemeLen(4) == 2 ## ó
  1277. graphemeLen(toOa(s), i)
  1278. proc lastRune*(s: string; last: int): (Rune, int) {.inline.} =
  1279. ## Length of the last rune in ``s[0..last]``. Returns the rune and its length
  1280. ## in bytes.
  1281. lastRune(toOa(s), last)
  1282. iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces,
  1283. maxsplit: int = -1): string =
  1284. ## Splits the unicode string ``s`` into substrings using a group of separators.
  1285. ##
  1286. ## Substrings are separated by a substring containing only ``seps``.
  1287. runnableExamples:
  1288. import std/sequtils
  1289. assert toSeq("hÃllo\lthis\lis an\texample\l是".split) ==
  1290. @["hÃllo", "this", "is", "an", "example", "是"]
  1291. # And the following code splits the same string using a sequence of Runes.
  1292. assert toSeq(split("añyóng:hÃllo;是$example", ";:$".toRunes)) ==
  1293. @["añyóng", "hÃllo", "是", "example"]
  1294. # example with a `Rune` separator and unused one `;`:
  1295. assert toSeq(split("ab是de:f:", ";:是".toRunes)) == @["ab", "de", "f", ""]
  1296. # Another example that splits a string containing a date.
  1297. let date = "2012-11-20T22:08:08.398990"
  1298. assert toSeq(split(date, " -:T".toRunes)) ==
  1299. @["2012", "11", "20", "22", "08", "08.398990"]
  1300. splitCommon(toOa(s), seps, maxsplit)
  1301. iterator splitWhitespace*(s: string): string =
  1302. ## Splits a unicode string at whitespace runes.
  1303. splitCommon(s.toOa(), unicodeSpaces, -1)
  1304. proc splitWhitespace*(s: string): seq[string] {.noSideEffect, inline.}=
  1305. ## The same as the `splitWhitespace <#splitWhitespace.i,string>`_
  1306. ## iterator, but is a proc that returns a sequence of substrings.
  1307. accResult(splitWhitespace(toOa(s)))
  1308. iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
  1309. ## Splits the unicode string ``s`` into substrings using a single separator.
  1310. ## Substrings are separated by the rune ``sep``.
  1311. runnableExamples:
  1312. import std/sequtils
  1313. assert toSeq(split(";;hÃllo;this;is;an;;example;;;是", ";".runeAt(0))) ==
  1314. @["", "", "hÃllo", "this", "is", "an", "", "example", "", "", "是"]
  1315. splitCommon(toOa(s), sep, maxsplit)
  1316. proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
  1317. seq[string] {.noSideEffect, inline.} =
  1318. ## The same as the `split iterator <#split.i,string,openArray[Rune],int>`_,
  1319. ## but is a proc that returns a sequence of substrings.
  1320. accResult(split(toOa(s), seps, maxsplit))
  1321. proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect, inline.} =
  1322. ## The same as the `split iterator <#split.i,string,Rune,int>`_, but is a proc
  1323. ## that returns a sequence of substrings.
  1324. accResult(split(toOa(s), sep, maxsplit))
  1325. proc strip*(s: string, leading = true, trailing = true,
  1326. runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect, inline.} =
  1327. ## Strips leading or trailing ``runes`` from ``s`` and returns
  1328. ## the resulting string.
  1329. ##
  1330. ## If ``leading`` is true (default), leading ``runes`` are stripped.
  1331. ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
  1332. ## If both are false, the string is returned unchanged.
  1333. runnableExamples:
  1334. let a = "\táñyóng "
  1335. doAssert a.strip == "áñyóng"
  1336. doAssert a.strip(leading = false) == "\táñyóng"
  1337. doAssert a.strip(trailing = false) == "áñyóng "
  1338. strip(toOa(s), leading, trailing, runes)
  1339. proc align*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} =
  1340. ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
  1341. ## of ``count``.
  1342. ##
  1343. ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
  1344. ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  1345. ## returned unchanged. If you need to left align a string use the `alignLeft
  1346. ## proc <#alignLeft,string,Natural>`_.
  1347. runnableExamples:
  1348. assert align("abc", 4) == " abc"
  1349. assert align("a", 0) == "a"
  1350. assert align("1232", 6) == " 1232"
  1351. assert align("1232", 6, '#'.Rune) == "##1232"
  1352. assert align("Åge", 5) == " Åge"
  1353. assert align("×", 4, '_'.Rune) == "___×"
  1354. align(toOa(s), count, padding)
  1355. proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.noSideEffect, inline.} =
  1356. ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a
  1357. ## rune-length of ``count``.
  1358. ##
  1359. ## ``padding`` characters (by default spaces) are added after ``s`` resulting in
  1360. ## left alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
  1361. ## returned unchanged. If you need to right align a string use the `align
  1362. ## proc <#align,string,Natural>`_.
  1363. runnableExamples:
  1364. assert alignLeft("abc", 4) == "abc "
  1365. assert alignLeft("a", 0) == "a"
  1366. assert alignLeft("1232", 6) == "1232 "
  1367. assert alignLeft("1232", 6, '#'.Rune) == "1232##"
  1368. assert alignLeft("Åge", 5) == "Åge "
  1369. assert alignLeft("×", 4, '_'.Rune) == "×___"
  1370. alignLeft(toOa(s), count, padding)