unicode.nim 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module provides support to handle the Unicode UTF-8 encoding.
  10. ##
  11. ## There are no specialized ``insert``, ``delete``, ``add`` and ``contains``
  12. ## procedures for ``seq[Rune]`` in this module because the generic variants
  13. ## of these procedures in the system module already work with it.
  14. ##
  15. ## The current version is compatible with Unicode v12.0.0.
  16. ##
  17. ## **See also:**
  18. ## * `strutils module <strutils.html>`_
  19. ## * `unidecode module <unidecode.html>`_
  20. ## * `encodings module <encodings.html>`_
  21. include "system/inclrtl"
  22. type
  23. RuneImpl = int32 # underlying type of Rune
  24. Rune* = distinct RuneImpl ## \
  25. ## Type that can hold a single Unicode code point.
  26. ##
  27. ## A Rune may be composed with other Runes to a character on the screen.
  28. ## `RuneImpl` is the underlying type used to store Runes, currently `int32`.
  29. template ones(n: untyped): untyped = ((1 shl n)-1)
  30. proc runeLen*(s: string): int {.rtl, extern: "nuc$1".} =
  31. ## Returns the number of runes of the string ``s``.
  32. runnableExamples:
  33. let a = "añyóng"
  34. doAssert a.runeLen == 6
  35. ## note: a.len == 8
  36. result = 0
  37. var i = 0
  38. while i < len(s):
  39. if uint(s[i]) <= 127: inc(i)
  40. elif uint(s[i]) shr 5 == 0b110: inc(i, 2)
  41. elif uint(s[i]) shr 4 == 0b1110: inc(i, 3)
  42. elif uint(s[i]) shr 3 == 0b11110: inc(i, 4)
  43. elif uint(s[i]) shr 2 == 0b111110: inc(i, 5)
  44. elif uint(s[i]) shr 1 == 0b1111110: inc(i, 6)
  45. else: inc i
  46. inc(result)
  47. proc runeLenAt*(s: string, i: Natural): int =
  48. ## Returns the number of bytes the rune starting at ``s[i]`` takes.
  49. ##
  50. ## See also:
  51. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  52. runnableExamples:
  53. let a = "añyóng"
  54. doAssert a.runeLenAt(0) == 1
  55. doAssert a.runeLenAt(1) == 2
  56. if uint(s[i]) <= 127: result = 1
  57. elif uint(s[i]) shr 5 == 0b110: result = 2
  58. elif uint(s[i]) shr 4 == 0b1110: result = 3
  59. elif uint(s[i]) shr 3 == 0b11110: result = 4
  60. elif uint(s[i]) shr 2 == 0b111110: result = 5
  61. elif uint(s[i]) shr 1 == 0b1111110: result = 6
  62. else: result = 1
  63. const replRune = Rune(0xFFFD)
  64. template fastRuneAt*(s: string, i: int, result: untyped, doInc = true) =
  65. ## Returns the rune ``s[i]`` in ``result``.
  66. ##
  67. ## If ``doInc == true`` (default), ``i`` is incremented by the number
  68. ## of bytes that have been processed.
  69. bind ones
  70. if uint(s[i]) <= 127:
  71. result = Rune(uint(s[i]))
  72. when doInc: inc(i)
  73. elif uint(s[i]) shr 5 == 0b110:
  74. # assert(uint(s[i+1]) shr 6 == 0b10)
  75. if i <= s.len - 2:
  76. result = Rune((uint(s[i]) and (ones(5))) shl 6 or
  77. (uint(s[i+1]) and ones(6)))
  78. when doInc: inc(i, 2)
  79. else:
  80. result = replRune
  81. when doInc: inc(i)
  82. elif uint(s[i]) shr 4 == 0b1110:
  83. # assert(uint(s[i+1]) shr 6 == 0b10)
  84. # assert(uint(s[i+2]) shr 6 == 0b10)
  85. if i <= s.len - 3:
  86. result = Rune((uint(s[i]) and ones(4)) shl 12 or
  87. (uint(s[i+1]) and ones(6)) shl 6 or
  88. (uint(s[i+2]) and ones(6)))
  89. when doInc: inc(i, 3)
  90. else:
  91. result = replRune
  92. when doInc: inc(i)
  93. elif uint(s[i]) shr 3 == 0b11110:
  94. # assert(uint(s[i+1]) shr 6 == 0b10)
  95. # assert(uint(s[i+2]) shr 6 == 0b10)
  96. # assert(uint(s[i+3]) shr 6 == 0b10)
  97. if i <= s.len - 4:
  98. result = Rune((uint(s[i]) and ones(3)) shl 18 or
  99. (uint(s[i+1]) and ones(6)) shl 12 or
  100. (uint(s[i+2]) and ones(6)) shl 6 or
  101. (uint(s[i+3]) and ones(6)))
  102. when doInc: inc(i, 4)
  103. else:
  104. result = replRune
  105. when doInc: inc(i)
  106. elif uint(s[i]) shr 2 == 0b111110:
  107. # assert(uint(s[i+1]) shr 6 == 0b10)
  108. # assert(uint(s[i+2]) shr 6 == 0b10)
  109. # assert(uint(s[i+3]) shr 6 == 0b10)
  110. # assert(uint(s[i+4]) shr 6 == 0b10)
  111. if i <= s.len - 5:
  112. result = Rune((uint(s[i]) and ones(2)) shl 24 or
  113. (uint(s[i+1]) and ones(6)) shl 18 or
  114. (uint(s[i+2]) and ones(6)) shl 12 or
  115. (uint(s[i+3]) and ones(6)) shl 6 or
  116. (uint(s[i+4]) and ones(6)))
  117. when doInc: inc(i, 5)
  118. else:
  119. result = replRune
  120. when doInc: inc(i)
  121. elif uint(s[i]) shr 1 == 0b1111110:
  122. # assert(uint(s[i+1]) shr 6 == 0b10)
  123. # assert(uint(s[i+2]) shr 6 == 0b10)
  124. # assert(uint(s[i+3]) shr 6 == 0b10)
  125. # assert(uint(s[i+4]) shr 6 == 0b10)
  126. # assert(uint(s[i+5]) shr 6 == 0b10)
  127. if i <= s.len - 6:
  128. result = Rune((uint(s[i]) and ones(1)) shl 30 or
  129. (uint(s[i+1]) and ones(6)) shl 24 or
  130. (uint(s[i+2]) and ones(6)) shl 18 or
  131. (uint(s[i+3]) and ones(6)) shl 12 or
  132. (uint(s[i+4]) and ones(6)) shl 6 or
  133. (uint(s[i+5]) and ones(6)))
  134. when doInc: inc(i, 6)
  135. else:
  136. result = replRune
  137. when doInc: inc(i)
  138. else:
  139. result = Rune(uint(s[i]))
  140. when doInc: inc(i)
  141. proc runeAt*(s: string, i: Natural): Rune =
  142. ## Returns the rune in ``s`` at **byte index** ``i``.
  143. ##
  144. ## See also:
  145. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  146. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  147. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  148. runnableExamples:
  149. let a = "añyóng"
  150. doAssert a.runeAt(1) == "ñ".runeAt(0)
  151. doAssert a.runeAt(2) == "ñ".runeAt(1)
  152. doAssert a.runeAt(3) == "y".runeAt(0)
  153. fastRuneAt(s, i, result, false)
  154. proc validateUtf8*(s: string): int =
  155. ## Returns the position of the invalid byte in ``s`` if the string ``s`` does
  156. ## not hold valid UTF-8 data. Otherwise ``-1`` is returned.
  157. ##
  158. ## See also:
  159. ## * `toUTF8 proc <#toUTF8,Rune>`_
  160. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  161. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  162. var i = 0
  163. let L = s.len
  164. while i < L:
  165. if uint(s[i]) <= 127:
  166. inc(i)
  167. elif uint(s[i]) shr 5 == 0b110:
  168. if uint(s[i]) < 0xc2: return i # Catch overlong ascii representations.
  169. if i+1 < L and uint(s[i+1]) shr 6 == 0b10: inc(i, 2)
  170. else: return i
  171. elif uint(s[i]) shr 4 == 0b1110:
  172. if i+2 < L and uint(s[i+1]) shr 6 == 0b10 and uint(s[i+2]) shr 6 == 0b10:
  173. inc i, 3
  174. else: return i
  175. elif uint(s[i]) shr 3 == 0b11110:
  176. if i+3 < L and uint(s[i+1]) shr 6 == 0b10 and
  177. uint(s[i+2]) shr 6 == 0b10 and
  178. uint(s[i+3]) shr 6 == 0b10:
  179. inc i, 4
  180. else: return i
  181. else:
  182. return i
  183. return -1
  184. template fastToUTF8Copy*(c: Rune, s: var string, pos: int, doInc = true) =
  185. ## Copies UTF-8 representation of ``c`` into the preallocated string ``s``
  186. ## starting at position ``pos``.
  187. ##
  188. ## If ``doInc == true`` (default), ``pos`` is incremented
  189. ## by the number of bytes that have been processed.
  190. ##
  191. ## To be the most efficient, make sure ``s`` is preallocated
  192. ## with an additional amount equal to the byte length of ``c``.
  193. ##
  194. ## See also:
  195. ## * `validateUtf8 proc <#validateUtf8,string>`_
  196. ## * `toUTF8 proc <#toUTF8,Rune>`_
  197. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  198. var i = RuneImpl(c)
  199. if i <=% 127:
  200. s.setLen(pos+1)
  201. s[pos+0] = chr(i)
  202. when doInc: inc(pos)
  203. elif i <=% 0x07FF:
  204. s.setLen(pos+2)
  205. s[pos+0] = chr((i shr 6) or 0b110_00000)
  206. s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  207. when doInc: inc(pos, 2)
  208. elif i <=% 0xFFFF:
  209. s.setLen(pos+3)
  210. s[pos+0] = chr(i shr 12 or 0b1110_0000)
  211. s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  212. s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  213. when doInc: inc(pos, 3)
  214. elif i <=% 0x001FFFFF:
  215. s.setLen(pos+4)
  216. s[pos+0] = chr(i shr 18 or 0b1111_0000)
  217. s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  218. s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  219. s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  220. when doInc: inc(pos, 4)
  221. elif i <=% 0x03FFFFFF:
  222. s.setLen(pos+5)
  223. s[pos+0] = chr(i shr 24 or 0b111110_00)
  224. s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  225. s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  226. s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  227. s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  228. when doInc: inc(pos, 5)
  229. elif i <=% 0x7FFFFFFF:
  230. s.setLen(pos+6)
  231. s[pos+0] = chr(i shr 30 or 0b1111110_0)
  232. s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
  233. s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
  234. s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
  235. s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
  236. s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
  237. when doInc: inc(pos, 6)
  238. else:
  239. discard # error, exception?
  240. proc toUTF8*(c: Rune): string {.rtl, extern: "nuc$1".} =
  241. ## Converts a rune into its UTF-8 representation.
  242. ##
  243. ## See also:
  244. ## * `validateUtf8 proc <#validateUtf8,string>`_
  245. ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
  246. ## * `utf8 iterator <#utf8.i,string>`_
  247. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  248. runnableExamples:
  249. let a = "añyóng"
  250. doAssert a.runeAt(1).toUTF8 == "ñ"
  251. result = ""
  252. fastToUTF8Copy(c, result, 0, false)
  253. proc add*(s: var string; c: Rune) =
  254. ## Adds a rune ``c`` to a string ``s``.
  255. runnableExamples:
  256. var s = "abc"
  257. let c = "ä".runeAt(0)
  258. s.add(c)
  259. doAssert s == "abcä"
  260. let pos = s.len
  261. fastToUTF8Copy(c, s, pos, false)
  262. proc `$`*(rune: Rune): string =
  263. ## An alias for `toUTF8 <#toUTF8,Rune>`_.
  264. ##
  265. ## See also:
  266. ## * `validateUtf8 proc <#validateUtf8,string>`_
  267. ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
  268. rune.toUTF8
  269. proc `$`*(runes: seq[Rune]): string =
  270. ## Converts a sequence of Runes to a string.
  271. ##
  272. ## See also:
  273. ## * `toRunes <#toRunes,string>`_ for a reverse operation
  274. runnableExamples:
  275. let
  276. someString = "öÑ"
  277. someRunes = toRunes(someString)
  278. doAssert $someRunes == someString
  279. result = ""
  280. for rune in runes:
  281. result.add rune
  282. proc runeOffset*(s: string, pos: Natural, start: Natural = 0): int =
  283. ## Returns the byte position of rune
  284. ## at position ``pos`` in ``s`` with an optional start byte position.
  285. ## Returns the special value -1 if it runs out of the string.
  286. ##
  287. ## **Beware:** This can lead to unoptimized code and slow execution!
  288. ## Most problems can be solved more efficiently by using an iterator
  289. ## or conversion to a seq of Rune.
  290. ##
  291. ## See also:
  292. ## * `runeReverseOffset proc <#runeReverseOffset,string,Positive>`_
  293. runnableExamples:
  294. let a = "añyóng"
  295. doAssert a.runeOffset(1) == 1
  296. doAssert a.runeOffset(3) == 4
  297. doAssert a.runeOffset(4) == 6
  298. var
  299. i = 0
  300. o = start
  301. while i < pos:
  302. o += runeLenAt(s, o)
  303. if o >= s.len:
  304. return -1
  305. inc i
  306. return o
  307. proc runeReverseOffset*(s: string, rev: Positive): (int, int) =
  308. ## Returns a tuple with the byte offset of the
  309. ## rune at position ``rev`` in ``s``, counting
  310. ## from the end (starting with 1) and the total
  311. ## number of runes in the string.
  312. ##
  313. ## Returns a negative value for offset if there are to few runes in
  314. ## the string to satisfy the request.
  315. ##
  316. ## **Beware:** This can lead to unoptimized code and slow execution!
  317. ## Most problems can be solved more efficiently by using an iterator
  318. ## or conversion to a seq of Rune.
  319. ##
  320. ## See also:
  321. ## * `runeOffset proc <#runeOffset,string,Natural,Natural>`_
  322. var
  323. a = rev.int
  324. o = 0
  325. x = 0
  326. while o < s.len:
  327. let r = runeLenAt(s, o)
  328. o += r
  329. if a < 0:
  330. x += r
  331. dec a
  332. if a > 0:
  333. return (-a, rev.int-a)
  334. return (x, -a+rev.int)
  335. proc runeAtPos*(s: string, pos: int): Rune =
  336. ## Returns the rune at position ``pos``.
  337. ##
  338. ## **Beware:** This can lead to unoptimized code and slow execution!
  339. ## Most problems can be solved more efficiently by using an iterator
  340. ## or conversion to a seq of Rune.
  341. ##
  342. ## See also:
  343. ## * `runeAt proc <#runeAt,string,Natural>`_
  344. ## * `runeStrAtPos proc <#runeStrAtPos,string,Natural>`_
  345. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  346. fastRuneAt(s, runeOffset(s, pos), result, false)
  347. proc runeStrAtPos*(s: string, pos: Natural): string =
  348. ## Returns the rune at position ``pos`` as UTF8 String.
  349. ##
  350. ## **Beware:** This can lead to unoptimized code and slow execution!
  351. ## Most problems can be solved more efficiently by using an iterator
  352. ## or conversion to a seq of Rune.
  353. ##
  354. ## See also:
  355. ## * `runeAt proc <#runeAt,string,Natural>`_
  356. ## * `runeAtPos proc <#runeAtPos,string,int>`_
  357. ## * `fastRuneAt template <#fastRuneAt.t,string,int,untyped>`_
  358. let o = runeOffset(s, pos)
  359. s[o .. (o+runeLenAt(s, o)-1)]
  360. proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
  361. ## Returns the UTF-8 substring starting at code point ``pos``
  362. ## with ``len`` code points.
  363. ##
  364. ## If ``pos`` or ``len`` is negative they count from
  365. ## the end of the string. If ``len`` is not given it means the longest
  366. ## possible string.
  367. runnableExamples:
  368. let s = "Hänsel ««: 10,00€"
  369. doAssert(runeSubStr(s, 0, 2) == "Hä")
  370. doAssert(runeSubStr(s, 10, 1) == ":")
  371. doAssert(runeSubStr(s, -6) == "10,00€")
  372. doAssert(runeSubStr(s, 10) == ": 10,00€")
  373. doAssert(runeSubStr(s, 12, 5) == "10,00")
  374. doAssert(runeSubStr(s, -6, 3) == "10,")
  375. if pos < 0:
  376. let (o, rl) = runeReverseOffset(s, -pos)
  377. if len >= rl:
  378. result = s.substr(o, s.len-1)
  379. elif len < 0:
  380. let e = rl + len
  381. if e < 0:
  382. result = ""
  383. else:
  384. result = s.substr(o, runeOffset(s, e-(rl+pos), o)-1)
  385. else:
  386. result = s.substr(o, runeOffset(s, len, o)-1)
  387. else:
  388. let o = runeOffset(s, pos)
  389. if o < 0:
  390. result = ""
  391. elif len == int.high:
  392. result = s.substr(o, s.len-1)
  393. elif len < 0:
  394. let (e, rl) = runeReverseOffset(s, -len)
  395. discard rl
  396. if e <= 0:
  397. result = ""
  398. else:
  399. result = s.substr(o, e-1)
  400. else:
  401. var e = runeOffset(s, len, o)
  402. if e < 0:
  403. e = s.len
  404. result = s.substr(o, e-1)
  405. proc `<=%`*(a, b: Rune): bool =
  406. ## Checks if code point of `a` is smaller or equal to code point of `b`.
  407. runnableExamples:
  408. let
  409. a = "ú".runeAt(0)
  410. b = "ü".runeAt(0)
  411. doAssert a <=% b
  412. return int(a) <=% int(b)
  413. proc `<%`*(a, b: Rune): bool =
  414. ## Checks if code point of `a` is smaller than code point of `b`.
  415. runnableExamples:
  416. let
  417. a = "ú".runeAt(0)
  418. b = "ü".runeAt(0)
  419. doAssert a <% b
  420. return int(a) <% int(b)
  421. proc `==`*(a, b: Rune): bool =
  422. ## Checks if two runes are equal.
  423. return int(a) == int(b)
  424. include "includes/unicode_ranges"
  425. proc binarySearch(c: RuneImpl, tab: openArray[int], len, stride: int): int =
  426. var n = len
  427. var t = 0
  428. while n > 1:
  429. var m = n div 2
  430. var p = t + m*stride
  431. if c >= tab[p]:
  432. t = p
  433. n = n-m
  434. else:
  435. n = m
  436. if n != 0 and c >= tab[t]:
  437. return t
  438. return -1
  439. proc toLower*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  440. ## Converts ``c`` into lower case. This works for any rune.
  441. ##
  442. ## If possible, prefer ``toLower`` over ``toUpper``.
  443. ##
  444. ## See also:
  445. ## * `toUpper proc <#toUpper,Rune>`_
  446. ## * `toTitle proc <#toTitle,Rune>`_
  447. ## * `isLower proc <#isLower,Rune>`_
  448. var c = RuneImpl(c)
  449. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  450. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  451. return Rune(c + toLowerRanges[p+2] - 500)
  452. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  453. if p >= 0 and c == toLowerSinglets[p]:
  454. return Rune(c + toLowerSinglets[p+1] - 500)
  455. return Rune(c)
  456. proc toUpper*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  457. ## Converts ``c`` into upper case. This works for any rune.
  458. ##
  459. ## If possible, prefer ``toLower`` over ``toUpper``.
  460. ##
  461. ## See also:
  462. ## * `toLower proc <#toLower,Rune>`_
  463. ## * `toTitle proc <#toTitle,Rune>`_
  464. ## * `isUpper proc <#isUpper,Rune>`_
  465. var c = RuneImpl(c)
  466. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  467. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  468. return Rune(c + toUpperRanges[p+2] - 500)
  469. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  470. if p >= 0 and c == toUpperSinglets[p]:
  471. return Rune(c + toUpperSinglets[p+1] - 500)
  472. return Rune(c)
  473. proc toTitle*(c: Rune): Rune {.rtl, extern: "nuc$1".} =
  474. ## Converts ``c`` to title case.
  475. ##
  476. ## See also:
  477. ## * `toLower proc <#toLower,Rune>`_
  478. ## * `toUpper proc <#toUpper,Rune>`_
  479. ## * `isTitle proc <#isTitle,Rune>`_
  480. var c = RuneImpl(c)
  481. var p = binarySearch(c, toTitleSinglets, len(toTitleSinglets) div 2, 2)
  482. if p >= 0 and c == toTitleSinglets[p]:
  483. return Rune(c + toTitleSinglets[p+1] - 500)
  484. return Rune(c)
  485. proc isLower*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  486. ## Returns true if ``c`` is a lower case rune.
  487. ##
  488. ## If possible, prefer ``isLower`` over ``isUpper``.
  489. ##
  490. ## See also:
  491. ## * `toLower proc <#toLower,Rune>`_
  492. ## * `isUpper proc <#isUpper,Rune>`_
  493. ## * `isTitle proc <#isTitle,Rune>`_
  494. var c = RuneImpl(c)
  495. # Note: toUpperRanges is correct here!
  496. var p = binarySearch(c, toUpperRanges, len(toUpperRanges) div 3, 3)
  497. if p >= 0 and c >= toUpperRanges[p] and c <= toUpperRanges[p+1]:
  498. return true
  499. p = binarySearch(c, toUpperSinglets, len(toUpperSinglets) div 2, 2)
  500. if p >= 0 and c == toUpperSinglets[p]:
  501. return true
  502. proc isUpper*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  503. ## Returns true if ``c`` is a upper case rune.
  504. ##
  505. ## If possible, prefer ``isLower`` over ``isUpper``.
  506. ##
  507. ## See also:
  508. ## * `toUpper proc <#toUpper,Rune>`_
  509. ## * `isLower proc <#isLower,Rune>`_
  510. ## * `isTitle proc <#isTitle,Rune>`_
  511. ## * `isAlpha proc <#isAlpha,Rune>`_
  512. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  513. var c = RuneImpl(c)
  514. # Note: toLowerRanges is correct here!
  515. var p = binarySearch(c, toLowerRanges, len(toLowerRanges) div 3, 3)
  516. if p >= 0 and c >= toLowerRanges[p] and c <= toLowerRanges[p+1]:
  517. return true
  518. p = binarySearch(c, toLowerSinglets, len(toLowerSinglets) div 2, 2)
  519. if p >= 0 and c == toLowerSinglets[p]:
  520. return true
  521. proc isAlpha*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  522. ## Returns true if ``c`` is an *alpha* rune (i.e., a letter).
  523. ##
  524. ## See also:
  525. ## * `isLower proc <#isLower,Rune>`_
  526. ## * `isTitle proc <#isTitle,Rune>`_
  527. ## * `isAlpha proc <#isAlpha,Rune>`_
  528. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  529. ## * `isCombining proc <#isCombining,Rune>`_
  530. if isUpper(c) or isLower(c):
  531. return true
  532. var c = RuneImpl(c)
  533. var p = binarySearch(c, alphaRanges, len(alphaRanges) div 2, 2)
  534. if p >= 0 and c >= alphaRanges[p] and c <= alphaRanges[p+1]:
  535. return true
  536. p = binarySearch(c, alphaSinglets, len(alphaSinglets), 1)
  537. if p >= 0 and c == alphaSinglets[p]:
  538. return true
  539. proc isTitle*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  540. ## Returns true if ``c`` is a Unicode titlecase code point.
  541. ##
  542. ## See also:
  543. ## * `toTitle proc <#toTitle,Rune>`_
  544. ## * `isLower proc <#isLower,Rune>`_
  545. ## * `isUpper proc <#isUpper,Rune>`_
  546. ## * `isAlpha proc <#isAlpha,Rune>`_
  547. ## * `isWhiteSpace proc <#isWhiteSpace,Rune>`_
  548. return isUpper(c) and isLower(c)
  549. proc isWhiteSpace*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  550. ## Returns true if ``c`` is a Unicode whitespace code point.
  551. ##
  552. ## See also:
  553. ## * `isLower proc <#isLower,Rune>`_
  554. ## * `isUpper proc <#isUpper,Rune>`_
  555. ## * `isTitle proc <#isTitle,Rune>`_
  556. ## * `isAlpha proc <#isAlpha,Rune>`_
  557. var c = RuneImpl(c)
  558. var p = binarySearch(c, spaceRanges, len(spaceRanges) div 2, 2)
  559. if p >= 0 and c >= spaceRanges[p] and c <= spaceRanges[p+1]:
  560. return true
  561. proc isCombining*(c: Rune): bool {.rtl, extern: "nuc$1".} =
  562. ## Returns true if ``c`` is a Unicode combining code unit.
  563. ##
  564. ## See also:
  565. ## * `isLower proc <#isLower,Rune>`_
  566. ## * `isUpper proc <#isUpper,Rune>`_
  567. ## * `isTitle proc <#isTitle,Rune>`_
  568. ## * `isAlpha proc <#isAlpha,Rune>`_
  569. var c = RuneImpl(c)
  570. # Optimized to return false immediately for ASCII
  571. return c >= 0x0300 and (c <= 0x036f or
  572. (c >= 0x1ab0 and c <= 0x1aff) or
  573. (c >= 0x1dc0 and c <= 0x1dff) or
  574. (c >= 0x20d0 and c <= 0x20ff) or
  575. (c >= 0xfe20 and c <= 0xfe2f))
  576. template runeCheck(s, runeProc) =
  577. ## Common code for isAlpha and isSpace.
  578. result = if len(s) == 0: false else: true
  579. var
  580. i = 0
  581. rune: Rune
  582. while i < len(s) and result:
  583. fastRuneAt(s, i, rune, doInc = true)
  584. result = runeProc(rune) and result
  585. proc isAlpha*(s: string): bool {.noSideEffect,
  586. rtl, extern: "nuc$1Str".} =
  587. ## Returns true if ``s`` contains all alphabetic runes.
  588. runnableExamples:
  589. let a = "añyóng"
  590. doAssert a.isAlpha
  591. runeCheck(s, isAlpha)
  592. proc isSpace*(s: string): bool {.noSideEffect,
  593. rtl, extern: "nuc$1Str".} =
  594. ## Returns true if ``s`` contains all whitespace runes.
  595. runnableExamples:
  596. let a = "\t\l \v\r\f"
  597. doAssert a.isSpace
  598. runeCheck(s, isWhiteSpace)
  599. template convertRune(s, runeProc) =
  600. ## Convert runes in ``s`` using ``runeProc`` as the converter.
  601. result = newString(len(s))
  602. var
  603. i = 0
  604. resultIndex = 0
  605. rune: Rune
  606. while i < len(s):
  607. fastRuneAt(s, i, rune, doInc = true)
  608. rune = runeProc(rune)
  609. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  610. proc toUpper*(s: string): string {.noSideEffect,
  611. rtl, extern: "nuc$1Str".} =
  612. ## Converts ``s`` into upper-case runes.
  613. runnableExamples:
  614. doAssert toUpper("abγ") == "ABΓ"
  615. convertRune(s, toUpper)
  616. proc toLower*(s: string): string {.noSideEffect,
  617. rtl, extern: "nuc$1Str".} =
  618. ## Converts ``s`` into lower-case runes.
  619. runnableExamples:
  620. doAssert toLower("ABΓ") == "abγ"
  621. convertRune(s, toLower)
  622. proc swapCase*(s: string): string {.noSideEffect,
  623. rtl, extern: "nuc$1".} =
  624. ## Swaps the case of runes in ``s``.
  625. ##
  626. ## Returns a new string such that the cases of all runes
  627. ## are swapped if possible.
  628. runnableExamples:
  629. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  630. var
  631. i = 0
  632. resultIndex = 0
  633. rune: Rune
  634. result = newString(len(s))
  635. while i < len(s):
  636. fastRuneAt(s, i, rune)
  637. if rune.isUpper():
  638. rune = rune.toLower()
  639. elif rune.isLower():
  640. rune = rune.toUpper()
  641. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  642. proc capitalize*(s: string): string {.noSideEffect,
  643. rtl, extern: "nuc$1".} =
  644. ## Converts the first character of ``s`` into an upper-case rune.
  645. runnableExamples:
  646. doAssert capitalize("βeta") == "Βeta"
  647. if len(s) == 0:
  648. return ""
  649. var
  650. rune: Rune
  651. i = 0
  652. fastRuneAt(s, i, rune, doInc = true)
  653. result = $toUpper(rune) & substr(s, i)
  654. proc translate*(s: string, replacements: proc(key: string): string): string {.
  655. rtl, extern: "nuc$1".} =
  656. ## Translates words in a string using the ``replacements`` proc to substitute
  657. ## words inside ``s`` with their replacements.
  658. ##
  659. ## ``replacements`` is any proc that takes a word and returns
  660. ## a new word to fill it's place.
  661. runnableExamples:
  662. proc wordToNumber(s: string): string =
  663. case s
  664. of "one": "1"
  665. of "two": "2"
  666. else: s
  667. let a = "one two three four"
  668. doAssert a.translate(wordToNumber) == "1 2 three four"
  669. # Allocate memory for the new string based on the old one.
  670. # If the new string length is less than the old, no allocations
  671. # will be needed. If the new string length is greater than the
  672. # old, then maybe only one allocation is needed
  673. result = newStringOfCap(s.len)
  674. var
  675. index = 0
  676. lastIndex = 0
  677. wordStart = 0
  678. inWord = false
  679. rune: Rune
  680. while index < len(s):
  681. lastIndex = index
  682. fastRuneAt(s, index, rune)
  683. let whiteSpace = rune.isWhiteSpace()
  684. if whiteSpace and inWord:
  685. # If we've reached the end of a word
  686. let word = s[wordStart ..< lastIndex]
  687. result.add(replacements(word))
  688. result.add($rune)
  689. inWord = false
  690. elif not whiteSpace and not inWord:
  691. # If we've hit a non space character and
  692. # are not currently in a word, track
  693. # the starting index of the word
  694. inWord = true
  695. wordStart = lastIndex
  696. elif whiteSpace:
  697. result.add($rune)
  698. if wordStart < len(s) and inWord:
  699. # Get the trailing word at the end
  700. let word = s[wordStart .. ^1]
  701. result.add(replacements(word))
  702. proc title*(s: string): string {.noSideEffect,
  703. rtl, extern: "nuc$1".} =
  704. ## Converts ``s`` to a unicode title.
  705. ##
  706. ## Returns a new string such that the first character
  707. ## in each word inside ``s`` is capitalized.
  708. runnableExamples:
  709. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  710. var
  711. i = 0
  712. resultIndex = 0
  713. rune: Rune
  714. result = newString(len(s))
  715. var firstRune = true
  716. while i < len(s):
  717. fastRuneAt(s, i, rune)
  718. if not rune.isWhiteSpace() and firstRune:
  719. rune = rune.toUpper()
  720. firstRune = false
  721. elif rune.isWhiteSpace():
  722. firstRune = true
  723. fastToUTF8Copy(rune, result, resultIndex, doInc = true)
  iterator runes*(s: string): Rune =
    ## Yields every rune of the string ``s``, from first to last.
    var
      pos = 0
      current: Rune
    while pos < len(s):
      fastRuneAt(s, pos, current, true)
      yield current
  iterator utf8*(s: string): string =
    ## Yields each rune of the string ``s`` as its own UTF-8 encoded substring.
    ##
    ## See also:
    ## * `validateUtf8 proc <#validateUtf8,string>`_
    ## * `toUTF8 proc <#toUTF8,Rune>`_
    ## * `$ proc <#$,Rune>`_ alias for `toUTF8`
    ## * `fastToUTF8Copy template <#fastToUTF8Copy.t,Rune,string,int>`_
    var start = 0
    while start < s.len:
      let stop = start + runeLenAt(s, start)
      yield s[start ..< stop]
      start = stop
  proc toRunes*(s: string): seq[Rune] =
    ## Decodes ``s`` into a sequence holding each of its Runes in order.
    ##
    ## See also:
    ## * `$ proc <#$,seq[T][Rune]>`_ for a reverse operation
    runnableExamples:
      let a = toRunes("aáä")
      doAssert a == @["a".runeAt(0), "á".runeAt(0), "ä".runeAt(0)]

    result = newSeq[Rune]()
    var
      pos = 0
      current: Rune
    while pos < len(s):
      fastRuneAt(s, pos, current, true)
      result.add(current)
  proc cmpRunesIgnoreCase*(a, b: string): int {.rtl, extern: "nuc$1".} =
    ## Compares two UTF-8 strings rune by rune and ignores the case. Returns:
    ##
    ## | 0 if a == b
    ## | < 0 if a < b
    ## | > 0 if a > b
    ##
    ## If one string runs out while all compared runes were equal, the
    ## string with runes left over is the greater one.
    var
      i = 0
      j = 0
      ar, br: Rune
    while i < a.len and j < b.len:
      # slow path: decode one rune from each side and compare lower-cased
      fastRuneAt(a, i, ar)
      fastRuneAt(b, j, br)
      result = int(toLower(ar)) - int(toLower(br))
      if result != 0: return
    # Compare what is left over, not the total byte lengths: the upper and
    # lower case forms of a rune can be encoded with a different number of
    # bytes, so strings that are equal ignoring case may differ in ``len``.
    result = (a.len - i) - (b.len - j)
  proc reversed*(s: string): string =
    ## Returns the reverse of ``s``, interpreting it as runes.
    ##
    ## Unicode combining characters are correctly interpreted as well: a rune
    ## and the combining runes that follow it are moved as one unit. Combining
    ## runes at the very start of ``s`` form a unit of their own.
    runnableExamples:
      assert reversed("Reverse this!") == "!siht esreveR"
      assert reversed("先秦兩漢") == "漢兩秦先"
      assert reversed("as⃝df̅") == "f̅ds⃝a"
      assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
    var
      i = 0
      lastI = 0
      newPos = len(s) - 1 # next byte of ``result`` to fill, moving backwards
      # Index of the last byte already copied. It is an exclusive lower bound,
      # so it has to start at -1; starting at 0 dropped the first byte of
      # ``s`` whenever ``s`` began with a combining rune.
      blockPos = -1
      r: Rune

    template reverseUntil(pos) =
      # Copies the block ``s[blockPos+1 .. pos-1]`` in front of what has
      # already been written to ``result``, keeping the block's byte order.
      var j = pos - 1
      while j > blockPos:
        result[newPos] = s[j]
        dec j
        dec newPos
      blockPos = pos - 1

    result = newString(len(s))
    while i < len(s):
      lastI = i
      fastRuneAt(s, i, r, true)
      # A non-combining rune starts a new block, so flush the previous one.
      if not isCombining(r):
        reverseUntil(lastI)
    reverseUntil(len(s))
  proc graphemeLen*(s: string; i: Natural): Natural =
    ## Returns the number of bytes taken by the rune that starts at byte
    ## index ``i`` of ``s`` plus any combining runes directly following it.
    ##
    ## Returns 0 when ``i >= s.len``.
    runnableExamples:
      let a = "añyóng"
      doAssert a.graphemeLen(1) == 2 ## ñ
      doAssert a.graphemeLen(2) == 1
      doAssert a.graphemeLen(4) == 2 ## ó

    var pos = i.int
    if pos >= s.len:
      return
    var current: Rune
    # The base rune always belongs to the grapheme.
    fastRuneAt(s, pos, current, true)
    result = pos - i
    # Extend over every combining rune that follows.
    while pos < s.len:
      fastRuneAt(s, pos, current, true)
      if not isCombining(current):
        break
      result = pos - i
  proc lastRune*(s: string; last: int): (Rune, int) =
    ## Returns the last rune in ``s[0..last]`` together with its length
    ## in bytes.
    ##
    ## ``last`` is used directly as an index into ``s``.
    if s[last] <= chr(127):
      result = (Rune(s[last]), 1)
    else:
      var L = 0
      # Walk back over continuation bytes (``10xxxxxx``) to the lead byte.
      # Stop at index 0 at the latest, so that malformed input made up of
      # continuation bytes only cannot push the read position below zero.
      while last-L > 0 and uint(s[last-L]) shr 6 == 0b10: inc(L)
      var r: Rune
      fastRuneAt(s, last-L, r, false)
      result = (r, L+1)
  proc size*(r: Rune): int {.noSideEffect.} =
    ## Returns the number of bytes the rune ``r`` takes.
    runnableExamples:
      let a = toRunes "aá"
      doAssert size(a[0]) == 1
      doAssert size(a[1]) == 2

    let code = r.uint32
    result =
      if code <= 0x007F'u32: 1
      elif code <= 0x07FF'u32: 2
      elif code <= 0xFFFF'u32: 3
      elif code <= 0x1FFFFF'u32: 4
      elif code <= 0x3FFFFFF'u32: 5
      elif code <= 0x7FFFFFFF'u32: 6
      else: 1 # values above the 31-bit range are reported as one byte
  # --------- Private helpers for the different split separators -----------
  proc stringHasSep(s: string, index: int, seps: openArray[Rune]): bool =
    ## Reports whether the rune starting at byte ``index`` of ``s`` is one of
    ## the runes in ``seps``.
    var current: Rune
    fastRuneAt(s, index, current, false)
    result = current in seps
  proc stringHasSep(s: string, index: int, sep: Rune): bool =
    ## Reports whether the rune starting at byte ``index`` of ``s`` equals
    ## ``sep``.
    var current: Rune
    fastRuneAt(s, index, current, false)
    result = current == sep
  template splitCommon(s, sep, maxsplit: untyped) =
    ## Common code for split procedures: yields the pieces of ``s`` that lie
    ## between occurrences of ``sep``. Splitting stops once ``maxsplit``
    ## splits were made; a negative ``maxsplit`` never reaches that limit.
    ## Nothing is yielded for an empty ``s``.
    let sLen = len(s)
    var
      last = 0
      splits = maxsplit
    if sLen > 0:
      while last <= sLen:
        let first = last
        # Advance rune by rune up to the next separator or the end of ``s``.
        while last < sLen and not stringHasSep(s, last, sep):
          inc(last, runeLenAt(s, last))
        # No splits left: the final piece takes the whole remainder.
        if splits == 0: last = sLen
        yield s[first ..< last]
        if splits == 0: break
        dec(splits)
        # Step over the separator rune; at the end of ``s`` step past
        # ``sLen`` so that the outer loop terminates.
        inc(last, if last < sLen: runeLenAt(s, last) else: 1)
  iterator split*(s: string, seps: openArray[Rune] = unicodeSpaces,
    maxsplit: int = -1): string =
    ## Splits the unicode string ``s`` into substrings, cutting at every rune
    ## that is contained in ``seps``.
    ##
    ## Splitting stops after ``maxsplit`` cuts; the default of ``-1`` means
    ## there is no limit. Each separator rune produces its own cut, so
    ## adjacent separators yield empty substrings.
    ##
    ## .. code-block:: nim
    ##   for word in split("this\lis an\texample"):
    ##     writeLine(stdout, word)
    ##
    ## ...generates this output:
    ##
    ## .. code-block::
    ##   "this"
    ##   "is"
    ##   "an"
    ##   "example"
    ##
    ## And the following code:
    ##
    ## .. code-block:: nim
    ##   for word in split("this:is;an$example", [';'.Rune, ':'.Rune, '$'.Rune]):
    ##     writeLine(stdout, word)
    ##
    ## ...produces the same output as the first example. The code:
    ##
    ## .. code-block:: nim
    ##   let date = "2012-11-20T22:08:08.398990"
    ##   let separators = [' '.Rune, '-'.Rune, ':'.Rune, 'T'.Rune]
    ##   for number in split(date, separators):
    ##     writeLine(stdout, number)
    ##
    ## ...results in:
    ##
    ## .. code-block::
    ##   "2012"
    ##   "11"
    ##   "20"
    ##   "22"
    ##   "08"
    ##   "08.398990"
    ##
    splitCommon(s, seps, maxsplit)
  iterator splitWhitespace*(s: string): string =
    ## Splits the unicode string ``s`` at every whitespace rune, without a
    ## limit on the number of splits.
    splitCommon(s, unicodeSpaces, -1)
  template accResult(iter: untyped) =
    ## Resets ``result`` to an empty sequence and appends every item that
    ## ``iter`` produces.
    result = @[]
    for item in iter:
      result.add(item)
  proc splitWhitespace*(s: string): seq[string] {.noSideEffect,
    rtl, extern: "ncuSplitWhitespace".} =
    ## Does the same as the `splitWhitespace <#splitWhitespace.i,string>`_
    ## iterator, but collects the substrings into a sequence.
    result = @[]
    for piece in splitWhitespace(s):
      result.add(piece)
  iterator split*(s: string, sep: Rune, maxsplit: int = -1): string =
    ## Splits the unicode string ``s`` into substrings at every occurrence of
    ## the single rune ``sep``.
    ##
    ## Splitting stops after ``maxsplit`` cuts; the default of ``-1`` means
    ## there is no limit.
    ## The code:
    ##
    ## .. code-block:: nim
    ##   for word in split(";;this;is;an;;example;;;", ';'.Rune):
    ##     writeLine(stdout, word)
    ##
    ## Results in:
    ##
    ## .. code-block::
    ##   ""
    ##   ""
    ##   "this"
    ##   "is"
    ##   "an"
    ##   ""
    ##   "example"
    ##   ""
    ##   ""
    ##   ""
    ##
    splitCommon(s, sep, maxsplit)
  proc split*(s: string, seps: openArray[Rune] = unicodeSpaces, maxsplit: int = -1):
      seq[string] {.noSideEffect, rtl, extern: "nucSplitRunes".} =
    ## Does the same as the `split iterator <#split.i,string,openArray[Rune],int>`_,
    ## but collects the substrings into a sequence.
    result = @[]
    for piece in split(s, seps, maxsplit):
      result.add(piece)
  proc split*(s: string, sep: Rune, maxsplit: int = -1): seq[string] {.noSideEffect,
    rtl, extern: "nucSplitRune".} =
    ## Does the same as the `split iterator <#split.i,string,Rune,int>`_, but
    ## collects the substrings into a sequence.
    result = @[]
    for piece in split(s, sep, maxsplit):
      result.add(piece)
  proc strip*(s: string, leading = true, trailing = true,
    runes: openArray[Rune] = unicodeSpaces): string {.noSideEffect,
    rtl, extern: "nucStrip".} =
    ## Strips leading or trailing ``runes`` from ``s`` and returns
    ## the resulting string.
    ##
    ## If ``leading`` is true (default), leading ``runes`` are stripped.
    ## If ``trailing`` is true (default), trailing ``runes`` are stripped.
    ## If both are false, the string is returned unchanged.
    ##
    ## A string that consists only of ``runes`` is stripped to ``""`` as soon
    ## as either ``leading`` or ``trailing`` is true.
    runnableExamples:
      let a = "\táñyóng "
      doAssert a.strip == "áñyóng"
      doAssert a.strip(leading = false) == "\táñyóng"
      doAssert a.strip(trailing = false) == "áñyóng "

    var
      first = 0     # byte index where the kept part starts
      stop = len(s) # byte index one past the end of the kept part
      rune: Rune

    if leading:
      var i = 0
      while i < len(s):
        fastRuneAt(s, i, rune)
        if not runes.contains(rune): break
        # The rune just read is stripped; the kept part starts after it.
        first = i

    if trailing:
      # Never scan below ``first``: everything before it is already dropped.
      while stop > first:
        # Step back over continuation bytes (``10xxxxxx``) to reach the lead
        # byte of the rune that ends right before ``stop``. This has to work
        # for runes of any encoded length, not only for 1 and 2 byte runes.
        var start = stop - 1
        while start > first and uint(s[start]) shr 6 == 0b10: dec(start)
        var pos = start
        fastRuneAt(s, pos, rune)
        if pos != stop:
          # Malformed UTF-8: the bytes do not form a single rune that ends
          # at ``stop``, so judge the final byte on its own.
          start = stop - 1
          pos = start
          fastRuneAt(s, pos, rune)
        if not runes.contains(rune): break
        stop = start

    let newLen = stop - first
    result = newStringOfCap(newLen)
    if newLen > 0:
      result.add s[first ..< stop]
  proc repeat*(c: Rune, count: Natural): string {.noSideEffect,
    rtl, extern: "nucRepeatRune".} =
    ## Returns a string made of ``count`` copies of the Rune ``c``.
    ##
    ## The returned string will have a rune-length of ``count``.
    runnableExamples:
      let a = "ñ".runeAt(0)
      doAssert a.repeat(5) == "ñññññ"

    let encoded = $c
    result = newStringOfCap(count * encoded.len)
    var remaining = count.int
    while remaining > 0:
      result.add(encoded)
      dec(remaining)
  proc align*(s: string, count: Natural, padding = ' '.Rune): string {.
    noSideEffect, rtl, extern: "nucAlignString".} =
    ## Aligns a unicode string ``s`` with ``padding``, so that it has a rune-length
    ## of ``count``.
    ##
    ## ``padding`` characters (by default spaces) are added before ``s`` resulting in
    ## right alignment. If ``s.runelen >= count``, no spaces are added and ``s`` is
    ## returned unchanged. If you need to left align a string use the `alignLeft
    ## proc <#alignLeft,string,Natural>`_.
    runnableExamples:
      assert align("abc", 4) == " abc"
      assert align("a", 0) == "a"
      assert align("1232", 6) == "  1232"
      assert align("1232", 6, '#'.Rune) == "##1232"
      assert align("Åge", 5) == "  Åge"
      assert align("×", 4, '_'.Rune) == "___×"

    let sLen = s.runeLen
    if sLen < count:
      let
        padStr = $padding
        spaces = count - sLen
      # Reserve exactly what is written: the padding runes plus the bytes of
      # ``s`` (which may be more than its rune count).
      result = newStringOfCap(spaces * padStr.len + s.len)
      for i in 0 ..< spaces: result.add padStr
      result.add s
    else:
      result = s
  proc alignLeft*(s: string, count: Natural, padding = ' '.Rune): string {.
    noSideEffect.} =
    ## Left-aligns a unicode string ``s`` with ``padding``, so that it has a
    ## rune-length of ``count``.
    ##
    ## ``padding`` characters (by default spaces) are appended to ``s``, which
    ## results in left alignment. If ``s.runelen >= count``, nothing is added
    ## and ``s`` is returned unchanged. If you need to right align a string use
    ## the `align proc <#align,string,Natural>`_.
    runnableExamples:
      assert alignLeft("abc", 4) == "abc "
      assert alignLeft("a", 0) == "a"
      assert alignLeft("1232", 6) == "1232  "
      assert alignLeft("1232", 6, '#'.Rune) == "1232##"
      assert alignLeft("Åge", 5) == "Åge  "
      assert alignLeft("×", 4, '_'.Rune) == "×___"

    let runeCount = s.runeLen
    if runeCount >= count:
      return s
    let
      padStr = $padding
      missing = count - runeCount
    result = newStringOfCap(s.len + missing * padStr.len)
    result.add s
    for i in 0 ..< missing:
      result.add padStr
  1075. when isMainModule:
  1076. proc asRune(s: static[string]): Rune =
  1077. ## Compile-time conversion proc for converting string literals to a Rune
  1078. ## value. Returns the first Rune of the specified string.
  1079. ##
  1080. ## Shortcuts code like ``"å".runeAt(0)`` to ``"å".asRune`` and returns a
  1081. ## compile-time constant.
  1082. if s.len == 0: Rune(0)
  1083. else: s.runeAt(0)
  1084. let
  1085. someString = "öÑ"
  1086. someRunes = toRunes(someString)
  1087. compared = (someString == $someRunes)
  1088. doAssert compared == true
  1089. proc testReplacements(word: string): string =
  1090. case word
  1091. of "two":
  1092. return "2"
  1093. of "foo":
  1094. return "BAR"
  1095. of "βeta":
  1096. return "beta"
  1097. of "alpha":
  1098. return "αlpha"
  1099. else:
  1100. return "12345"
  1101. doAssert translate("two not alpha foo βeta", testReplacements) == "2 12345 αlpha BAR beta"
  1102. doAssert translate(" two not foo βeta ", testReplacements) == " 2 12345 BAR beta "
  1103. doAssert title("foo bar") == "Foo Bar"
  1104. doAssert title("αlpha βeta γamma") == "Αlpha Βeta Γamma"
  1105. doAssert title("") == ""
  1106. doAssert capitalize("βeta") == "Βeta"
  1107. doAssert capitalize("foo") == "Foo"
  1108. doAssert capitalize("") == ""
  1109. doAssert swapCase("FooBar") == "fOObAR"
  1110. doAssert swapCase(" ") == " "
  1111. doAssert swapCase("Αlpha Βeta Γamma") == "αLPHA βETA γAMMA"
  1112. doAssert swapCase("a✓B") == "A✓b"
  1113. doAssert swapCase("Јамогујестистаклоитоминештети") == "јАМОГУЈЕСТИСТАКЛОИТОМИНЕШТЕТИ"
  1114. doAssert swapCase("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") == "ὝΑΛΟΝΦΑΓΕῖΝΔΎΝΑΜΑΙΤΟῦΤΟΟὔΜΕΒΛΆΠΤΕΙ"
  1115. doAssert swapCase("Կրնամապակիուտեևինծիանհանգիստչըներ") == "կՐՆԱՄԱՊԱԿԻՈՒՏԵևԻՆԾԻԱՆՀԱՆԳԻՍՏՉԸՆԵՐ"
  1116. doAssert swapCase("") == ""
  1117. doAssert isAlpha("r")
  1118. doAssert isAlpha("α")
  1119. doAssert isAlpha("ϙ")
  1120. doAssert isAlpha("ஶ")
  1121. doAssert(not isAlpha("$"))
  1122. doAssert(not isAlpha(""))
  1123. doAssert isAlpha("Βeta")
  1124. doAssert isAlpha("Args")
  1125. doAssert isAlpha("𐌼𐌰𐌲𐌲𐌻𐌴𐍃𐍄𐌰𐌽")
  1126. doAssert isAlpha("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει")
  1127. doAssert isAlpha("Јамогујестистаклоитоминештети")
  1128. doAssert isAlpha("Կրնամապակիուտեևինծիանհանգիստչըներ")
  1129. doAssert(not isAlpha("$Foo✓"))
  1130. doAssert(not isAlpha("⠙⠕⠑⠎⠝⠞"))
  1131. doAssert isSpace("\t")
  1132. doAssert isSpace("\l")
  1133. doAssert(not isSpace("Β"))
  1134. doAssert(not isSpace("Βeta"))
  1135. doAssert isSpace("\t\l \v\r\f")
  1136. doAssert isSpace(" ")
  1137. doAssert(not isSpace(""))
  1138. doAssert(not isSpace("ΑΓc \td"))
  1139. doAssert(not isLower(' '.Rune))
  1140. doAssert(not isUpper(' '.Rune))
  1141. doAssert toUpper("Γ") == "Γ"
  1142. doAssert toUpper("b") == "B"
  1143. doAssert toUpper("α") == "Α"
  1144. doAssert toUpper("✓") == "✓"
  1145. doAssert toUpper("ϙ") == "Ϙ"
  1146. doAssert toUpper("") == ""
  1147. doAssert toUpper("ΑΒΓ") == "ΑΒΓ"
  1148. doAssert toUpper("AAccβ") == "AACCΒ"
  1149. doAssert toUpper("A✓$β") == "A✓$Β"
  1150. doAssert toLower("a") == "a"
  1151. doAssert toLower("γ") == "γ"
  1152. doAssert toLower("Γ") == "γ"
  1153. doAssert toLower("4") == "4"
  1154. doAssert toLower("Ϙ") == "ϙ"
  1155. doAssert toLower("") == ""
  1156. doAssert toLower("abcdγ") == "abcdγ"
  1157. doAssert toLower("abCDΓ") == "abcdγ"
  1158. doAssert toLower("33aaΓ") == "33aaγ"
  1159. doAssert reversed("Reverse this!") == "!siht esreveR"
  1160. doAssert reversed("先秦兩漢") == "漢兩秦先"
  1161. doAssert reversed("as⃝df̅") == "f̅ds⃝a"
  1162. doAssert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  1163. doAssert reversed("ὕαλονϕαγεῖνδύναμαιτοῦτοοὔμεβλάπτει") == "ιετπάλβεμὔοοτῦοτιαμανύδνῖεγαϕνολαὕ"
  1164. doAssert reversed("Јамогујестистаклоитоминештети") == "итетшенимотиолкатситсејугомаЈ"
  1165. doAssert reversed("Կրնամապակիուտեևինծիանհանգիստչըներ") == "րենըչտսիգնահնաիծնիևետւոիկապամանրԿ"
  1166. doAssert len(toRunes("as⃝df̅")) == runeLen("as⃝df̅")
  1167. const test = "as⃝"
  1168. doAssert lastRune(test, test.len-1)[1] == 3
  1169. doAssert graphemeLen("è", 0) == 2
  1170. # test for rune positioning and runeSubStr()
  1171. let s = "Hänsel ««: 10,00€"
  1172. var t = ""
  1173. for c in s.utf8:
  1174. t.add c
  1175. doAssert(s == t)
  1176. doAssert(runeReverseOffset(s, 1) == (20, 18))
  1177. doAssert(runeReverseOffset(s, 19) == (-1, 18))
  1178. doAssert(runeStrAtPos(s, 0) == "H")
  1179. doAssert(runeSubStr(s, 0, 1) == "H")
  1180. doAssert(runeStrAtPos(s, 10) == ":")
  1181. doAssert(runeSubStr(s, 10, 1) == ":")
  1182. doAssert(runeStrAtPos(s, 9) == "«")
  1183. doAssert(runeSubStr(s, 9, 1) == "«")
  1184. doAssert(runeStrAtPos(s, 17) == "€")
  1185. doAssert(runeSubStr(s, 17, 1) == "€")
  1186. # echo runeStrAtPos(s, 18) # index error
  1187. doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€")
  1188. doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€")
  1189. doAssert(runeSubStr(s, 10) == ": 10,00€")
  1190. doAssert(runeSubStr(s, 18) == "")
  1191. doAssert(runeSubStr(s, 0, 10) == "Hänsel ««")
  1192. doAssert(runeSubStr(s, 12) == "10,00€")
  1193. doAssert(runeSubStr(s, -6) == "10,00€")
  1194. doAssert(runeSubStr(s, 12, 5) == "10,00")
  1195. doAssert(runeSubStr(s, 12, -1) == "10,00")
  1196. doAssert(runeSubStr(s, -6, 5) == "10,00")
  1197. doAssert(runeSubStr(s, -6, -1) == "10,00")
  1198. doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€")
  1199. doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€")
  1200. doAssert(runeSubStr(s, 0, -100) == "")
  1201. doAssert(runeSubStr(s, 100, -100) == "")
  1202. block splitTests:
  1203. let s = " this is an example "
  1204. let s2 = ":this;is;an:example;;"
  1205. let s3 = ":this×is×an:example××"
  1206. doAssert s.split() == @["", "this", "is", "an", "example", "", ""]
  1207. doAssert s2.split(seps = [':'.Rune, ';'.Rune]) == @["", "this", "is", "an",
  1208. "example", "", ""]
  1209. doAssert s3.split(seps = [':'.Rune, "×".asRune]) == @["", "this", "is",
  1210. "an", "example", "", ""]
  1211. doAssert s.split(maxsplit = 4) == @["", "this", "is", "an", "example "]
  1212. doAssert s.split(' '.Rune, maxsplit = 1) == @["", "this is an example "]
  1213. doAssert s3.split("×".runeAt(0)) == @[":this", "is", "an:example", "", ""]
  1214. block stripTests:
  1215. doAssert(strip("") == "")
  1216. doAssert(strip(" ") == "")
  1217. doAssert(strip("y") == "y")
  1218. doAssert(strip(" foofoofoo ") == "foofoofoo")
  1219. doAssert(strip("sfoofoofoos", runes = ['s'.Rune]) == "foofoofoo")
  1220. block:
  1221. let stripTestRunes = ['b'.Rune, 'a'.Rune, 'r'.Rune]
  1222. doAssert(strip("barfoofoofoobar", runes = stripTestRunes) == "foofoofoo")
  1223. doAssert(strip("sfoofoofoos", leading = false, runes = ['s'.Rune]) == "sfoofoofoo")
  1224. doAssert(strip("sfoofoofoos", trailing = false, runes = ['s'.Rune]) == "foofoofoos")
  1225. block:
  1226. let stripTestRunes = ["«".asRune, "»".asRune]
  1227. doAssert(strip("«TEXT»", runes = stripTestRunes) == "TEXT")
  1228. doAssert(strip("copyright©", leading = false, runes = ["©".asRune]) == "copyright")
  1229. doAssert(strip("¿Question?", trailing = false, runes = ["¿".asRune]) == "Question?")
  1230. doAssert(strip("×text×", leading = false, runes = ["×".asRune]) == "×text")
  1231. doAssert(strip("×text×", trailing = false, runes = ["×".asRune]) == "text×")
  1232. block repeatTests:
  1233. doAssert repeat('c'.Rune, 5) == "ccccc"
  1234. doAssert repeat("×".asRune, 5) == "×××××"
  1235. block alignTests:
  1236. doAssert align("abc", 4) == " abc"
  1237. doAssert align("a", 0) == "a"
  1238. doAssert align("1232", 6) == " 1232"
  1239. doAssert align("1232", 6, '#'.Rune) == "##1232"
  1240. doAssert align("1232", 6, "×".asRune) == "××1232"
  1241. doAssert alignLeft("abc", 4) == "abc "
  1242. doAssert alignLeft("a", 0) == "a"
  1243. doAssert alignLeft("1232", 6) == "1232 "
  1244. doAssert alignLeft("1232", 6, '#'.Rune) == "1232##"
  1245. doAssert alignLeft("1232", 6, "×".asRune) == "1232××"
  1246. block differentSizes:
  1247. # upper and lower variants have different number of bytes
  1248. doAssert toLower("AẞC") == "aßc"
  1249. doAssert toLower("ȺẞCD") == "ⱥßcd"
  1250. doAssert toUpper("ⱥbc") == "ȺBC"
  1251. doAssert toUpper("rsⱦuv") == "RSȾUV"
  1252. doAssert swapCase("ⱥbCd") == "ȺBcD"
  1253. doAssert swapCase("XyꟆaB") == "xYᶎAb"
  1254. doAssert swapCase("aᵹcᲈd") == "AꝽCꙊD"