parseutils.nim 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2012 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module contains helpers for parsing tokens, numbers, identifiers, etc.
  10. ##
  11. ## To unpack raw bytes look at the `streams <streams.html>`_ module.
  12. {.deadCodeElim: on.} # dce option deprecated
  13. {.push debugger:off .} # the user does not want to trace a part
  14. # of the standard library!
  15. include "system/inclrtl"
  16. const
  17. Whitespace = {' ', '\t', '\v', '\r', '\l', '\f'}
  18. IdentChars = {'a'..'z', 'A'..'Z', '0'..'9', '_'}
  19. IdentStartChars = {'a'..'z', 'A'..'Z', '_'}
  20. ## copied from strutils
  21. proc toLower(c: char): char {.inline.} =
  22. result = if c in {'A'..'Z'}: chr(ord(c)-ord('A')+ord('a')) else: c
  23. proc parseHex*(s: string, number: var int, start = 0; maxLen = 0): int {.
  24. rtl, extern: "npuParseHex", noSideEffect.} =
  25. ## Parses a hexadecimal number and stores its value in ``number``.
  26. ##
  27. ## Returns the number of the parsed characters or 0 in case of an error. This
  28. ## proc is sensitive to the already existing value of ``number`` and will
  29. ## likely not do what you want unless you make sure ``number`` is zero. You
  30. ## can use this feature to *chain* calls, though the result int will quickly
  31. ## overflow. Example:
  32. ##
  33. ## .. code-block:: nim
  34. ## var value = 0
  35. ## discard parseHex("0x38", value)
  36. ## assert value == 56
  37. ## discard parseHex("0x34", value)
  38. ## assert value == 56 * 256 + 52
  39. ## value = -1
  40. ## discard parseHex("0x38", value)
  41. ## assert value == -200
  42. ##
  43. ## If ``maxLen == 0`` the length of the hexadecimal number has no upper bound.
  44. ## Else no more than ``start + maxLen`` characters are parsed, up to the
  45. ## length of the string.
  46. var i = start
  47. var foundDigit = false
  48. # get last index based on minimum `start + maxLen` or `s.len`
  49. let last = min(s.len, if maxLen == 0: s.len else: i+maxLen)
  50. if i+1 < last and s[i] == '0' and (s[i+1] in {'x', 'X'}): inc(i, 2)
  51. elif i < last and s[i] == '#': inc(i)
  52. while i < last:
  53. case s[i]
  54. of '_': discard
  55. of '0'..'9':
  56. number = number shl 4 or (ord(s[i]) - ord('0'))
  57. foundDigit = true
  58. of 'a'..'f':
  59. number = number shl 4 or (ord(s[i]) - ord('a') + 10)
  60. foundDigit = true
  61. of 'A'..'F':
  62. number = number shl 4 or (ord(s[i]) - ord('A') + 10)
  63. foundDigit = true
  64. else: break
  65. inc(i)
  66. if foundDigit: result = i-start
  67. proc parseOct*(s: string, number: var int, start = 0, maxLen = 0): int {.
  68. rtl, extern: "npuParseOct", noSideEffect.} =
  69. ## Parses an octal number and stores its value in ``number``. Returns
  70. ## the number of the parsed characters or 0 in case of an error.
  71. ##
  72. ## If ``maxLen == 0`` the length of the octal number has no upper bound.
  73. ## Else no more than ``start + maxLen`` characters are parsed, up to the
  74. ## length of the string.
  75. var i = start
  76. var foundDigit = false
  77. # get last index based on minimum `start + maxLen` or `s.len`
  78. let last = min(s.len, if maxLen == 0: s.len else: i+maxLen)
  79. if i+1 < last and s[i] == '0' and (s[i+1] in {'o', 'O'}): inc(i, 2)
  80. while i < last:
  81. case s[i]
  82. of '_': discard
  83. of '0'..'7':
  84. number = number shl 3 or (ord(s[i]) - ord('0'))
  85. foundDigit = true
  86. else: break
  87. inc(i)
  88. if foundDigit: result = i-start
  89. proc parseBin*(s: string, number: var int, start = 0, maxLen = 0): int {.
  90. rtl, extern: "npuParseBin", noSideEffect.} =
  91. ## Parses an binary number and stores its value in ``number``. Returns
  92. ## the number of the parsed characters or 0 in case of an error.
  93. ##
  94. ## If ``maxLen == 0`` the length of the binary number has no upper bound.
  95. ## Else no more than ``start + maxLen`` characters are parsed, up to the
  96. ## length of the string.
  97. var i = start
  98. var foundDigit = false
  99. # get last index based on minimum `start + maxLen` or `s.len`
  100. let last = min(s.len, if maxLen == 0: s.len else: i+maxLen)
  101. if i+1 < last and s[i] == '0' and (s[i+1] in {'b', 'B'}): inc(i, 2)
  102. while i < last:
  103. case s[i]
  104. of '_': discard
  105. of '0'..'1':
  106. number = number shl 1 or (ord(s[i]) - ord('0'))
  107. foundDigit = true
  108. else: break
  109. inc(i)
  110. if foundDigit: result = i-start
  111. proc parseIdent*(s: string, ident: var string, start = 0): int =
  112. ## parses an identifier and stores it in ``ident``. Returns
  113. ## the number of the parsed characters or 0 in case of an error.
  114. var i = start
  115. if i < s.len and s[i] in IdentStartChars:
  116. inc(i)
  117. while i < s.len and s[i] in IdentChars: inc(i)
  118. ident = substr(s, start, i-1)
  119. result = i-start
  120. proc parseIdent*(s: string, start = 0): string =
  121. ## parses an identifier and returns it or an empty string in
  122. ## case of an error.
  123. result = ""
  124. var i = start
  125. if i < s.len and s[i] in IdentStartChars:
  126. inc(i)
  127. while i < s.len and s[i] in IdentChars: inc(i)
  128. result = substr(s, start, i-1)
  129. proc parseToken*(s: string, token: var string, validChars: set[char],
  130. start = 0): int {.inline, deprecated.} =
  131. ## parses a token and stores it in ``token``. Returns
  132. ## the number of the parsed characters or 0 in case of an error. A token
  133. ## consists of the characters in `validChars`.
  134. ##
  135. ## **Deprecated since version 0.8.12**: Use ``parseWhile`` instead.
  136. var i = start
  137. while i < s.len and s[i] in validChars: inc(i)
  138. result = i-start
  139. token = substr(s, start, i-1)
  140. proc skipWhitespace*(s: string, start = 0): int {.inline.} =
  141. ## skips the whitespace starting at ``s[start]``. Returns the number of
  142. ## skipped characters.
  143. while start+result < s.len and s[start+result] in Whitespace: inc(result)
  144. proc skip*(s, token: string, start = 0): int {.inline.} =
  145. ## skips the `token` starting at ``s[start]``. Returns the length of `token`
  146. ## or 0 if there was no `token` at ``s[start]``.
  147. while start+result < s.len and result < token.len and
  148. s[result+start] == token[result]:
  149. inc(result)
  150. if result != token.len: result = 0
  151. proc skipIgnoreCase*(s, token: string, start = 0): int =
  152. ## same as `skip` but case is ignored for token matching.
  153. while start+result < s.len and result < token.len and
  154. toLower(s[result+start]) == toLower(token[result]): inc(result)
  155. if result != token.len: result = 0
  156. proc skipUntil*(s: string, until: set[char], start = 0): int {.inline.} =
  157. ## Skips all characters until one char from the set `until` is found
  158. ## or the end is reached.
  159. ## Returns number of characters skipped.
  160. while start+result < s.len and s[result+start] notin until: inc(result)
  161. proc skipUntil*(s: string, until: char, start = 0): int {.inline.} =
  162. ## Skips all characters until the char `until` is found
  163. ## or the end is reached.
  164. ## Returns number of characters skipped.
  165. while start+result < s.len and s[result+start] != until: inc(result)
  166. proc skipWhile*(s: string, toSkip: set[char], start = 0): int {.inline.} =
  167. ## Skips all characters while one char from the set `token` is found.
  168. ## Returns number of characters skipped.
  169. while start+result < s.len and s[result+start] in toSkip: inc(result)
  170. proc parseUntil*(s: string, token: var string, until: set[char],
  171. start = 0): int {.inline.} =
  172. ## parses a token and stores it in ``token``. Returns
  173. ## the number of the parsed characters or 0 in case of an error. A token
  174. ## consists of the characters notin `until`.
  175. var i = start
  176. while i < s.len and s[i] notin until: inc(i)
  177. result = i-start
  178. token = substr(s, start, i-1)
  179. proc parseUntil*(s: string, token: var string, until: char,
  180. start = 0): int {.inline.} =
  181. ## parses a token and stores it in ``token``. Returns
  182. ## the number of the parsed characters or 0 in case of an error. A token
  183. ## consists of any character that is not the `until` character.
  184. var i = start
  185. while i < s.len and s[i] != until: inc(i)
  186. result = i-start
  187. token = substr(s, start, i-1)
  188. proc parseUntil*(s: string, token: var string, until: string,
  189. start = 0): int {.inline.} =
  190. ## parses a token and stores it in ``token``. Returns
  191. ## the number of the parsed characters or 0 in case of an error. A token
  192. ## consists of any character that comes before the `until` token.
  193. if until.len == 0:
  194. token.setLen(0)
  195. return 0
  196. var i = start
  197. while i < s.len:
  198. if s[i] == until[0]:
  199. var u = 1
  200. while i+u < s.len and u < until.len and s[i+u] == until[u]:
  201. inc u
  202. if u >= until.len: break
  203. inc(i)
  204. result = i-start
  205. token = substr(s, start, i-1)
  206. proc parseWhile*(s: string, token: var string, validChars: set[char],
  207. start = 0): int {.inline.} =
  208. ## parses a token and stores it in ``token``. Returns
  209. ## the number of the parsed characters or 0 in case of an error. A token
  210. ## consists of the characters in `validChars`.
  211. var i = start
  212. while i < s.len and s[i] in validChars: inc(i)
  213. result = i-start
  214. token = substr(s, start, i-1)
  215. proc captureBetween*(s: string, first: char, second = '\0', start = 0): string =
  216. ## Finds the first occurrence of ``first``, then returns everything from there
  217. ## up to ``second`` (if ``second`` is '\0', then ``first`` is used).
  218. var i = skipUntil(s, first, start)+1+start
  219. result = ""
  220. discard s.parseUntil(result, if second == '\0': first else: second, i)
  221. {.push overflowChecks: on.}
  222. # this must be compiled with overflow checking turned on:
  223. proc rawParseInt(s: string, b: var BiggestInt, start = 0): int =
  224. var
  225. sign: BiggestInt = -1
  226. i = start
  227. if i < s.len:
  228. if s[i] == '+': inc(i)
  229. elif s[i] == '-':
  230. inc(i)
  231. sign = 1
  232. if i < s.len and s[i] in {'0'..'9'}:
  233. b = 0
  234. while i < s.len and s[i] in {'0'..'9'}:
  235. b = b * 10 - (ord(s[i]) - ord('0'))
  236. inc(i)
  237. while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored
  238. b = b * sign
  239. result = i - start
  240. {.pop.} # overflowChecks
  241. proc parseBiggestInt*(s: string, number: var BiggestInt, start = 0): int {.
  242. rtl, extern: "npuParseBiggestInt", noSideEffect.} =
  243. ## parses an integer starting at `start` and stores the value into `number`.
  244. ## Result is the number of processed chars or 0 if there is no integer.
  245. ## `OverflowError` is raised if an overflow occurs.
  246. var res: BiggestInt
  247. # use 'res' for exception safety (don't write to 'number' in case of an
  248. # overflow exception):
  249. result = rawParseInt(s, res, start)
  250. number = res
  251. proc parseInt*(s: string, number: var int, start = 0): int {.
  252. rtl, extern: "npuParseInt", noSideEffect.} =
  253. ## parses an integer starting at `start` and stores the value into `number`.
  254. ## Result is the number of processed chars or 0 if there is no integer.
  255. ## `OverflowError` is raised if an overflow occurs.
  256. var res: BiggestInt
  257. result = parseBiggestInt(s, res, start)
  258. if (sizeof(int) <= 4) and
  259. ((res < low(int)) or (res > high(int))):
  260. raise newException(OverflowError, "overflow")
  261. elif result != 0:
  262. number = int(res)
  263. proc parseSaturatedNatural*(s: string, b: var int, start = 0): int =
  264. ## parses a natural number into ``b``. This cannot raise an overflow
  265. ## error. ``high(int)`` is returned for an overflow.
  266. ## The number of processed character is returned.
  267. ## This is usually what you really want to use instead of `parseInt`:idx:.
  268. ## Example:
  269. ##
  270. ## .. code-block:: nim
  271. ## var res = 0
  272. ## discard parseSaturatedNatural("848", res)
  273. ## doAssert res == 848
  274. var i = start
  275. if i < s.len and s[i] == '+': inc(i)
  276. if i < s.len and s[i] in {'0'..'9'}:
  277. b = 0
  278. while i < s.len and s[i] in {'0'..'9'}:
  279. let c = ord(s[i]) - ord('0')
  280. if b <= (high(int) - c) div 10:
  281. b = b * 10 + c
  282. else:
  283. b = high(int)
  284. inc(i)
  285. while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored
  286. result = i - start
  287. # overflowChecks doesn't work with BiggestUInt
  288. proc rawParseUInt(s: string, b: var BiggestUInt, start = 0): int =
  289. var
  290. res = 0.BiggestUInt
  291. prev = 0.BiggestUInt
  292. i = start
  293. if i < s.len and s[i] == '+': inc(i) # Allow
  294. if i < s.len and s[i] in {'0'..'9'}:
  295. b = 0
  296. while i < s.len and s[i] in {'0'..'9'}:
  297. prev = res
  298. res = res * 10 + (ord(s[i]) - ord('0')).BiggestUInt
  299. if prev > res:
  300. return 0 # overflowChecks emulation
  301. inc(i)
  302. while i < s.len and s[i] == '_': inc(i) # underscores are allowed and ignored
  303. b = res
  304. result = i - start
  305. proc parseBiggestUInt*(s: string, number: var BiggestUInt, start = 0): int {.
  306. rtl, extern: "npuParseBiggestUInt", noSideEffect.} =
  307. ## parses an unsigned integer starting at `start` and stores the value
  308. ## into `number`.
  309. ## Result is the number of processed chars or 0 if there is no integer
  310. ## or overflow detected.
  311. var res: BiggestUInt
  312. # use 'res' for exception safety (don't write to 'number' in case of an
  313. # overflow exception):
  314. result = rawParseUInt(s, res, start)
  315. number = res
  316. proc parseUInt*(s: string, number: var uint, start = 0): int {.
  317. rtl, extern: "npuParseUInt", noSideEffect.} =
  318. ## parses an unsigned integer starting at `start` and stores the value
  319. ## into `number`.
  320. ## Result is the number of processed chars or 0 if there is no integer or
  321. ## overflow detected.
  322. var res: BiggestUInt
  323. result = parseBiggestUInt(s, res, start)
  324. when sizeof(BiggestUInt) > sizeof(uint) and sizeof(uint) <= 4:
  325. if res > 0xFFFF_FFFF'u64:
  326. raise newException(OverflowError, "overflow")
  327. if result != 0:
  328. number = uint(res)
proc parseBiggestFloat*(s: string, number: var BiggestFloat, start = 0): int {.
  magic: "ParseBiggestFloat", importc: "nimParseBiggestFloat", noSideEffect.}
  ## Parses a float starting at `start` and stores the value into `number`.
  ## Result is the number of processed chars or 0 if a parsing error
  ## occurred.
  ##
  ## This proc has no Nim body in this module; it is declared through the
  ## ``magic`` / ``importc`` pragmas above.
  334. proc parseFloat*(s: string, number: var float, start = 0): int {.
  335. rtl, extern: "npuParseFloat", noSideEffect.} =
  336. ## parses a float starting at `start` and stores the value into `number`.
  337. ## Result is the number of processed chars or 0 if there occurred a parsing
  338. ## error.
  339. var bf: BiggestFloat
  340. result = parseBiggestFloat(s, bf, start)
  341. if result != 0:
  342. number = bf
type
  InterpolatedKind* = enum ## Describes which part of an interpolated string
                           ## `interpolatedFragments` yields; the member
                           ## comments refer to the example string
                           ## "str$$$var${expr}".
    ikStr,                 ## ``str`` part of the interpolated string
    ikDollar,              ## escaped ``$`` part of the interpolated string
    ikVar,                 ## ``var`` part of the interpolated string
    ikExpr                 ## ``expr`` part of the interpolated string
iterator interpolatedFragments*(s: string): tuple[kind: InterpolatedKind,
  value: string] =
  ## Tokenizes the string `s` into substrings for interpolation purposes.
  ##
  ## Each yielded tuple holds the fragment kind and its text: literal text
  ## (``ikStr``), an escaped dollar written as ``$$`` (``ikDollar``), a
  ## ``$identifier`` (``ikVar``, without the ``$``) or a ``${...}``
  ## expression (``ikExpr``, without the ``${`` and the closing ``}``).
  ## Curly braces may be nested inside ``${...}``.
  ##
  ## Raises `ValueError` if a ``${`` is never closed or if a ``$`` is not
  ## followed by ``{``, an identifier start character or another ``$``.
  ##
  ## Example:
  ##
  ## .. code-block:: nim
  ##   for k, v in interpolatedFragments(" $this is ${an example} $$"):
  ##     echo "(", k, ", \"", v, "\")"
  ##
  ## Results in:
  ##
  ## .. code-block:: nim
  ##   (ikStr, " ")
  ##   (ikVar, "this")
  ##   (ikStr, " is ")
  ##   (ikExpr, "an example")
  ##   (ikStr, " ")
  ##   (ikDollar, "$")
  var i = 0 # start of the fragment currently being scanned
  var kind: InterpolatedKind
  while true:
    var j = i # scans forward to one past the end of the fragment
    if j < s.len and s[j] == '$':
      if j+1 < s.len and s[j+1] == '{':
        inc j, 2
        # depth of '{' opened inside the expression itself
        var nesting = 0
        block curlies:
          while j < s.len:
            case s[j]
            of '{': inc nesting
            of '}':
              if nesting == 0:
                # matching brace of the "${": step past it and stop scanning
                inc j
                break curlies
              dec nesting
            else: discard
            inc j
          # only reached when the string ended before the closing brace
          raise newException(ValueError,
            "Expected closing '}': " & substr(s, i, s.high))
        inc i, 2 # skip ${
        kind = ikExpr
      elif j+1 < s.len and s[j+1] in IdentStartChars:
        inc j, 2
        while j < s.len and s[j] in IdentChars: inc(j)
        inc i # skip $
        kind = ikVar
      elif j+1 < s.len and s[j+1] == '$':
        inc j, 2
        inc i # skip $
        kind = ikDollar
      else:
        raise newException(ValueError,
          "Unable to parse a varible name at " & substr(s, i, s.high))
    else:
      # literal text runs up to the next '$' or the end of the string
      while j < s.len and s[j] != '$': inc j
      kind = ikStr
    if j > i:
      # do not copy the trailing } for ikExpr:
      yield (kind, substr(s, i, j-1-ord(kind == ikExpr)))
    else:
      # nothing left to scan
      break
    i = j
  414. when isMainModule:
  415. import sequtils
  416. let input = "$test{} $this is ${an{ example}} "
  417. let expected = @[(ikVar, "test"), (ikStr, "{} "), (ikVar, "this"),
  418. (ikStr, " is "), (ikExpr, "an{ example}"), (ikStr, " ")]
  419. doAssert toSeq(interpolatedFragments(input)) == expected
  420. var value = 0
  421. discard parseHex("0x38", value)
  422. doAssert value == 56
  423. discard parseHex("0x34", value)
  424. doAssert value == 56 * 256 + 52
  425. value = -1
  426. discard parseHex("0x38", value)
  427. doAssert value == -200
  428. value = -1
  429. doAssert(parseSaturatedNatural("848", value) == 3)
  430. doAssert value == 848
  431. value = -1
  432. discard parseSaturatedNatural("84899999999999999999324234243143142342135435342532453", value)
  433. doAssert value == high(int)
  434. value = -1
  435. discard parseSaturatedNatural("9223372036854775808", value)
  436. doAssert value == high(int)
  437. value = -1
  438. discard parseSaturatedNatural("9223372036854775807", value)
  439. doAssert value == high(int)
  440. value = -1
  441. discard parseSaturatedNatural("18446744073709551616", value)
  442. doAssert value == high(int)
  443. value = -1
  444. discard parseSaturatedNatural("18446744073709551615", value)
  445. doAssert value == high(int)
  446. value = -1
  447. doAssert(parseSaturatedNatural("1_000_000", value) == 9)
  448. doAssert value == 1_000_000
  449. {.pop.}