parsejson.nim 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a json parser. It is used
  10. ## and exported by the ``json`` standard library
  11. ## module, but can also be used in its own right.
  12. import
  13. strutils, lexbase, streams, unicode
  type
    JsonEventKind* = enum ## Events that ``next`` can report while parsing.
      jsonError,          ## an error occurred during parsing
      jsonEof,            ## end of file reached
      jsonString,         ## a string literal
      jsonInt,            ## an integer literal
      jsonFloat,          ## a float literal
      jsonTrue,           ## the value ``true``
      jsonFalse,          ## the value ``false``
      jsonNull,           ## the value ``null``
      jsonObjectStart,    ## start of an object: the ``{`` token
      jsonObjectEnd,      ## end of an object: the ``}`` token
      jsonArrayStart,     ## start of an array: the ``[`` token
      jsonArrayEnd        ## end of an array: the ``]`` token

    TokKind* = enum
      # Lexer tokens. The members up to `tkBracketRi` must stay in the same
      # order as `JsonEventKind`, because a token is turned into an event
      # with `JsonEventKind(ord(tok))`.
      tkError,
      tkEof,
      tkString,
      tkInt,
      tkFloat,
      tkTrue,
      tkFalse,
      tkNull,
      tkCurlyLe,
      tkCurlyRi,
      tkBracketLe,
      tkBracketRi,
      tkColon,
      tkComma

    JsonError* = enum       ## All error codes the parser can report.
      errNone,              ## no error
      errInvalidToken,      ## invalid token
      errStringExpected,    ## string expected
      errColonExpected,     ## ``:`` expected
      errCommaExpected,     ## ``,`` expected
      errBracketRiExpected, ## ``]`` expected
      errCurlyRiExpected,   ## ``}`` expected
      errQuoteExpected,     ## ``"`` or ``'`` expected
      errEOC_Expected,      ## ``*/`` expected
      errEofExpected,       ## EOF expected
      errExprExpected       ## expr expected

    ParserState = enum
      # Entries of the parser's state stack (`JsonParser.state`).
      stateEof,
      stateStart,
      stateObject,
      stateArray,
      stateExpectArrayComma,
      stateExpectObjectComma,
      stateExpectColon,
      stateExpectValue

    JsonParser* = object of BaseLexer ## The parser object.
      a*: string              ## character data of the current token
      tok*: TokKind           ## token most recently produced by ``getTok``
      kind: JsonEventKind     # current event, exposed through `kind()`
      err: JsonError          # error code belonging to a `jsonError` event
      state: seq[ParserState] # stack of parser states; the last entry is active
      filename: string        # only used when formatting error messages
      rawStringLiterals: bool # keep quotes and escapes of string literals

    JsonKindError* = object of ValueError    ## raised by the ``to`` macro if
                                             ## the JSON kind is incorrect.
    JsonParsingError* = object of ValueError ## is raised for a JSON error
  const
    errorMessages*: array[JsonError, string] = [
      ## Human readable text for every ``JsonError`` code.
      errNone: "no error",
      errInvalidToken: "invalid token",
      errStringExpected: "string expected",
      errColonExpected: "':' expected",
      errCommaExpected: "',' expected",
      errBracketRiExpected: "']' expected",
      errCurlyRiExpected: "'}' expected",
      errQuoteExpected: "'\"' or \"'\" expected",
      errEOC_Expected: "'*/' expected",
      errEofExpected: "EOF expected",
      errExprExpected: "expression expected"
    ]
    tokToStr: array[TokKind, string] = [
      # Display name of every token kind; used to build "X expected" messages.
      tkError: "invalid token",
      tkEof: "EOF",
      tkString: "string literal",
      tkInt: "int literal",
      tkFloat: "float literal",
      tkTrue: "true",
      tkFalse: "false",
      tkNull: "null",
      tkCurlyLe: "{",
      tkCurlyRi: "}",
      tkBracketLe: "[",
      tkBracketRi: "]",
      tkColon: ":",
      tkComma: ","
    ]
  proc open*(my: var JsonParser, input: Stream, filename: string;
             rawStringLiterals = false) =
    ## Prepares `my` for reading JSON from `input`. `filename` is only used
    ## when error messages are formatted. When `rawStringLiterals` is true,
    ## string literals keep their surrounding quotes and their escape
    ## sequences are left untouched.
    lexbase.open(my, input)
    # Reset the parser-specific fields; the state stack starts with a single
    # `stateStart` entry.
    my.a = ""
    my.kind = jsonError
    my.state = @[stateStart]
    my.filename = filename
    my.rawStringLiterals = rawStringLiterals
  proc close*(my: var JsonParser) {.inline.} =
    ## Closes the parser `my` together with the input stream it reads from.
    # Must stay module-qualified: an unqualified `close(my)` would resolve to
    # this very proc.
    lexbase.close(my)
  proc str*(my: JsonParser): string {.inline.} =
    ## Returns the character data of the current event. Only valid for the
    ## events ``jsonInt``, ``jsonFloat`` and ``jsonString``.
    assert(my.kind in {jsonInt, jsonFloat, jsonString})
    result = my.a
  proc getInt*(my: JsonParser): BiggestInt {.inline.} =
    ## Returns the numeric value of the current ``jsonInt`` event by parsing
    ## the token text with ``parseBiggestInt``.
    assert(my.kind == jsonInt)
    result = my.a.parseBiggestInt
  proc getFloat*(my: JsonParser): float {.inline.} =
    ## Returns the numeric value of the current ``jsonFloat`` event by parsing
    ## the token text with ``parseFloat``.
    assert(my.kind == jsonFloat)
    result = my.a.parseFloat
  proc kind*(my: JsonParser): JsonEventKind {.inline.} =
    ## Returns the event the JSON parser is currently positioned at.
    result = my.kind
  proc getColumn*(my: JsonParser): int {.inline.} =
    ## Returns the column the parser has arrived at, as computed by
    ## ``getColNumber`` for the current buffer position.
    return my.getColNumber(my.bufpos)
  proc getLine*(my: JsonParser): int {.inline.} =
    ## Returns the line the parser has arrived at.
    return my.lineNumber
  proc getFilename*(my: JsonParser): string {.inline.} =
    ## Returns the filename that was handed to ``open``.
    return my.filename
  proc errorMsg*(my: JsonParser): string =
    ## Returns a helpful error message for the event ``jsonError``, formatted
    ## as ``filename(line, column) Error: message``.
    assert(my.kind == jsonError)
    let location = my.filename & "(" & $getLine(my) & ", " &
                   $getColumn(my) & ")"
    result = location & " Error: " & errorMessages[my.err]
  proc errorMsgExpected*(my: JsonParser, e: string): string =
    ## Returns the message "`e` expected", formatted like the other error
    ## messages: ``filename(line, column) Error: e expected``.
    let location = my.filename & "(" & $getLine(my) & ", " &
                   $getColumn(my) & ")"
    result = location & " Error: " & e & " expected"
  proc handleHexChar(c: char, x: var int): bool =
    ## Shifts the hexadecimal digit `c` into the low four bits of `x`.
    ## Returns false and leaves `x` unchanged when `c` is not a hex digit.
    var digit: int
    case c
    of '0'..'9': digit = ord(c) - ord('0')
    of 'a'..'f': digit = ord(c) - ord('a') + 10
    of 'A'..'F': digit = ord(c) - ord('A') + 10
    else: return false
    x = (x shl 4) or digit
    result = true
  proc parseEscapedUTF16*(buf: cstring, pos: var int): int =
    ## Reads the four hexadecimal digits of a ``\uXXXX`` escape from `buf`,
    ## starting at `pos`, and returns the UTF-16 code unit they encode.
    ## `pos` is advanced past every digit that was accepted. Returns -1 as
    ## soon as a character is not a hex digit; `pos` then points at it.
    var codeUnit = 0
    var remaining = 4 # a UTF-16 escape always has exactly four hex digits
    while remaining > 0:
      if not handleHexChar(buf[pos], codeUnit):
        return -1
      inc(pos)
      dec(remaining)
    result = codeUnit
  proc parseString(my: var JsonParser): TokKind =
    ## Lexes the string literal whose opening quote is at ``my.bufpos`` and
    ## appends its content to ``my.a``. ``my.bufpos`` is updated to the
    ## position where lexing stopped.
    ##
    ## Returns ``tkString`` on success. Returns ``tkError`` and sets
    ## ``my.err`` when the input ends before the closing quote
    ## (``errQuoteExpected``) or when a ``\u`` escape is malformed or a high
    ## surrogate is not followed by a valid low surrogate
    ## (``errInvalidToken``).
    ##
    ## Without ``rawStringLiterals`` escape sequences are decoded and
    ## ``\uXXXX`` escapes (including surrogate pairs) are appended as UTF-8.
    ## With ``rawStringLiterals`` the surrounding quotes are kept and every
    ## escape sequence is copied exactly as written in the input.
    result = tkString
    var pos = my.bufpos + 1 # step over the opening quote
    if my.rawStringLiterals:
      add(my.a, '"')
    while true:
      case my.buf[pos]
      of '\0':
        # input ended inside the literal
        my.err = errQuoteExpected
        result = tkError
        break
      of '"':
        if my.rawStringLiterals:
          add(my.a, '"')
        inc(pos)
        break
      of '\\':
        if my.rawStringLiterals:
          add(my.a, '\\')
        let esc = my.buf[pos+1]
        case esc
        of '\\', '"', '\'', '/', 'b', 'f', 'n', 'r', 't', 'v':
          if my.rawStringLiterals:
            # raw mode: keep the escape letter itself instead of decoding it
            add(my.a, esc)
          else:
            case esc
            of 'b': add(my.a, '\b')
            of 'f': add(my.a, '\f')
            of 'n': add(my.a, '\L')
            of 'r': add(my.a, '\C')
            of 't': add(my.a, '\t')
            of 'v': add(my.a, '\v')
            else: add(my.a, esc) # '\\', '"', '\'' and '/' stand for themselves
          inc(pos, 2)
        of 'u':
          if my.rawStringLiterals:
            add(my.a, 'u')
          inc(pos, 2)
          let escStart = pos # first hex digit of the escape
          var r = parseEscapedUTF16(my.buf, pos)
          if r < 0:
            my.err = errInvalidToken
            result = tkError
            break
          # Deal with surrogates: a high surrogate must be followed directly
          # by a second escape that holds the low surrogate.
          if (r and 0xfc00) == 0xd800:
            if my.buf[pos] != '\\' or my.buf[pos+1] != 'u':
              my.err = errInvalidToken
              result = tkError
              break
            inc(pos, 2)
            let s = parseEscapedUTF16(my.buf, pos)
            if (s and 0xfc00) == 0xdc00 and s > 0:
              r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00))
            else:
              my.err = errInvalidToken
              result = tkError
              break
          if my.rawStringLiterals:
            # Everything in escStart ..< pos was validated above: four hex
            # digits, optionally followed by `\u` and four more hex digits.
            # Copy it verbatim so a surrogate pair keeps its second half.
            for i in escStart ..< pos:
              add(my.a, my.buf[i])
          else:
            add(my.a, toUTF8(Rune(r)))
        else:
          # Unknown escape: don't bother with the error and keep the backslash
          # (raw mode has already appended it); the next character is then
          # handled like ordinary content.
          if not my.rawStringLiterals:
            add(my.a, '\\')
          inc(pos)
      of '\c':
        pos = lexbase.handleCR(my, pos)
        add(my.a, '\c')
      of '\L':
        pos = lexbase.handleLF(my, pos)
        add(my.a, '\L')
      else:
        add(my.a, my.buf[pos])
        inc(pos)
    my.bufpos = pos # store back
  proc skipLineComment(my: var JsonParser, start: int): int =
    # Skips a `//` comment. `start` is the index just past the `//`. Returns
    # the index after the terminating line break, or the index of the NUL
    # character if the input ends first.
    result = start
    while true:
      case my.buf[result]
      of '\0':
        break
      of '\c':
        result = lexbase.handleCR(my, result)
        break
      of '\L':
        result = lexbase.handleLF(my, result)
        break
      else:
        inc(result)

  proc skipBlockComment(my: var JsonParser, start: int): int =
    # Skips a `/* ... */` comment. `start` is the index just past the `/*`.
    # Returns the index after the closing `*/`. If the input ends first,
    # `my.err` is set to `errEOC_Expected` and the index of the NUL character
    # is returned.
    result = start
    while true:
      case my.buf[result]
      of '\0':
        my.err = errEOC_Expected
        break
      of '\c':
        result = lexbase.handleCR(my, result)
      of '\L':
        result = lexbase.handleLF(my, result)
      of '*':
        inc(result)
        if my.buf[result] == '/':
          inc(result)
          break
      else:
        inc(result)

  proc skip(my: var JsonParser) =
    # Advances `my.bufpos` past whitespace, line breaks, `//` line comments
    # and `/* */` block comments.
    var cur = my.bufpos
    while true:
      case my.buf[cur]
      of '/':
        case my.buf[cur+1]
        of '/':
          cur = skipLineComment(my, cur + 2)
        of '*':
          cur = skipBlockComment(my, cur + 2)
        else:
          break # a lone '/' is not a comment; leave it for the tokenizer
      of ' ', '\t':
        inc(cur)
      of '\c':
        cur = lexbase.handleCR(my, cur)
      of '\L':
        cur = lexbase.handleLF(my, cur)
      else:
        break
    my.bufpos = cur
  proc parseNumber(my: var JsonParser) =
    # Appends the text of the number literal at `my.bufpos` to `my.a` and
    # moves `my.bufpos` behind it. Accepted shape: optional '-', digits with
    # an optional fraction (a leading '.' is stored as "0."), and an optional
    # exponent with optional sign.
    var cur = my.bufpos

    template take() =
      # copy the current character and step over it
      add(my.a, my.buf[cur])
      inc(cur)

    template takeDigits() =
      while my.buf[cur] in Digits:
        take()

    if my.buf[cur] == '-':
      take()
    if my.buf[cur] == '.':
      # no integer part: normalise ".5" to "0.5"
      add(my.a, "0.")
      inc(cur)
    else:
      takeDigits()
      if my.buf[cur] == '.':
        take()
    # digits after the dot:
    takeDigits()
    if my.buf[cur] in {'E', 'e'}:
      take()
      if my.buf[cur] in {'+', '-'}:
        take()
      takeDigits()
    my.bufpos = cur
  proc parseName(my: var JsonParser) =
    # Appends the identifier at `my.bufpos` to `my.a` and moves `my.bufpos`
    # behind it. Nothing happens when the current character cannot start an
    # identifier.
    let first = my.bufpos
    var stop = first
    if my.buf[stop] in IdentStartChars:
      # find the end of the identifier first, then copy it in one go
      while my.buf[stop] in IdentChars:
        inc(stop)
      for i in first ..< stop:
        add(my.a, my.buf[i])
    my.bufpos = stop
  proc getTok*(my: var JsonParser): TokKind =
    ## Reads the next token. ``my.a`` is cleared first and then receives the
    ## token text for numbers, strings and names. The token is also stored in
    ## ``my.tok``. A name other than ``null``, ``true`` or ``false`` and any
    ## unrecognised character yield ``tkError``.
    setLen(my.a, 0)
    skip(my) # skip whitespace, comments

    template punctuation(tk: TokKind): TokKind =
      # single-character token: step over it and report `tk`
      inc(my.bufpos)
      tk

    result =
      case my.buf[my.bufpos]
      of '-', '.', '0'..'9':
        parseNumber(my)
        # a fraction or an exponent makes the literal a float
        if {'.', 'e', 'E'} in my.a: tkFloat else: tkInt
      of '"':
        parseString(my)
      of '[': punctuation(tkBracketLe)
      of '{': punctuation(tkCurlyLe)
      of ']': punctuation(tkBracketRi)
      of '}': punctuation(tkCurlyRi)
      of ',': punctuation(tkComma)
      of ':': punctuation(tkColon)
      of '\0':
        tkEof
      of 'a'..'z', 'A'..'Z', '_':
        parseName(my)
        case my.a
        of "null": tkNull
        of "true": tkTrue
        of "false": tkFalse
        else: tkError
      else:
        punctuation(tkError)
    my.tok = result
  proc next*(my: var JsonParser) =
    ## Retrieves the first/next event. This controls the parser: it reads one
    ## token, updates the state stack and stores the resulting event, which
    ## is available through ``kind``. Commas and colons produce no event of
    ## their own; the proc recurses to deliver the event that follows them.
    const
      scalarToks = {tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull}
      valueToks = scalarToks + {tkBracketLe, tkCurlyLe}
    let tk = getTok(my)
    let top = my.state.len - 1 # index of the active state

    template fail(code: JsonError) =
      my.kind = jsonError
      my.err = code

    template startValue() =
      # Emits the event for a token from `valueToks`; containers also push
      # their own state.
      case tk
      of tkBracketLe:
        my.state.add(stateArray)
        my.kind = jsonArrayStart
      of tkCurlyLe:
        my.state.add(stateObject)
        my.kind = jsonObjectStart
      else:
        # scalar tokens share their ordinal with the matching event
        my.kind = JsonEventKind(ord(tk))

    # The following code is a state machine driven by the top of the stack.
    case my.state[top]
    of stateEof:
      if tk == tkEof:
        my.kind = jsonEof
      else:
        fail(errEofExpected)
    of stateStart:
      if tk in scalarToks:
        my.state[top] = stateEof # expect EOF next!
        startValue()
      elif tk in valueToks:
        startValue()
      elif tk == tkEof:
        my.kind = jsonEof
      else:
        fail(errEofExpected)
    of stateObject:
      if tk in valueToks:
        my.state.add(stateExpectColon)
        startValue()
      elif tk == tkCurlyRi:
        my.kind = jsonObjectEnd
        discard my.state.pop()
      else:
        fail(errCurlyRiExpected)
    of stateArray:
      if tk in valueToks:
        my.state.add(stateExpectArrayComma) # a comma or `]` must follow
        startValue()
      elif tk == tkBracketRi:
        my.kind = jsonArrayEnd
        discard my.state.pop()
      else:
        fail(errBracketRiExpected)
    of stateExpectArrayComma:
      if tk == tkComma:
        discard my.state.pop()
        next(my)
      elif tk == tkBracketRi:
        my.kind = jsonArrayEnd
        discard my.state.pop() # pop stateExpectArrayComma
        discard my.state.pop() # pop stateArray
      else:
        fail(errBracketRiExpected)
    of stateExpectObjectComma:
      if tk == tkComma:
        discard my.state.pop()
        next(my)
      elif tk == tkCurlyRi:
        my.kind = jsonObjectEnd
        discard my.state.pop() # pop stateExpectObjectComma
        discard my.state.pop() # pop stateObject
      else:
        fail(errCurlyRiExpected)
    of stateExpectColon:
      if tk == tkColon:
        my.state[top] = stateExpectValue
        next(my)
      else:
        fail(errColonExpected)
    of stateExpectValue:
      if tk in valueToks:
        my.state[top] = stateExpectObjectComma
        startValue()
      else:
        fail(errExprExpected)
  proc raiseParseErr*(p: JsonParser, msg: string) {.noinline, noreturn.} =
    ## Raises a ``JsonParsingError`` whose message says that `msg` was
    ## expected at the parser's current position.
    let text = errorMsgExpected(p, msg)
    raise newException(JsonParsingError, text)
  proc eat*(p: var JsonParser, tok: TokKind) =
    ## Consumes the current token if it is `tok` and reads the next one;
    ## otherwise raises a ``JsonParsingError`` naming the expected token.
    if p.tok != tok:
      raiseParseErr(p, tokToStr[tok]) # noreturn
    discard getTok(p)