parsejson.nim 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a json parser. It is used
  10. ## and exported by the ``json`` standard library
  11. ## module, but can also be used in its own right.
  12. import
  13. strutils, lexbase, streams, unicode
  type
    JsonEventKind* = enum ## enumeration of all events that may occur when parsing
      jsonError,       ## an error occurred during parsing
      jsonEof,         ## end of file reached
      jsonString,      ## a string literal
      jsonInt,         ## an integer literal
      jsonFloat,       ## a float literal
      jsonTrue,        ## the value ``true``
      jsonFalse,       ## the value ``false``
      jsonNull,        ## the value ``null``
      jsonObjectStart, ## start of an object: the ``{`` token
      jsonObjectEnd,   ## end of an object: the ``}`` token
      jsonArrayStart,  ## start of an array: the ``[`` token
      jsonArrayEnd     ## end of an array: the ``]`` token

    TokKind* = enum # must be synchronized with JsonEventKind!
                    # ``next`` converts the first eight members with
                    # ``JsonEventKind(ord(tk))``, so their order must match.
      tkError,
      tkEof,
      tkString,
      tkInt,
      tkFloat,
      tkTrue,
      tkFalse,
      tkNull,
      tkCurlyLe,
      tkCurlyRi,
      tkBracketLe,
      tkBracketRi,
      tkColon,
      tkComma

    JsonError* = enum        ## enumeration that lists all errors that can occur
      errNone,               ## no error
      errInvalidToken,       ## invalid token
      errStringExpected,     ## string expected
      errColonExpected,      ## ``:`` expected
      errCommaExpected,      ## ``,`` expected
      errBracketRiExpected,  ## ``]`` expected
      errCurlyRiExpected,    ## ``}`` expected
      errQuoteExpected,      ## ``"`` or ``'`` expected
      errEOC_Expected,       ## ``*/`` expected
      errEofExpected,        ## EOF expected
      errExprExpected        ## expr expected

    ParserState = enum
      # Entries of the state stack kept in ``JsonParser.state``; the top entry
      # decides which tokens ``next`` accepts.
      stateEof, stateStart, stateObject, stateArray, stateExpectArrayComma,
      stateExpectObjectComma, stateExpectColon, stateExpectValue

    JsonParser* = object of BaseLexer ## the parser object.
      a*: string               ## text of the current token; cleared at the
                               ## start of every ``getTok`` call
      tok*: TokKind            ## token returned by the latest ``getTok`` call
      kind: JsonEventKind      ## current event, as set by ``next``
      err: JsonError           ## error code reported by ``errorMsg``
      state: seq[ParserState]  ## stack of parser states driving ``next``
      filename: string         ## only used when formatting error messages
      rawStringLiterals: bool  ## keep quotes and escapes of string literals

    JsonKindError* = object of ValueError ## raised by the ``to`` macro if the
                                          ## JSON kind is incorrect.
    JsonParsingError* = object of ValueError ## is raised for a JSON error
  const
    errorMessages*: array[JsonError, string] = [
      ## Human readable text for every ``JsonError``; used by ``errorMsg``.
      errNone: "no error",
      errInvalidToken: "invalid token",
      errStringExpected: "string expected",
      errColonExpected: "':' expected",
      errCommaExpected: "',' expected",
      errBracketRiExpected: "']' expected",
      errCurlyRiExpected: "'}' expected",
      errQuoteExpected: "'\"' or \"'\" expected",
      errEOC_Expected: "'*/' expected",
      errEofExpected: "EOF expected",
      errExprExpected: "expression expected"
    ]
    tokToStr: array[TokKind, string] = [
      # Display form of every token kind; ``eat`` uses it to name the token
      # it expected.
      tkError: "invalid token",
      tkEof: "EOF",
      tkString: "string literal",
      tkInt: "int literal",
      tkFloat: "float literal",
      tkTrue: "true",
      tkFalse: "false",
      tkNull: "null",
      tkCurlyLe: "{",
      tkCurlyRi: "}",
      tkBracketLe: "[",
      tkBracketRi: "]",
      tkColon: ":",
      tkComma: ","
    ]
  proc open*(my: var JsonParser, input: Stream, filename: string;
             rawStringLiterals = false) =
    ## Prepares the parser `my` for reading from the stream `input`.
    ##
    ## `filename` is only used to produce nice error messages. When
    ## `rawStringLiterals` is true, string literals are kept with their
    ## surrounding quotes and the escape sequences inside them are left
    ## untouched.
    lexbase.open(my, input)
    # Reset all parser-specific fields; the assignments are independent.
    my.a = ""
    my.kind = jsonError
    my.state = @[stateStart]
    my.rawStringLiterals = rawStringLiterals
    my.filename = filename
  proc close*(my: var JsonParser) {.inline.} =
    ## Shuts down the parser `my` together with the input stream it reads from.
    # Qualified call so that the lexer's ``close`` is used, not this proc.
    lexbase.close(my)
  proc str*(my: JsonParser): string {.inline.} =
    ## Character data of the current event. Only valid for the events
    ## ``jsonInt``, ``jsonFloat`` and ``jsonString``.
    assert my.kind in {jsonInt, jsonFloat, jsonString}
    result = my.a
  proc getInt*(my: JsonParser): BiggestInt {.inline.} =
    ## Numeric value of the current event, which must be ``jsonInt``.
    assert my.kind == jsonInt
    result = parseBiggestInt(my.a)
  proc getFloat*(my: JsonParser): float {.inline.} =
    ## Numeric value of the current event, which must be ``jsonFloat``.
    assert my.kind == jsonFloat
    result = parseFloat(my.a)
  proc kind*(my: JsonParser): JsonEventKind {.inline.} =
    ## The type of the event the JSON parser is currently positioned at.
    result = my.kind
  proc getColumn*(my: JsonParser): int {.inline.} =
    ## Column the parser has currently arrived at.
    return my.getColNumber(my.bufpos)
  proc getLine*(my: JsonParser): int {.inline.} =
    ## Line the parser has currently arrived at.
    return my.lineNumber
  proc getFilename*(my: JsonParser): string {.inline.} =
    ## Name of the file the parser is processing, as passed to ``open``.
    return my.filename
  proc errorMsg*(my: JsonParser): string =
    ## Builds a helpful error message for the event ``jsonError``, formatted
    ## as ``filename(line, column) Error: message``.
    assert my.kind == jsonError
    let
      line = $getLine(my)
      column = $getColumn(my)
      reason = errorMessages[my.err]
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, reason]
  proc errorMsgExpected*(my: JsonParser, e: string): string =
    ## Builds the message "`e` expected", using the same
    ## ``filename(line, column) Error: ...`` layout as the other error messages.
    let
      line = $getLine(my)
      column = $getColumn(my)
      reason = e & " expected"
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, reason]
  proc handleHexChar(c: char, x: var int): bool =
    ## Shifts the hexadecimal digit `c` into the low four bits of `x`.
    ## Returns false and leaves `x` untouched when `c` is not a hex digit.
    let digit =
      case c
      of '0'..'9': ord(c) - ord('0')
      of 'a'..'f': ord(c) - ord('a') + 10
      of 'A'..'F': ord(c) - ord('A') + 10
      else: -1
    if digit < 0:
      return false
    x = (x shl 4) or digit
    result = true
  proc parseEscapedUTF16*(buf: cstring, pos: var int): int =
    ## Reads the four hex digits of a ``\uXXXX`` escape starting at `buf[pos]`
    ## and returns their value. `pos` is advanced past every digit consumed.
    ## Returns -1 as soon as a non-hex character is met; `pos` then points at
    ## that character.
    var code = 0
    var remaining = 4 # a UTF-16 escape always has exactly four hex digits
    while remaining > 0:
      if not handleHexChar(buf[pos], code):
        return -1
      inc(pos)
      dec(remaining)
    result = code
  proc parseString(my: var JsonParser): TokKind =
    ## Lexes the string literal whose opening ``"`` is at `my.bufpos` and
    ## appends its content to `my.a`.
    ##
    ## Returns ``tkString`` on success. Returns ``tkError`` and sets `my.err`
    ## when the input ends before the closing quote (``errQuoteExpected``) or
    ## when a ``\u`` escape is malformed (``errInvalidToken``).
    ##
    ## When `my.rawStringLiterals` is set, the surrounding quotes and all
    ## escape sequences are copied verbatim instead of being decoded.
    result = tkString
    var pos = my.bufpos + 1 # step over the opening quote
    var buf = my.buf
    if my.rawStringLiterals:
      add(my.a, '"')
    while true:
      case buf[pos]
      of '\0':
        my.err = errQuoteExpected
        result = tkError
        break
      of '"':
        if my.rawStringLiterals:
          add(my.a, '"')
        inc(pos)
        break
      of '\\':
        if my.rawStringLiterals:
          add(my.a, '\\')
        let esc = buf[pos+1]
        case esc
        of '\\', '"', '\'', '/':
          add(my.a, esc)
          inc(pos, 2)
        of 'b', 'f', 'n', 'r', 't':
          if my.rawStringLiterals:
            # raw mode keeps the escape as written
            add(my.a, esc)
          else:
            case esc
            of 'b': add(my.a, '\b')
            of 'f': add(my.a, '\f')
            of 'n': add(my.a, '\L')
            of 'r': add(my.a, '\C')
            else: add(my.a, '\t')
          inc(pos, 2)
        of 'u':
          if my.rawStringLiterals:
            add(my.a, 'u')
          inc(pos, 2)
          let hexStart = pos
          var r = parseEscapedUTF16(buf, pos)
          if r < 0:
            my.err = errInvalidToken
            result = tkError # report the malformed escape to the caller
            break
          # Deal with surrogates: a high surrogate must be followed by a
          # ``\u`` escape holding the low surrogate.
          if (r and 0xfc00) == 0xd800:
            if buf[pos] != '\\' or buf[pos+1] != 'u':
              my.err = errInvalidToken
              result = tkError
              break
            inc(pos, 2)
            let s = parseEscapedUTF16(buf, pos)
            if s > 0 and (s and 0xfc00) == 0xdc00:
              r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00))
            else:
              my.err = errInvalidToken
              result = tkError
              break
          if my.rawStringLiterals:
            # Everything in hexStart ..< pos was validated above, so it can be
            # copied as is; this also keeps the second half of a surrogate
            # pair.
            for i in hexStart ..< pos:
              add(my.a, buf[i])
          else:
            add(my.a, toUTF8(Rune(r)))
        else:
          # Unknown escape: don't bother with the error, keep the backslash.
          # In raw mode it has already been added above.
          if not my.rawStringLiterals:
            add(my.a, '\\')
          inc(pos)
      of '\c':
        pos = lexbase.handleCR(my, pos)
        buf = my.buf # handleCR may have refilled the buffer
        add(my.a, '\c')
      of '\L':
        pos = lexbase.handleLF(my, pos)
        buf = my.buf # handleLF may have refilled the buffer
        add(my.a, '\L')
      else:
        add(my.a, buf[pos])
        inc(pos)
    my.bufpos = pos # store back
  proc skip(my: var JsonParser) =
    ## Advances `my.bufpos` past whitespace, ``//`` line comments and
    ## ``/* */`` block comments. An unterminated block comment sets `my.err`
    ## to ``errEOC_Expected``.
    var cur = my.bufpos
    var data = my.buf
    while true:
      case data[cur]
      of '/':
        case data[cur+1]
        of '/':
          # line comment: runs up to and including the line break
          inc(cur, 2)
          while true:
            case data[cur]
            of '\0':
              break
            of '\c':
              cur = lexbase.handleCR(my, cur)
              data = my.buf
              break
            of '\L':
              cur = lexbase.handleLF(my, cur)
              data = my.buf
              break
            else:
              inc(cur)
        of '*':
          # long comment: runs up to the closing ``*/``
          inc(cur, 2)
          while true:
            case data[cur]
            of '\0':
              my.err = errEOC_Expected
              break
            of '\c':
              cur = lexbase.handleCR(my, cur)
              data = my.buf
            of '\L':
              cur = lexbase.handleLF(my, cur)
              data = my.buf
            of '*':
              inc(cur)
              if data[cur] == '/':
                inc(cur)
                break
            else:
              inc(cur)
        else:
          # a lone '/' is not a comment; leave it for the tokenizer
          break
      of ' ', '\t':
        inc(cur)
      of '\c':
        cur = lexbase.handleCR(my, cur)
        data = my.buf
      of '\L':
        cur = lexbase.handleLF(my, cur)
        data = my.buf
      else:
        break
    my.bufpos = cur
  proc parseNumber(my: var JsonParser) =
    ## Appends the number literal starting at `my.bufpos` to `my.a` and moves
    ## `my.bufpos` behind it. A literal that starts with ``.`` (after an
    ## optional ``-``) is stored with a leading ``0``.
    var pos = my.bufpos
    var buf = my.buf

    template take() =
      # copy the current character and step over it
      add(my.a, buf[pos])
      inc(pos)

    template takeDigits() =
      while buf[pos] in Digits:
        take()

    if buf[pos] == '-':
      take()
    if buf[pos] == '.':
      add(my.a, "0.")
      inc(pos)
    else:
      takeDigits()
      if buf[pos] == '.':
        take()
    # digits after the dot:
    takeDigits()
    if buf[pos] in {'E', 'e'}:
      take()
      if buf[pos] in {'+', '-'}:
        take()
      takeDigits()
    my.bufpos = pos
  proc parseName(my: var JsonParser) =
    ## Appends the identifier starting at `my.bufpos` to `my.a` and moves
    ## `my.bufpos` behind it. Does nothing if no identifier starts there.
    var cur = my.bufpos
    if my.buf[cur] notin IdentStartChars:
      return
    while my.buf[cur] in IdentChars:
      add(my.a, my.buf[cur])
      inc(cur)
    my.bufpos = cur
  proc getTok*(my: var JsonParser): TokKind =
    ## Reads the next token. `my.a` is reset and then receives the token's
    ## text (for numbers, strings and names); the token kind is returned and
    ## also stored in `my.tok`.
    setLen(my.a, 0)
    skip(my) # skip whitespace, comments
    let ch = my.buf[my.bufpos]
    case ch
    of '-', '.', '0'..'9':
      parseNumber(my)
      # a dot or an exponent makes the literal a float
      result = if {'.', 'e', 'E'} in my.a: tkFloat else: tkInt
    of '"':
      result = parseString(my)
    of '[', ']', '{', '}', ',', ':':
      inc(my.bufpos)
      result =
        case ch
        of '[': tkBracketLe
        of ']': tkBracketRi
        of '{': tkCurlyLe
        of '}': tkCurlyRi
        of ',': tkComma
        else: tkColon
    of '\0':
      result = tkEof
    of 'a'..'z', 'A'..'Z', '_':
      parseName(my)
      result =
        case my.a
        of "null": tkNull
        of "true": tkTrue
        of "false": tkFalse
        else: tkError
    else:
      inc(my.bufpos)
      result = tkError
    my.tok = result
  proc next*(my: var JsonParser) =
    ## Retrieves the first/next event. This controls the parser: the result is
    ## available afterwards through ``kind``, and ``errorMsg`` describes the
    ## problem when the event is ``jsonError``.
    let tk = getTok(my)
    let top = my.state.len - 1

    template fail(e: JsonError) =
      my.kind = jsonError
      my.err = e

    template emitScalar() =
      # the scalar token kinds share their ordinals with the event kinds
      my.kind = JsonEventKind(ord(tk))

    template enterArray() =
      my.state.add(stateArray)
      my.kind = jsonArrayStart

    template enterObject() =
      my.state.add(stateObject)
      my.kind = jsonObjectStart

    template drop(count: int) =
      # remove `count` entries from the top of the state stack
      my.state.setLen(my.state.len - count)

    # The top of the state stack decides which tokens are acceptable.
    case my.state[top]
    of stateEof:
      if tk == tkEof:
        my.kind = jsonEof
      else:
        fail(errEofExpected)
    of stateStart:
      case tk
      of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
        my.state[top] = stateEof # a lone scalar must be followed by EOF
        emitScalar()
      of tkBracketLe:
        enterArray()
      of tkCurlyLe:
        enterObject()
      of tkEof:
        my.kind = jsonEof
      else:
        fail(errEofExpected)
    of stateObject:
      case tk
      of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
        my.state.add(stateExpectColon)
        emitScalar()
      of tkBracketLe:
        my.state.add(stateExpectColon)
        enterArray()
      of tkCurlyLe:
        my.state.add(stateExpectColon)
        enterObject()
      of tkCurlyRi:
        my.kind = jsonObjectEnd
        drop(1)
      else:
        fail(errCurlyRiExpected)
    of stateArray:
      case tk
      of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
        my.state.add(stateExpectArrayComma)
        emitScalar()
      of tkBracketLe:
        my.state.add(stateExpectArrayComma)
        enterArray()
      of tkCurlyLe:
        my.state.add(stateExpectArrayComma)
        enterObject()
      of tkBracketRi:
        my.kind = jsonArrayEnd
        drop(1)
      else:
        fail(errBracketRiExpected)
    of stateExpectArrayComma:
      case tk
      of tkComma:
        # back to stateArray; the comma itself produces no event
        drop(1)
        next(my)
      of tkBracketRi:
        my.kind = jsonArrayEnd
        drop(2) # stateExpectArrayComma and stateArray
      else:
        fail(errBracketRiExpected)
    of stateExpectObjectComma:
      case tk
      of tkComma:
        # back to stateObject; the comma itself produces no event
        drop(1)
        next(my)
      of tkCurlyRi:
        my.kind = jsonObjectEnd
        drop(2) # stateExpectObjectComma and stateObject
      else:
        fail(errCurlyRiExpected)
    of stateExpectColon:
      if tk == tkColon:
        my.state[top] = stateExpectValue
        next(my)
      else:
        fail(errColonExpected)
    of stateExpectValue:
      case tk
      of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
        my.state[top] = stateExpectObjectComma
        emitScalar()
      of tkBracketLe:
        my.state[top] = stateExpectObjectComma
        enterArray()
      of tkCurlyLe:
        my.state[top] = stateExpectObjectComma
        enterObject()
      else:
        fail(errExprExpected)
  proc raiseParseErr*(p: JsonParser, msg: string) {.noinline, noreturn.} =
    ## Raises a `JsonParsingError` whose message says that `msg` was expected
    ## at the parser's current position.
    let text = p.errorMsgExpected(msg)
    raise newException(JsonParsingError, text)
  proc eat*(p: var JsonParser, tok: TokKind) =
    ## Consumes the current token if it is `tok` and reads the next one;
    ## otherwise raises a `JsonParsingError` naming the expected token.
    if p.tok != tok:
      raiseParseErr(p, tokToStr[tok])
    discard p.getTok()