parsejson.nim 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2018 Nim contributors
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a json parser. It is used
  10. ## and exported by the `json` standard library
  11. ## module, but can also be used in its own right.
  12. import strutils, lexbase, streams, unicode
  13. import std/private/decode_helpers
type
  JsonEventKind* = enum ## enumeration of all events that may occur when parsing
    jsonError,          ## an error occurred during parsing
    jsonEof,            ## end of file reached
    jsonString,         ## a string literal
    jsonInt,            ## an integer literal
    jsonFloat,          ## a float literal
    jsonTrue,           ## the value `true`
    jsonFalse,          ## the value `false`
    jsonNull,           ## the value `null`
    jsonObjectStart,    ## start of an object: the `{` token
    jsonObjectEnd,      ## end of an object: the `}` token
    jsonArrayStart,     ## start of an array: the `[` token
    jsonArrayEnd        ## end of an array: the `]` token

  # Must be synchronized with JsonEventKind: `next` converts scalar tokens
  # with `JsonEventKind(ord(tk))`, so `tkError` .. `tkNull` have to keep the
  # same ordinal positions as `jsonError` .. `jsonNull`.
  TokKind* = enum
    tkError,
    tkEof,
    tkString,
    tkInt,
    tkFloat,
    tkTrue,
    tkFalse,
    tkNull,
    tkCurlyLe,
    tkCurlyRi,
    tkBracketLe,
    tkBracketRi,
    tkColon,
    tkComma

  JsonError* = enum        ## enumeration that lists all errors that can occur
    errNone,               ## no error
    errInvalidToken,       ## invalid token
    errStringExpected,     ## string expected
    errColonExpected,      ## `:` expected
    errCommaExpected,      ## `,` expected
    errBracketRiExpected,  ## `]` expected
    errCurlyRiExpected,    ## `}` expected
    errQuoteExpected,      ## `"` or `'` expected
    errEOC_Expected,       ## `*/` expected
    errEofExpected,        ## EOF expected
    errExprExpected        ## expr expected

  ParserState = enum ## entries of the state stack that `next` operates on
    # stateEof: a top-level scalar was read, only EOF may follow
    # stateStart: a top-level value is expected
    # stateObject / stateArray: inside `{ ... }` / `[ ... ]`
    # stateExpectArrayComma / stateExpectObjectComma: a member was read
    # stateExpectColon / stateExpectValue: between an object key and its value
    stateEof, stateStart, stateObject, stateArray, stateExpectArrayComma,
    stateExpectObjectComma, stateExpectColon, stateExpectValue

  JsonParser* = object of BaseLexer ## the parser object.
    a*: string               ## text of the current token; cleared by `getTok`
    tok*: TokKind            ## token returned by the most recent `getTok`
    kind: JsonEventKind      ## event produced by the most recent `next`
    err: JsonError           ## error code belonging to a `jsonError` event
    state: seq[ParserState]  ## state stack; the last entry is the active one
    filename: string         ## only used when formatting error messages
    rawStringLiterals: bool  ## keep quotes and escapes of string literals

  JsonKindError* = object of ValueError ## raised by the `to` macro if the
                                        ## JSON kind is incorrect.
  JsonParsingError* = object of ValueError ## is raised for a JSON error
const
  # Human-readable text for every `JsonError`; the entries are listed in the
  # declaration order of the enum. Used by `errorMsg`.
  errorMessages*: array[JsonError, string] = [
    "no error",
    "invalid token",
    "string expected",
    "':' expected",
    "',' expected",
    "']' expected",
    "'}' expected",
    "'\"' or \"'\" expected",
    "'*/' expected",
    "EOF expected",
    "expression expected"
  ]
  # Display name for every `TokKind`, in declaration order of the enum.
  # `eat` uses it to name the token it expected.
  tokToStr: array[TokKind, string] = [
    "invalid token",
    "EOF",
    "string literal",
    "int literal",
    "float literal",
    "true",
    "false",
    "null",
    "{", "}", "[", "]", ":", ","
  ]
  94. proc open*(my: var JsonParser, input: Stream, filename: string;
  95. rawStringLiterals = false) =
  96. ## initializes the parser with an input stream. `Filename` is only used
  97. ## for nice error messages. If `rawStringLiterals` is true, string literals
  98. ## are kept with their surrounding quotes and escape sequences in them are
  99. ## left untouched too.
  100. lexbase.open(my, input)
  101. my.filename = filename
  102. my.state = @[stateStart]
  103. my.kind = jsonError
  104. my.a = ""
  105. my.rawStringLiterals = rawStringLiterals
proc close*(my: var JsonParser) {.inline.} =
  ## closes the parser `my` and its associated input stream.
  # The call is module-qualified so that it resolves to the base lexer's
  # `close` rather than to this proc itself.
  lexbase.close(my)
  109. proc str*(my: JsonParser): string {.inline.} =
  110. ## returns the character data for the events: `jsonInt`, `jsonFloat`,
  111. ## `jsonString`
  112. assert(my.kind in {jsonInt, jsonFloat, jsonString})
  113. return my.a
  114. proc getInt*(my: JsonParser): BiggestInt {.inline.} =
  115. ## returns the number for the event: `jsonInt`
  116. assert(my.kind == jsonInt)
  117. return parseBiggestInt(my.a)
  118. proc getFloat*(my: JsonParser): float {.inline.} =
  119. ## returns the number for the event: `jsonFloat`
  120. assert(my.kind == jsonFloat)
  121. return parseFloat(my.a)
  122. proc kind*(my: JsonParser): JsonEventKind {.inline.} =
  123. ## returns the current event type for the JSON parser
  124. return my.kind
  125. proc getColumn*(my: JsonParser): int {.inline.} =
  126. ## get the current column the parser has arrived at.
  127. result = getColNumber(my, my.bufpos)
  128. proc getLine*(my: JsonParser): int {.inline.} =
  129. ## get the current line the parser has arrived at.
  130. result = my.lineNumber
  131. proc getFilename*(my: JsonParser): string {.inline.} =
  132. ## get the filename of the file that the parser processes.
  133. result = my.filename
  134. proc errorMsg*(my: JsonParser): string =
  135. ## returns a helpful error message for the event `jsonError`
  136. assert(my.kind == jsonError)
  137. result = "$1($2, $3) Error: $4" % [
  138. my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
  139. proc errorMsgExpected*(my: JsonParser, e: string): string =
  140. ## returns an error message "`e` expected" in the same format as the
  141. ## other error messages
  142. result = "$1($2, $3) Error: $4" % [
  143. my.filename, $getLine(my), $getColumn(my), e & " expected"]
  144. proc parseEscapedUTF16*(buf: cstring, pos: var int): int =
  145. result = 0
  146. #UTF-16 escape is always 4 bytes.
  147. for _ in 0..3:
  148. # if char in '0' .. '9', 'a' .. 'f', 'A' .. 'F'
  149. if handleHexChar(buf[pos], result):
  150. inc(pos)
  151. else:
  152. return -1
  153. proc parseString(my: var JsonParser): TokKind =
  154. result = tkString
  155. var pos = my.bufpos + 1
  156. if my.rawStringLiterals:
  157. add(my.a, '"')
  158. while true:
  159. case my.buf[pos]
  160. of '\0':
  161. my.err = errQuoteExpected
  162. result = tkError
  163. break
  164. of '"':
  165. if my.rawStringLiterals:
  166. add(my.a, '"')
  167. inc(pos)
  168. break
  169. of '\\':
  170. if my.rawStringLiterals:
  171. add(my.a, '\\')
  172. case my.buf[pos+1]
  173. of '\\', '"', '\'', '/':
  174. add(my.a, my.buf[pos+1])
  175. inc(pos, 2)
  176. of 'b':
  177. add(my.a, '\b')
  178. inc(pos, 2)
  179. of 'f':
  180. add(my.a, '\f')
  181. inc(pos, 2)
  182. of 'n':
  183. add(my.a, '\L')
  184. inc(pos, 2)
  185. of 'r':
  186. add(my.a, '\C')
  187. inc(pos, 2)
  188. of 't':
  189. add(my.a, '\t')
  190. inc(pos, 2)
  191. of 'v':
  192. add(my.a, '\v')
  193. inc(pos, 2)
  194. of 'u':
  195. if my.rawStringLiterals:
  196. add(my.a, 'u')
  197. inc(pos, 2)
  198. var pos2 = pos
  199. var r = parseEscapedUTF16(my.buf, pos)
  200. if r < 0:
  201. my.err = errInvalidToken
  202. break
  203. # Deal with surrogates
  204. if (r and 0xfc00) == 0xd800:
  205. if my.buf[pos] != '\\' or my.buf[pos+1] != 'u':
  206. my.err = errInvalidToken
  207. break
  208. inc(pos, 2)
  209. var s = parseEscapedUTF16(my.buf, pos)
  210. if (s and 0xfc00) == 0xdc00 and s > 0:
  211. r = 0x10000 + (((r - 0xd800) shl 10) or (s - 0xdc00))
  212. else:
  213. my.err = errInvalidToken
  214. break
  215. if my.rawStringLiterals:
  216. let length = pos - pos2
  217. for i in 1 .. length:
  218. if my.buf[pos2] in {'0'..'9', 'A'..'F', 'a'..'f'}:
  219. add(my.a, my.buf[pos2])
  220. inc pos2
  221. else:
  222. break
  223. else:
  224. add(my.a, toUTF8(Rune(r)))
  225. else:
  226. # don't bother with the error
  227. add(my.a, my.buf[pos])
  228. inc(pos)
  229. of '\c':
  230. pos = lexbase.handleCR(my, pos)
  231. add(my.a, '\c')
  232. of '\L':
  233. pos = lexbase.handleLF(my, pos)
  234. add(my.a, '\L')
  235. else:
  236. add(my.a, my.buf[pos])
  237. inc(pos)
  238. my.bufpos = pos # store back
  239. proc skip(my: var JsonParser) =
  240. var pos = my.bufpos
  241. while true:
  242. case my.buf[pos]
  243. of '/':
  244. if my.buf[pos+1] == '/':
  245. # skip line comment:
  246. inc(pos, 2)
  247. while true:
  248. case my.buf[pos]
  249. of '\0':
  250. break
  251. of '\c':
  252. pos = lexbase.handleCR(my, pos)
  253. break
  254. of '\L':
  255. pos = lexbase.handleLF(my, pos)
  256. break
  257. else:
  258. inc(pos)
  259. elif my.buf[pos+1] == '*':
  260. # skip long comment:
  261. inc(pos, 2)
  262. while true:
  263. case my.buf[pos]
  264. of '\0':
  265. my.err = errEOC_Expected
  266. break
  267. of '\c':
  268. pos = lexbase.handleCR(my, pos)
  269. of '\L':
  270. pos = lexbase.handleLF(my, pos)
  271. of '*':
  272. inc(pos)
  273. if my.buf[pos] == '/':
  274. inc(pos)
  275. break
  276. else:
  277. inc(pos)
  278. else:
  279. break
  280. of ' ', '\t':
  281. inc(pos)
  282. of '\c':
  283. pos = lexbase.handleCR(my, pos)
  284. of '\L':
  285. pos = lexbase.handleLF(my, pos)
  286. else:
  287. break
  288. my.bufpos = pos
  289. proc parseNumber(my: var JsonParser) =
  290. var pos = my.bufpos
  291. if my.buf[pos] == '-':
  292. add(my.a, '-')
  293. inc(pos)
  294. if my.buf[pos] == '.':
  295. add(my.a, "0.")
  296. inc(pos)
  297. else:
  298. while my.buf[pos] in Digits:
  299. add(my.a, my.buf[pos])
  300. inc(pos)
  301. if my.buf[pos] == '.':
  302. add(my.a, '.')
  303. inc(pos)
  304. # digits after the dot:
  305. while my.buf[pos] in Digits:
  306. add(my.a, my.buf[pos])
  307. inc(pos)
  308. if my.buf[pos] in {'E', 'e'}:
  309. add(my.a, my.buf[pos])
  310. inc(pos)
  311. if my.buf[pos] in {'+', '-'}:
  312. add(my.a, my.buf[pos])
  313. inc(pos)
  314. while my.buf[pos] in Digits:
  315. add(my.a, my.buf[pos])
  316. inc(pos)
  317. my.bufpos = pos
  318. proc parseName(my: var JsonParser) =
  319. var pos = my.bufpos
  320. if my.buf[pos] in IdentStartChars:
  321. while my.buf[pos] in IdentChars:
  322. add(my.a, my.buf[pos])
  323. inc(pos)
  324. my.bufpos = pos
  325. proc getTok*(my: var JsonParser): TokKind =
  326. setLen(my.a, 0)
  327. skip(my) # skip whitespace, comments
  328. case my.buf[my.bufpos]
  329. of '-', '.', '0'..'9':
  330. parseNumber(my)
  331. if {'.', 'e', 'E'} in my.a:
  332. result = tkFloat
  333. else:
  334. result = tkInt
  335. of '"':
  336. result = parseString(my)
  337. of '[':
  338. inc(my.bufpos)
  339. result = tkBracketLe
  340. of '{':
  341. inc(my.bufpos)
  342. result = tkCurlyLe
  343. of ']':
  344. inc(my.bufpos)
  345. result = tkBracketRi
  346. of '}':
  347. inc(my.bufpos)
  348. result = tkCurlyRi
  349. of ',':
  350. inc(my.bufpos)
  351. result = tkComma
  352. of ':':
  353. inc(my.bufpos)
  354. result = tkColon
  355. of '\0':
  356. result = tkEof
  357. of 'a'..'z', 'A'..'Z', '_':
  358. parseName(my)
  359. case my.a
  360. of "null": result = tkNull
  361. of "true": result = tkTrue
  362. of "false": result = tkFalse
  363. else: result = tkError
  364. else:
  365. inc(my.bufpos)
  366. result = tkError
  367. my.tok = result
proc next*(my: var JsonParser) =
  ## retrieves the first/next event. This controls the parser.
  ##
  ## One token is read with `getTok` and interpreted according to the state
  ## on top of `my.state`. The resulting event is stored in `my.kind`; for a
  ## `jsonError` event the error code is stored in `my.err`. `:` and `,`
  ## produce no event of their own: after them `next` calls itself to
  ## deliver the event that follows.
  var tk = getTok(my)
  var i = my.state.len-1 # index of the active (topmost) state
  # the following code is a state machine. If we had proper coroutines,
  # the code could be much simpler.
  case my.state[i]
  of stateEof:
    if tk == tkEof:
      my.kind = jsonEof
    else:
      my.kind = jsonError
      my.err = errEofExpected
  of stateStart:
    # tokens allowed?
    case tk
    of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
      my.state[i] = stateEof # expect EOF next!
      # scalar tokens share their ordinal with the matching event kind
      my.kind = JsonEventKind(ord(tk))
    of tkBracketLe:
      # stateStart stays on the stack below the array, so once the array is
      # closed another top-level value is accepted
      my.state.add(stateArray)
      my.kind = jsonArrayStart
    of tkCurlyLe:
      my.state.add(stateObject) # stateStart stays below, as for arrays
      my.kind = jsonObjectStart
    of tkEof:
      my.kind = jsonEof
    else:
      my.kind = jsonError
      my.err = errEofExpected
  of stateObject:
    # key position of an object member (or the closing `}`); note that any
    # scalar or container token is accepted here, not only strings
    case tk
    of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
      my.state.add(stateExpectColon)
      my.kind = JsonEventKind(ord(tk))
    of tkBracketLe:
      my.state.add(stateExpectColon)
      my.state.add(stateArray)
      my.kind = jsonArrayStart
    of tkCurlyLe:
      my.state.add(stateExpectColon)
      my.state.add(stateObject)
      my.kind = jsonObjectStart
    of tkCurlyRi:
      my.kind = jsonObjectEnd
      discard my.state.pop()
    else:
      my.kind = jsonError
      my.err = errCurlyRiExpected
  of stateArray:
    # element position of an array (or the closing `]`)
    case tk
    of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
      my.state.add(stateExpectArrayComma) # expect `,` or `]` next
      my.kind = JsonEventKind(ord(tk))
    of tkBracketLe:
      my.state.add(stateExpectArrayComma)
      my.state.add(stateArray)
      my.kind = jsonArrayStart
    of tkCurlyLe:
      my.state.add(stateExpectArrayComma)
      my.state.add(stateObject)
      my.kind = jsonObjectStart
    of tkBracketRi:
      my.kind = jsonArrayEnd
      discard my.state.pop()
    else:
      my.kind = jsonError
      my.err = errBracketRiExpected
  of stateExpectArrayComma:
    case tk
    of tkComma:
      # back to stateArray; the recursive call delivers the next element
      # (and, since stateArray accepts `]`, also a `]` right after the comma)
      discard my.state.pop()
      next(my)
    of tkBracketRi:
      my.kind = jsonArrayEnd
      discard my.state.pop() # pop stateExpectArrayComma
      discard my.state.pop() # pop stateArray
    else:
      my.kind = jsonError
      my.err = errBracketRiExpected
  of stateExpectObjectComma:
    case tk
    of tkComma:
      # back to stateObject; the recursive call delivers the next key
      # (and, since stateObject accepts `}`, also a `}` right after the comma)
      discard my.state.pop()
      next(my)
    of tkCurlyRi:
      my.kind = jsonObjectEnd
      discard my.state.pop() # pop stateExpectObjectComma
      discard my.state.pop() # pop stateObject
    else:
      my.kind = jsonError
      my.err = errCurlyRiExpected
  of stateExpectColon:
    case tk
    of tkColon:
      # the colon itself yields no event: go on to the member's value
      my.state[i] = stateExpectValue
      next(my)
    else:
      my.kind = jsonError
      my.err = errColonExpected
  of stateExpectValue:
    # value position of an object member; the state slot is reused to wait
    # for `,` or `}` once the value is complete
    case tk
    of tkString, tkInt, tkFloat, tkTrue, tkFalse, tkNull:
      my.state[i] = stateExpectObjectComma
      my.kind = JsonEventKind(ord(tk))
    of tkBracketLe:
      my.state[i] = stateExpectObjectComma
      my.state.add(stateArray)
      my.kind = jsonArrayStart
    of tkCurlyLe:
      my.state[i] = stateExpectObjectComma
      my.state.add(stateObject)
      my.kind = jsonObjectStart
    else:
      my.kind = jsonError
      my.err = errExprExpected
  484. proc raiseParseErr*(p: JsonParser, msg: string) {.noinline, noreturn.} =
  485. ## raises an `EJsonParsingError` exception.
  486. raise newException(JsonParsingError, errorMsgExpected(p, msg))
  487. proc eat*(p: var JsonParser, tok: TokKind) =
  488. if p.tok == tok: discard getTok(p)
  489. else: raiseParseErr(p, tokToStr[tok])