parsexml.nim 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2010 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
  10. ## parser.
  11. ## The only encoding that is supported is UTF-8. The parser has been designed
  12. ## to be somewhat error correcting, so that even most "wild HTML" found on the
  13. ## web can be parsed with it. **Note:** This parser does not check that each
  14. ## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
  15. ## implemented by the client code for various reasons:
  16. ##
  17. ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
  18. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
  19. ## library can parse both, only the client knows which comparison is to be
  20. ## used.
  21. ## * Thus the checks would have been very difficult to implement properly with
  22. ## little benefit, especially since they are simple to implement in the
  23. ## client. The client should use the `errorMsgExpected` proc to generate
  24. ## a nice error message that fits the other error messages this library
  25. ## creates.
  26. ##
  27. ##
  28. ## Example 1: Retrieve HTML title
  29. ## ==============================
  30. ##
  31. ## The file ``examples/htmltitle.nim`` demonstrates how to use the
  32. ## XML parser to accomplish a simple task: To determine the title of an HTML
  33. ## document.
  34. ##
  35. ## .. code-block:: nim
  36. ## :file: ../../examples/htmltitle.nim
  37. ##
  38. ##
  39. ## Example 2: Retrieve all HTML links
  40. ## ==================================
  41. ##
  42. ## The file ``examples/htmlrefs.nim`` demonstrates how to use the
  43. ## XML parser to accomplish another simple task: To determine all the links
  44. ## an HTML document contains.
  45. ##
  46. ## .. code-block:: nim
  47. ## :file: ../../examples/htmlrefs.nim
  48. ##
  49. import
  50. hashes, strutils, lexbase, streams, unicode
  51. # the parser treats ``<br />`` as ``<br></br>``
  52. # xmlElementCloseEnd, ## ``/>``
  type
    XmlEventKind* = enum ## enumeration of all events that may occur when
                         ## parsing
      xmlError,          ## an error occurred during parsing
      xmlEof,            ## end of file reached
      xmlCharData,       ## character data
      xmlWhitespace,     ## whitespace has been parsed
      xmlComment,        ## a comment has been parsed
      xmlPI,             ## processing instruction (``<?name something ?>``)
      xmlElementStart,   ## ``<elem>``
      xmlElementEnd,     ## ``</elem>``
      xmlElementOpen,    ## ``<elem``
      xmlAttribute,      ## ``key = "value"`` pair
      xmlElementClose,   ## ``>``
      xmlCData,          ## ``<![CDATA[`` ... data ... ``]]>``
      xmlEntity,         ## ``&entity;``
      xmlSpecial         ## ``<! ... data ... >``

    XmlErrorKind* = enum        ## enumeration that lists all errors that can
                                ## occur
      errNone,                  ## no error
      errEndOfCDataExpected,    ## ``]]>`` expected
      errNameExpected,          ## name expected
      errSemicolonExpected,     ## ``;`` expected
      errQmGtExpected,          ## ``?>`` expected
      errGtExpected,            ## ``>`` expected
      errEqExpected,            ## ``=`` expected
      errQuoteExpected,         ## ``"`` or ``'`` expected
      errEndOfCommentExpected   ## ``-->`` expected

    ParserState = enum
      # internal state machine driven by ``next``
      stateStart,
      stateNormal,
      stateAttr,
      stateEmptyElementTag,
      stateError

    XmlParseOption* = enum  ## options for the XML parser
      reportWhitespace,     ## report whitespace
      reportComments        ## report comments

    XmlParser* = object of BaseLexer ## the parser object.
      a: string             # primary buffer: names and character data
      b: string             # secondary buffer: attribute value / rest of a PI
      c: string             # element name saved by parseTag and restored by
                            # ``next`` for the synthesized end tag
      kind: XmlEventKind    # kind of the current event
      err: XmlErrorKind     # error recorded by markError
      state: ParserState
      filename: string      # only used when formatting error messages
      options: set[XmlParseOption]

  {.deprecated: [TXmlParser: XmlParser, TXmlParseOptions: XmlParseOption,
      TXmlError: XmlErrorKind, TXmlEventKind: XmlEventKind].}
  const
    # Human readable text for every XmlErrorKind; looked up by ``errorMsg``.
    errorMessages: array[XmlErrorKind, string] = [
      errNone: "no error",
      errEndOfCDataExpected: "']]>' expected",
      errNameExpected: "name expected",
      errSemicolonExpected: "';' expected",
      errQmGtExpected: "'?>' expected",
      errGtExpected: "'>' expected",
      errEqExpected: "'=' expected",
      errQuoteExpected: "'\"' or \"'\" expected",
      errEndOfCommentExpected: "'-->' expected"
    ]
  proc open*(my: var XmlParser, input: Stream, filename: string,
             options: set[XmlParseOption] = {}) =
    ## Initializes the parser `my` so that it reads from the stream `input`.
    ##
    ## `filename` is only used to produce nice error messages. `options`
    ## controls what is reported: with ``reportWhitespace`` a whitespace token
    ## is delivered as an ``xmlWhitespace`` event, with ``reportComments`` a
    ## comment token is delivered as an ``xmlComment`` event.
    # CR, LF and '/' are registered as the lexer's refill characters, which is
    # why the parse procs route those three characters through lexbase.
    lexbase.open(my, input, 8192, {'\c', '\L', '/'})
    my.options = options
    my.filename = filename
    my.kind = xmlError
    my.state = stateStart
    my.a = ""
    my.b = ""
    my.c = nil
  proc close*(my: var XmlParser) {.inline.} =
    ## Closes the parser `my` together with the input stream it reads from.
    lexbase.close(my)
  proc kind*(my: XmlParser): XmlEventKind {.inline.} =
    ## Returns the type of the event the XML parser is currently positioned on.
    result = my.kind
  template charData*(my: XmlParser): string =
    ## Returns the character data of the current event. Valid for the events
    ## ``xmlCharData``, ``xmlWhitespace``, ``xmlComment``, ``xmlCData`` and
    ## ``xmlSpecial``.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is none of these. In
    ## release mode no error is triggered, but the returned value is not valid.
    assert my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                       xmlSpecial}
    my.a
  template elementName*(my: XmlParser): string =
    ## Returns the element name of the current event. Valid for the events
    ## ``xmlElementStart``, ``xmlElementEnd`` and ``xmlElementOpen``.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is none of these. In
    ## release mode no error is triggered, but the returned value is not valid.
    assert my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen}
    my.a
  template entityName*(my: XmlParser): string =
    ## Returns the entity name of the current ``xmlEntity`` event.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is not ``xmlEntity``.
    ## In release mode no error is triggered, but the returned value is not
    ## valid.
    assert my.kind == xmlEntity
    my.a
  template attrKey*(my: XmlParser): string =
    ## Returns the attribute key of the current ``xmlAttribute`` event.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is not
    ## ``xmlAttribute``. In release mode no error is triggered, but the
    ## returned value is not valid.
    assert my.kind == xmlAttribute
    my.a
  template attrValue*(my: XmlParser): string =
    ## Returns the attribute value of the current ``xmlAttribute`` event.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is not
    ## ``xmlAttribute``. In release mode no error is triggered, but the
    ## returned value is not valid.
    assert my.kind == xmlAttribute
    my.b
  template piName*(my: XmlParser): string =
    ## Returns the processing instruction name of the current ``xmlPI`` event.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is not ``xmlPI``. In
    ## release mode no error is triggered, but the returned value is not valid.
    assert my.kind == xmlPI
    my.a
  template piRest*(my: XmlParser): string =
    ## Returns the rest of the processing instruction (everything after its
    ## name) of the current ``xmlPI`` event.
    ##
    ## In debug mode an assertion fires if ``my.kind`` is not ``xmlPI``. In
    ## release mode no error is triggered, but the returned value is not valid.
    assert my.kind == xmlPI
    my.b
  proc rawData*(my: XmlParser): string {.inline.} =
    ## Returns the parser's underlying 'data' string by reference (a shallow
    ## copy, no event kind check). This is only used for speed hacks.
    result.shallowCopy(my.a)
  proc rawData2*(my: XmlParser): string {.inline.} =
    ## Returns the parser's underlying second 'data' string by reference (a
    ## shallow copy, no event kind check). This is only used for speed hacks.
    result.shallowCopy(my.b)
  proc getColumn*(my: XmlParser): int {.inline.} =
    ## Returns the column the parser has currently arrived at.
    getColNumber(my, my.bufpos)
  proc getLine*(my: XmlParser): int {.inline.} =
    ## Returns the line the parser has currently arrived at.
    my.lineNumber
  proc getFilename*(my: XmlParser): string {.inline.} =
    ## Returns the name of the file the parser is processing, i.e. the
    ## `filename` that was passed to ``open``.
    my.filename
  proc errorMsg*(my: XmlParser): string =
    ## Returns a helpful error message for the event ``xmlError``, formatted
    ## as ``filename(line, column) Error: message``.
    assert my.kind == xmlError
    let
      line = $getLine(my)
      column = $getColumn(my)
    result = "$1($2, $3) Error: $4" % [
      my.filename, line, column, errorMessages[my.err]]
  proc errorMsgExpected*(my: XmlParser, tag: string): string =
    ## Returns the error message "<tag> expected", positioned and formatted
    ## like the other error messages of this module.
    let
      line = $getLine(my)
      column = $getColumn(my)
      expected = "<$1> expected" % tag
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, expected]
  proc errorMsg*(my: XmlParser, msg: string): string =
    ## Returns an error message carrying the text `msg`, positioned and
    ## formatted like the other error messages of this module.
    let
      line = $getLine(my)
      column = $getColumn(my)
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, msg]
  proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
    # Records `kind` as the pending error. Switching to stateError makes the
    # following call of ``next`` deliver an ``xmlError`` event.
    my.state = stateError
    my.err = kind
  proc parseCDATA(my: var XmlParser) =
    # Parses a ``<![CDATA[ ... ]]>`` section; my.bufpos must point at its '<'.
    # The content is appended to my.a with CR / CR-LF / LF stored as LF.
    # A missing ``]]>`` before the end of input is marked as
    # errEndOfCDataExpected.
    var
      cur = my.bufpos + len("<![CDATA[")
      data = my.buf
    while true:
      let ch = data[cur]
      case ch
      of ']':
        if data[cur+1] == ']' and data[cur+2] == '>':
          inc(cur, 3)               # consume the terminating "]]>"
          break
        my.a.add(']')
        inc(cur)
      of '\0':
        markError(my, errEndOfCDataExpected)
        break
      of '\c':
        cur = lexbase.handleCR(my, cur)
        data = my.buf               # re-read the buffer after the lexbase call
        my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        data = my.buf
        my.a.add('\L')
      of '/':
        # '/' is one of the refill characters registered in ``open``
        cur = lexbase.handleRefillChar(my, cur)
        data = my.buf
        my.a.add('/')
      else:
        my.a.add(ch)
        inc(cur)
    my.bufpos = cur                 # store back
    my.kind = xmlCData
  proc parseComment(my: var XmlParser) =
    # Parses a ``<!-- ... -->`` comment; my.bufpos must point at its '<'.
    # The text is only collected in my.a when reportComments is set; otherwise
    # the comment is merely skipped. A missing ``-->`` before the end of input
    # is marked as errEndOfCommentExpected.
    let keep = reportComments in my.options
    var
      cur = my.bufpos + len("<!--")
      data = my.buf
    while true:
      let ch = data[cur]
      case ch
      of '-':
        if data[cur+1] == '-' and data[cur+2] == '>':
          inc(cur, 3)               # consume the terminating "-->"
          break
        if keep: my.a.add('-')
        inc(cur)
      of '\0':
        markError(my, errEndOfCommentExpected)
        break
      of '\c':
        cur = lexbase.handleCR(my, cur)
        data = my.buf               # re-read the buffer after the lexbase call
        if keep: my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        data = my.buf
        if keep: my.a.add('\L')
      of '/':
        # '/' is one of the refill characters registered in ``open``
        cur = lexbase.handleRefillChar(my, cur)
        data = my.buf
        if keep: my.a.add('/')
      else:
        if keep: my.a.add(ch)
        inc(cur)
    my.bufpos = cur
    my.kind = xmlComment
  proc parseWhitespace(my: var XmlParser, skip=false) =
    # Consumes a run of spaces, tabs and line ends starting at my.bufpos.
    # Unless `skip` is true the whitespace is appended to my.a, with every
    # line end stored as a single LF. ``my.kind`` is left untouched.
    let keep = not skip
    var
      cur = my.bufpos
      data = my.buf
    while true:
      let ch = data[cur]
      case ch
      of ' ', '\t':
        if keep: my.a.add(ch)
        inc(cur)
      of '\c':
        # the specification says that CR-LF, CR are to be transformed to LF
        cur = lexbase.handleCR(my, cur)
        data = my.buf               # re-read the buffer after the lexbase call
        if keep: my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        data = my.buf
        if keep: my.a.add('\L')
      else:
        break
    my.bufpos = cur
  const
    # characters that may start a name; every byte >= 128 is accepted
    NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
    # characters that may follow: the start set plus digits, '.' and '-'
    NameChar = NameStartChar + {'0'..'9', '.', '-'}
  proc parseName(my: var XmlParser, dest: var string) =
    # Appends the name starting at my.bufpos to `dest` and moves my.bufpos
    # behind it. If the current character cannot start a name, nothing is
    # consumed and errNameExpected is marked.
    let data = my.buf
    var cur = my.bufpos
    if data[cur] notin NameStartChar:
      markError(my, errNameExpected)
      return
    # the first character is already known to be valid
    dest.add(data[cur])
    inc(cur)
    while data[cur] in NameChar:
      dest.add(data[cur])
      inc(cur)
    my.bufpos = cur
  proc parseEntity(my: var XmlParser, dest: var string) =
    # Parses the reference starting at the '&' under my.bufpos.
    #
    # * ``&#N;`` / ``&#xH;`` and the five predefined entities (lt, gt, amp,
    #   apos, quot) are decoded and appended to `dest`; my.kind becomes
    #   xmlCharData.
    # * Any other name is appended to `dest` undecoded and reported as
    #   xmlEntity. If no name follows, a literal '&' is appended instead.
    #
    # A missing ';' is marked as errSemicolonExpected.
    # NOTE: this proc overwrites my.kind; callers that need another kind
    # have to restore it afterwards.
    let data = my.buf
    var cur = my.bufpos + 1           # skip the '&'
    my.kind = xmlCharData
    if data[cur] == '#':
      # numeric character reference
      var code: int
      inc(cur)
      if data[cur] == 'x':
        inc(cur)
        while true:
          let ch = data[cur]
          case ch
          of '0'..'9': code = (code shl 4) or (ord(ch) - ord('0'))
          of 'a'..'f': code = (code shl 4) or (ord(ch) - ord('a') + 10)
          of 'A'..'F': code = (code shl 4) or (ord(ch) - ord('A') + 10)
          else: break
          inc(cur)
      else:
        while data[cur] in {'0'..'9'}:
          code = code * 10 + (ord(data[cur]) - ord('0'))
          inc(cur)
      dest.add(toUTF8(Rune(code)))
    # For the predefined entities `cur` is advanced up to the ';', which is
    # then consumed by the common check at the end.
    elif data[cur] == 'l' and data[cur+1] == 't' and data[cur+2] == ';':
      dest.add('<')
      inc(cur, 2)
    elif data[cur] == 'g' and data[cur+1] == 't' and data[cur+2] == ';':
      dest.add('>')
      inc(cur, 2)
    elif data[cur] == 'a' and data[cur+1] == 'm' and data[cur+2] == 'p' and
        data[cur+3] == ';':
      dest.add('&')
      inc(cur, 3)
    elif data[cur] == 'a' and data[cur+1] == 'p' and data[cur+2] == 'o' and
        data[cur+3] == 's' and data[cur+4] == ';':
      dest.add('\'')
      inc(cur, 4)
    elif data[cur] == 'q' and data[cur+1] == 'u' and data[cur+2] == 'o' and
        data[cur+3] == 't' and data[cur+4] == ';':
      dest.add('"')
      inc(cur, 4)
    else:
      # unknown entity: hand the raw name to the client
      my.bufpos = cur
      parseName(my, dest)
      cur = my.bufpos
      if my.err == errNameExpected:
        dest.add('&')
      else:
        my.kind = xmlEntity
    if data[cur] == ';':
      inc(cur)
    else:
      markError(my, errSemicolonExpected)
    my.bufpos = cur
  proc parsePI(my: var XmlParser) =
    # Parses a processing instruction ``<?name rest ?>``; my.bufpos must
    # point at its '<'. The name is appended to my.a, everything between the
    # name and the closing ``?>`` ends up in my.b (line ends stored as LF).
    # A missing ``?>`` before the end of input is marked as errQmGtExpected.
    inc(my.bufpos, len("<?"))
    parseName(my, my.a)
    setLen(my.b, 0)
    var
      cur = my.bufpos
      data = my.buf
    while true:
      let ch = data[cur]
      case ch
      of '\0':
        markError(my, errQmGtExpected)
        break
      of '?':
        if data[cur+1] == '>':
          inc(cur, 2)               # consume the terminating "?>"
          break
        my.b.add('?')
        inc(cur)
      of '\c':
        # the specification says that CR-LF, CR are to be transformed to LF
        cur = lexbase.handleCR(my, cur)
        data = my.buf               # re-read the buffer after the lexbase call
        my.b.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        data = my.buf
        my.b.add('\L')
      of '/':
        # '/' is one of the refill characters registered in ``open``
        cur = lexbase.handleRefillChar(my, cur)
        data = my.buf
        my.b.add('/')
      else:
        my.b.add(ch)
        inc(cur)
    my.bufpos = cur
    my.kind = xmlPI
  proc parseSpecial(my: var XmlParser) =
    # Parses things that start with ``<!`` and are neither a comment nor a
    # CDATA section (e.g. ``<!DOCTYPE ...>``); my.bufpos must point at the
    # '<'. Everything up to the matching '>' is appended to my.a, which is
    # what ``charData`` returns for the resulting xmlSpecial event.
    # Nested '<' ... '>' pairs are tracked so that only the outermost '>'
    # ends the construct. Reaching the end of input first is marked as
    # errGtExpected.
    var pos = my.bufpos + 2           # skip "<!"
    var buf = my.buf
    var opentags = 0                  # nesting depth of inner '<'
    while true:
      case buf[pos]
      of '\0':
        markError(my, errGtExpected)
        break
      of '<':
        inc(opentags)
        inc(pos)
        add(my.a, '<')
      of '>':
        if opentags <= 0:
          inc(pos)
          break
        dec(opentags)
        inc(pos)
        add(my.a, '>')
      of '\c':
        pos = lexbase.handleCR(my, pos)
        buf = my.buf                  # re-read the buffer after the lexbase call
        add(my.a, '\L')
      of '\L':
        pos = lexbase.handleLF(my, pos)
        buf = my.buf
        add(my.a, '\L')
      of '/':
        # '/' is one of the refill characters registered in ``open``
        pos = lexbase.handleRefillChar(my, pos)
        buf = my.buf
        # The slash belongs to the special's data in my.a. It used to be
        # appended to my.b, which dropped it from the reported content.
        add(my.a, '/')
      else:
        add(my.a, buf[pos])
        inc(pos)
    my.bufpos = pos
    my.kind = xmlSpecial
  proc parseTag(my: var XmlParser) =
    # Parses a start tag; my.bufpos must point at its '<'.
    #
    # * no name after '<'          -> the '<' is reported as xmlCharData
    # * name followed by attribute -> xmlElementOpen, state becomes stateAttr
    # * ``<name>``                 -> xmlElementStart
    # * ``<name/>``                -> xmlElementStart, and stateEmptyElementTag
    #                                 so that ``next`` reports the end tag
    # Anything else after the name is marked as errGtExpected.
    inc(my.bufpos)
    parseName(my, my.a)
    if my.a.len == 0:
      # if we have no name, do not interpret the '<':
      my.kind = xmlCharData
      my.a.add('<')
      return
    parseWhitespace(my, skip=true)
    if my.buf[my.bufpos] in NameStartChar:
      # an attribute follows:
      my.kind = xmlElementOpen
      my.state = stateAttr
      my.c = my.a                   # save for later
      return
    my.kind = xmlElementStart
    if my.buf[my.bufpos] == '/':
      # '/' is one of the refill characters registered in ``open``
      my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
      if my.buf[my.bufpos] == '>':
        inc(my.bufpos)
        my.state = stateEmptyElementTag
        my.c = nil                  # my.a still holds the element name
        return
    if my.buf[my.bufpos] == '>':
      inc(my.bufpos)
    else:
      markError(my, errGtExpected)
  proc parseEndTag(my: var XmlParser) =
    # Parses an end tag ``</name>``; my.bufpos must point at its '<'.
    # The name is appended to my.a and the event becomes xmlElementEnd.
    # A missing '>' is marked as errGtExpected.
    # Step over '<' and consume the '/' through lexbase, because '/' is one
    # of the refill characters registered in ``open``.
    my.bufpos = lexbase.handleRefillChar(my, my.bufpos + 1)
    parseName(my, my.a)
    parseWhitespace(my, skip=true)
    if my.buf[my.bufpos] != '>':
      markError(my, errGtExpected)
    else:
      inc(my.bufpos)
    my.kind = xmlElementEnd
  proc parseAttribute(my: var XmlParser) =
    # Parses one ``key = "value"`` pair: the key goes to my.a, the value to
    # my.b. Inside the value, entities are decoded via parseEntity and every
    # run of spaces, tabs and line ends becomes one space; such a run right
    # before the closing quote is dropped. Trailing whitespace after the
    # attribute is skipped.
    #
    # Errors: no key -> errGtExpected, no '=' -> errEqExpected, unquoted or
    # unterminated value -> errQuoteExpected.
    my.kind = xmlAttribute
    setLen(my.a, 0)
    setLen(my.b, 0)
    parseName(my, my.a)
    # if we have no name, we have '<tag attr= key %&$$%':
    if my.a.len == 0:
      markError(my, errGtExpected)
      return
    parseWhitespace(my, skip=true)
    if my.buf[my.bufpos] != '=':
      markError(my, errEqExpected)
      return
    inc(my.bufpos)
    parseWhitespace(my, skip=true)

    var
      cur = my.bufpos
      data = my.buf
    if data[cur] notin {'\'', '"'}:
      markError(my, errQuoteExpected)
      # error corrections: guess what was meant
      while data[cur] != '>' and data[cur] > ' ':
        my.b.add(data[cur])
        inc(cur)
    else:
      let quote = data[cur]
      var pendingSpace = false      # whitespace seen but not yet emitted

      template flushPendingSpace() =
        if pendingSpace:
          my.b.add(' ')
          pendingSpace = false

      inc(cur)
      while true:
        let ch = data[cur]
        case ch
        of '\0':
          markError(my, errQuoteExpected)
          break
        of '&':
          flushPendingSpace()
          my.bufpos = cur
          parseEntity(my, my.b)
          my.kind = xmlAttribute    # parseEntity overwrites my.kind!
          cur = my.bufpos
        of ' ', '\t':
          pendingSpace = true
          inc(cur)
        of '\c':
          cur = lexbase.handleCR(my, cur)
          data = my.buf             # re-read the buffer after the lexbase call
          pendingSpace = true
        of '\L':
          cur = lexbase.handleLF(my, cur)
          data = my.buf
          pendingSpace = true
        of '/':
          # '/' is one of the refill characters registered in ``open``;
          # note that pending whitespace is not flushed in this branch
          cur = lexbase.handleRefillChar(my, cur)
          data = my.buf
          my.b.add('/')
        else:
          if ch == quote:
            inc(cur)
            break
          flushPendingSpace()
          my.b.add(ch)
          inc(cur)
    my.bufpos = cur
    parseWhitespace(my, skip=true)
  proc parseCharData(my: var XmlParser) =
    # Collects character data into my.a until a '<', a '&' or the end of
    # input is reached (the stop character is not consumed). Line ends are
    # stored as LF. The event becomes xmlCharData.
    var
      cur = my.bufpos
      data = my.buf
    while true:
      let ch = data[cur]
      case ch
      of '\0', '<', '&':
        break
      of '\c':
        # the specification says that CR-LF, CR are to be transformed to LF
        cur = lexbase.handleCR(my, cur)
        data = my.buf               # re-read the buffer after the lexbase call
        my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        data = my.buf
        my.a.add('\L')
      of '/':
        # '/' is one of the refill characters registered in ``open``
        cur = lexbase.handleRefillChar(my, cur)
        data = my.buf
        my.a.add('/')
      else:
        my.a.add(ch)
        inc(cur)
    my.bufpos = cur
    my.kind = xmlCharData
  proc rawGetTok(my: var XmlParser) =
    # Reads the next raw token, unfiltered: clears my.a and dispatches on the
    # character(s) at my.bufpos to the matching parse proc. On return my.kind
    # is never xmlError (asserted); errors are recorded via the parser state.
    my.kind = xmlError
    setLen(my.a, 0)
    let
      start = my.bufpos
      data = my.buf
    case data[start]
    of '<':
      case data[start+1]
      of '/':
        parseEndTag(my)
      of '!':
        let isCData =
          data[start+2] == '[' and data[start+3] == 'C' and
          data[start+4] == 'D' and data[start+5] == 'A' and
          data[start+6] == 'T' and data[start+7] == 'A' and
          data[start+8] == '['
        if isCData:
          parseCDATA(my)
        elif data[start+2] == '-' and data[start+3] == '-':
          parseComment(my)
        else:
          parseSpecial(my)
      of '?':
        parsePI(my)
      else:
        parseTag(my)
    of ' ', '\t', '\c', '\l':
      parseWhitespace(my)
      my.kind = xmlWhitespace
    of '\0':
      my.kind = xmlEof
    of '&':
      parseEntity(my, my.a)
    else:
      parseCharData(my)
    assert my.kind != xmlError
  proc getTok(my: var XmlParser) =
    # Reads raw tokens until one is found that should be delivered:
    # * comments only if reportComments is set,
    # * whitespace only if reportWhitespace is set or the previous event was
    #   character data, a comment or an entity,
    # * every other token always.
    while true:
      let previous = my.kind
      rawGetTok(my)
      let deliver =
        case my.kind
        of xmlComment:
          reportComments in my.options
        of xmlWhitespace:
          (reportWhitespace in my.options) or
            (previous in {xmlCharData, xmlComment, xmlEntity})
        else:
          true
      if deliver: break
  proc next*(my: var XmlParser) =
    ## Retrieves the first/next event. This controls the parser: call it
    ## repeatedly and inspect ``kind`` until ``xmlEof`` is reported.
    case my.state
    of stateStart:
      my.state = stateNormal
      getTok(my)
      if my.kind == xmlPI and my.a == "xml":
        # just skip the first ``<?xml >`` processing instruction
        getTok(my)
    of stateNormal:
      getTok(my)
    of stateAttr:
      # inside an open tag: either the tag ends here or an attribute follows
      let ch = my.buf[my.bufpos]
      if ch == '>':
        my.kind = xmlElementClose
        inc(my.bufpos)
        my.state = stateNormal
      elif ch == '/':
        # '/' is one of the refill characters registered in ``open``
        my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
        if my.buf[my.bufpos] != '>':
          # my.kind is left unchanged here; the error itself is delivered
          # by the next call
          markError(my, errGtExpected)
        else:
          # ``/>``: close the tag now, report the end tag on the next call
          my.kind = xmlElementClose
          inc(my.bufpos)
          my.state = stateEmptyElementTag
      else:
        # parse an attribute key-value pair; state remains the same
        parseAttribute(my)
    of stateEmptyElementTag:
      # synthesize the ``</elem>`` event of an empty-element tag
      my.kind = xmlElementEnd
      my.state = stateNormal
      if not isNil(my.c):
        my.a = my.c                 # element name saved by parseTag
    of stateError:
      my.state = stateNormal
      my.kind = xmlError
  when not defined(testing) and isMainModule:
    # Small command line driver: parses the file named by the first argument
    # and echoes every event in a readable form.
    import os
    # paramStr(1) must not be read when no argument was given
    if paramCount() < 1: quit("usage: parsexml <file>")
    let path = paramStr(1)
    var s = newFileStream(path, fmRead)
    # the separating space was missing, which fused the message and the path
    if s == nil: quit("cannot open the file " & path)
    var x: XmlParser
    open(x, s, path)
    while true:
      next(x)
      case x.kind
      of xmlError: echo(x.errorMsg())
      of xmlEof: break
      of xmlCharData: echo(x.charData)
      of xmlWhitespace: echo("|$1|" % x.charData)
      of xmlComment: echo("<!-- $1 -->" % x.charData)
      of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
      of xmlElementStart: echo("<$1>" % x.elementName)
      of xmlElementEnd: echo("</$1>" % x.elementName)
      of xmlElementOpen: echo("<$1" % x.elementName)
      of xmlAttribute:
        echo("Key: " & x.attrKey)
        echo("Value: " & x.attrValue)
      of xmlElementClose: echo(">")
      of xmlCData:
        echo("<![CDATA[$1]]>" % x.charData)
      of xmlEntity:
        echo("&$1;" % x.entityName)
      of xmlSpecial:
        echo("SPECIAL: " & x.charData)
    close(x)