parsexml.nim 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2010 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
  10. ## parser.
  11. ## The only encoding that is supported is UTF-8. The parser has been designed
  12. ## to be somewhat error correcting, so that even most "wild HTML" found on the
  13. ## web can be parsed with it. **Note:** This parser does not check that each
  14. ## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
  15. ## implemented by the client code for various reasons:
  16. ##
  17. ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
  18. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
  19. ## library can parse both, only the client knows which comparison is to be
  20. ## used.
  21. ## * Thus the checks would have been very difficult to implement properly with
  22. ## little benefit, especially since they are simple to implement in the
  23. ## client. The client should use the `errorMsgExpected` proc to generate
  24. ## a nice error message that fits the other error messages this library
  25. ## creates.
  26. ##
  27. ##
  28. ##[
  29. Example 1: Retrieve HTML title
  30. ==============================
  31. The file ``examples/htmltitle.nim`` demonstrates how to use the
  32. XML parser to accomplish a simple task: To determine the title of an HTML
  33. document.
  34. ```nim
  35. # Example program to show the parsexml module
  36. # This program reads an HTML file and writes its title to stdout.
  37. # Errors and whitespace are ignored.
  38. import std/[os, streams, parsexml, strutils]
  39. if paramCount() < 1:
  40. quit("Usage: htmltitle filename[.html]")
  41. var filename = addFileExt(paramStr(1), "html")
  42. var s = newFileStream(filename, fmRead)
  43. if s == nil: quit("cannot open the file " & filename)
  44. var x: XmlParser
  45. open(x, s, filename)
  46. while true:
  47. x.next()
  48. case x.kind
  49. of xmlElementStart:
  50. if cmpIgnoreCase(x.elementName, "title") == 0:
  51. var title = ""
  52. x.next() # skip "<title>"
  53. while x.kind == xmlCharData:
  54. title.add(x.charData)
  55. x.next()
  56. if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
  57. echo("Title: " & title)
  58. quit(0) # Success!
  59. else:
  60. echo(x.errorMsgExpected("/title"))
  61. of xmlEof: break # end of file reached
  62. else: discard # ignore other events
  63. x.close()
  64. quit("Could not determine title!")
  65. ```
  66. ]##
  67. ##[
  68. Example 2: Retrieve all HTML links
  69. ==================================
  70. The file ``examples/htmlrefs.nim`` demonstrates how to use the
  71. XML parser to accomplish another simple task: To determine all the links
  72. an HTML document contains.
  73. ```nim
  74. # Example program to show the new parsexml module
  75. # This program reads an HTML file and writes all its used links to stdout.
  76. # Errors and whitespace are ignored.
  77. import std/[os, streams, parsexml, strutils]
  78. proc `=?=` (a, b: string): bool =
  79. # little trick: define our own comparator that ignores case
  80. return cmpIgnoreCase(a, b) == 0
  81. if paramCount() < 1:
  82. quit("Usage: htmlrefs filename[.html]")
  83. var links = 0 # count the number of links
  84. var filename = addFileExt(paramStr(1), "html")
  85. var s = newFileStream(filename, fmRead)
  86. if s == nil: quit("cannot open the file " & filename)
  87. var x: XmlParser
  88. open(x, s, filename)
  89. next(x) # get first event
  90. block mainLoop:
  91. while true:
  92. case x.kind
  93. of xmlElementOpen:
  94. # the <a href = "xyz"> tag we are interested in always has an attribute,
  95. # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
  96. if x.elementName =?= "a":
  97. x.next()
  98. if x.kind == xmlAttribute:
  99. if x.attrKey =?= "href":
  100. var link = x.attrValue
  101. inc(links)
  102. # skip until we have an ``xmlElementClose`` event
  103. while true:
  104. x.next()
  105. case x.kind
  106. of xmlEof: break mainLoop
  107. of xmlElementClose: break
  108. else: discard
  109. x.next() # skip ``xmlElementClose``
  110. # now we have the description for the ``a`` element
  111. var desc = ""
  112. while x.kind == xmlCharData:
  113. desc.add(x.charData)
  114. x.next()
  115. echo(desc & ": " & link)
  116. else:
  117. x.next()
  118. of xmlEof: break # end of file reached
  119. of xmlError:
  120. echo(errorMsg(x))
  121. x.next()
  122. else: x.next() # skip other events
  123. echo($links & " link(s) found!")
  124. x.close()
  125. ```
  126. ]##
  127. import
  128. std/[strutils, lexbase, streams, unicode]
  129. when defined(nimPreviewSlimSystem):
  130. import std/[assertions, syncio]
  131. # the parser treats ``<br />`` as ``<br></br>``
  132. # xmlElementCloseEnd, ## ``/>``
  133. type
  134. XmlEventKind* = enum ## enumeration of all events that may occur when parsing
  135. xmlError, ## an error occurred during parsing
  136. xmlEof, ## end of file reached
  137. xmlCharData, ## character data
  138. xmlWhitespace, ## whitespace has been parsed
  139. xmlComment, ## a comment has been parsed
  140. xmlPI, ## processing instruction (``<?name something ?>``)
  141. xmlElementStart, ## ``<elem>``
  142. xmlElementEnd, ## ``</elem>``
  143. xmlElementOpen, ## ``<elem
  144. xmlAttribute, ## ``key = "value"`` pair
  145. xmlElementClose, ## ``>``
  146. xmlCData, ## ``<![CDATA[`` ... data ... ``]]>``
  147. xmlEntity, ## &entity;
  148. xmlSpecial ## ``<! ... data ... >``
  149. XmlErrorKind* = enum ## enumeration that lists all errors that can occur
  150. errNone, ## no error
  151. errEndOfCDataExpected, ## ``]]>`` expected
  152. errNameExpected, ## name expected
  153. errSemicolonExpected, ## ``;`` expected
  154. errQmGtExpected, ## ``?>`` expected
  155. errGtExpected, ## ``>`` expected
  156. errEqExpected, ## ``=`` expected
  157. errQuoteExpected, ## ``"`` or ``'`` expected
  158. errEndOfCommentExpected ## ``-->`` expected
  159. errAttributeValueExpected ## non-empty attribute value expected
  160. ParserState = enum
  161. stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
  162. XmlParseOption* = enum ## options for the XML parser
  163. reportWhitespace, ## report whitespace
  164. reportComments ## report comments
  165. allowUnquotedAttribs ## allow unquoted attribute values (for HTML)
  166. allowEmptyAttribs ## allow empty attributes (without explicit value)
  167. XmlParser* = object of BaseLexer ## the parser object.
  168. a, b, c: string
  169. kind: XmlEventKind
  170. err: XmlErrorKind
  171. state: ParserState
  172. cIsEmpty: bool
  173. filename: string
  174. options: set[XmlParseOption]
  175. const
  176. errorMessages: array[XmlErrorKind, string] = [
  177. "no error",
  178. "']]>' expected",
  179. "name expected",
  180. "';' expected",
  181. "'?>' expected",
  182. "'>' expected",
  183. "'=' expected",
  184. "'\"' or \"'\" expected",
  185. "'-->' expected",
  186. "attribute value expected"
  187. ]
  188. proc open*(my: var XmlParser, input: Stream, filename: string,
  189. options: set[XmlParseOption] = {}) =
  190. ## initializes the parser with an input stream. `Filename` is only used
  191. ## for nice error messages. The parser's behaviour can be controlled by
  192. ## the `options` parameter: If `options` contains ``reportWhitespace``
  193. ## a whitespace token is reported as an ``xmlWhitespace`` event.
  194. ## If `options` contains ``reportComments`` a comment token is reported as an
  195. ## ``xmlComment`` event.
  196. lexbase.open(my, input, 8192, {'\c', '\L', '/'})
  197. my.filename = filename
  198. my.state = stateStart
  199. my.kind = xmlError
  200. my.a = ""
  201. my.b = ""
  202. my.c = ""
  203. my.cIsEmpty = true
  204. my.options = options
  205. proc close*(my: var XmlParser) {.inline.} =
  206. ## closes the parser `my` and its associated input stream.
  207. lexbase.close(my)
  208. proc kind*(my: XmlParser): XmlEventKind {.inline.} =
  209. ## returns the current event type for the XML parser
  210. return my.kind
  211. template charData*(my: XmlParser): string =
  212. ## returns the character data for the events: ``xmlCharData``,
  213. ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  214. ## Raises an assertion in debug mode if ``my.kind`` is not one
  215. ## of those events. In release mode, this will not trigger an error
  216. ## but the value returned will not be valid.
  217. assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
  218. xmlSpecial})
  219. my.a
  220. template elementName*(my: XmlParser): string =
  221. ## returns the element name for the events: ``xmlElementStart``,
  222. ## ``xmlElementEnd``, ``xmlElementOpen``
  223. ## Raises an assertion in debug mode if ``my.kind`` is not one
  224. ## of those events. In release mode, this will not trigger an error
  225. ## but the value returned will not be valid.
  226. assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  227. my.a
  228. template entityName*(my: XmlParser): string =
  229. ## returns the entity name for the event: ``xmlEntity``
  230. ## Raises an assertion in debug mode if ``my.kind`` is not
  231. ## ``xmlEntity``. In release mode, this will not trigger an error
  232. ## but the value returned will not be valid.
  233. assert(my.kind == xmlEntity)
  234. my.a
  235. template attrKey*(my: XmlParser): string =
  236. ## returns the attribute key for the event ``xmlAttribute``
  237. ## Raises an assertion in debug mode if ``my.kind`` is not
  238. ## ``xmlAttribute``. In release mode, this will not trigger an error
  239. ## but the value returned will not be valid.
  240. assert(my.kind == xmlAttribute)
  241. my.a
  242. template attrValue*(my: XmlParser): string =
  243. ## returns the attribute value for the event ``xmlAttribute``
  244. ## Raises an assertion in debug mode if ``my.kind`` is not
  245. ## ``xmlAttribute``. In release mode, this will not trigger an error
  246. ## but the value returned will not be valid.
  247. assert(my.kind == xmlAttribute)
  248. my.b
  249. template piName*(my: XmlParser): string =
  250. ## returns the processing instruction name for the event ``xmlPI``
  251. ## Raises an assertion in debug mode if ``my.kind`` is not
  252. ## ``xmlPI``. In release mode, this will not trigger an error
  253. ## but the value returned will not be valid.
  254. assert(my.kind == xmlPI)
  255. my.a
  256. template piRest*(my: XmlParser): string =
  257. ## returns the rest of the processing instruction for the event ``xmlPI``
  258. ## Raises an assertion in debug mode if ``my.kind`` is not
  259. ## ``xmlPI``. In release mode, this will not trigger an error
  260. ## but the value returned will not be valid.
  261. assert(my.kind == xmlPI)
  262. my.b
  263. proc rawData*(my: var XmlParser): lent string {.inline.} =
  264. ## returns the underlying 'data' string by reference.
  265. ## This is only used for speed hacks.
  266. result = my.a
  267. proc rawData2*(my: var XmlParser): lent string {.inline.} =
  268. ## returns the underlying second 'data' string by reference.
  269. ## This is only used for speed hacks.
  270. result = my.b
  271. proc getColumn*(my: XmlParser): int {.inline.} =
  272. ## get the current column the parser has arrived at.
  273. result = getColNumber(my, my.bufpos)
  274. proc getLine*(my: XmlParser): int {.inline.} =
  275. ## get the current line the parser has arrived at.
  276. result = my.lineNumber
  277. proc getFilename*(my: XmlParser): string {.inline.} =
  278. ## get the filename of the file that the parser processes.
  279. result = my.filename
  280. proc errorMsg*(my: XmlParser): string =
  281. ## returns a helpful error message for the event ``xmlError``
  282. assert(my.kind == xmlError)
  283. result = "$1($2, $3) Error: $4" % [
  284. my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
  285. proc errorMsgExpected*(my: XmlParser, tag: string): string =
  286. ## returns an error message "<tag> expected" in the same format as the
  287. ## other error messages
  288. result = "$1($2, $3) Error: $4" % [
  289. my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
  290. proc errorMsg*(my: XmlParser, msg: string): string =
  291. ## returns an error message with text `msg` in the same format as the
  292. ## other error messages
  293. result = "$1($2, $3) Error: $4" % [
  294. my.filename, $getLine(my), $getColumn(my), msg]
  295. proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
  296. my.err = kind
  297. my.state = stateError
  298. proc parseCDATA(my: var XmlParser) =
  299. var pos = my.bufpos + len("<![CDATA[")
  300. while true:
  301. case my.buf[pos]
  302. of ']':
  303. if my.buf[pos+1] == ']' and my.buf[pos+2] == '>':
  304. inc(pos, 3)
  305. break
  306. add(my.a, ']')
  307. inc(pos)
  308. of '\0':
  309. markError(my, errEndOfCDataExpected)
  310. break
  311. of '\c':
  312. pos = lexbase.handleCR(my, pos)
  313. add(my.a, '\L')
  314. of '\L':
  315. pos = lexbase.handleLF(my, pos)
  316. add(my.a, '\L')
  317. of '/':
  318. pos = lexbase.handleRefillChar(my, pos)
  319. add(my.a, '/')
  320. else:
  321. add(my.a, my.buf[pos])
  322. inc(pos)
  323. my.bufpos = pos # store back
  324. my.kind = xmlCData
  325. proc parseComment(my: var XmlParser) =
  326. var pos = my.bufpos + len("<!--")
  327. while true:
  328. case my.buf[pos]
  329. of '-':
  330. if my.buf[pos+1] == '-' and my.buf[pos+2] == '>':
  331. inc(pos, 3)
  332. break
  333. if my.options.contains(reportComments): add(my.a, '-')
  334. inc(pos)
  335. of '\0':
  336. markError(my, errEndOfCommentExpected)
  337. break
  338. of '\c':
  339. pos = lexbase.handleCR(my, pos)
  340. if my.options.contains(reportComments): add(my.a, '\L')
  341. of '\L':
  342. pos = lexbase.handleLF(my, pos)
  343. if my.options.contains(reportComments): add(my.a, '\L')
  344. of '/':
  345. pos = lexbase.handleRefillChar(my, pos)
  346. if my.options.contains(reportComments): add(my.a, '/')
  347. else:
  348. if my.options.contains(reportComments): add(my.a, my.buf[pos])
  349. inc(pos)
  350. my.bufpos = pos
  351. my.kind = xmlComment
  352. proc parseWhitespace(my: var XmlParser, skip = false) =
  353. var pos = my.bufpos
  354. while true:
  355. case my.buf[pos]
  356. of ' ', '\t':
  357. if not skip: add(my.a, my.buf[pos])
  358. inc(pos)
  359. of '\c':
  360. # the specification says that CR-LF, CR are to be transformed to LF
  361. pos = lexbase.handleCR(my, pos)
  362. if not skip: add(my.a, '\L')
  363. of '\L':
  364. pos = lexbase.handleLF(my, pos)
  365. if not skip: add(my.a, '\L')
  366. else:
  367. break
  368. my.bufpos = pos
  369. const
  370. NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
  371. NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
  372. proc parseName(my: var XmlParser, dest: var string) =
  373. var pos = my.bufpos
  374. if my.buf[pos] in NameStartChar:
  375. while true:
  376. add(dest, my.buf[pos])
  377. inc(pos)
  378. if my.buf[pos] notin NameChar: break
  379. my.bufpos = pos
  380. else:
  381. markError(my, errNameExpected)
  382. proc parseEntity(my: var XmlParser, dest: var string) =
  383. var pos = my.bufpos+1
  384. my.kind = xmlCharData
  385. if my.buf[pos] == '#':
  386. var r: int
  387. inc(pos)
  388. if my.buf[pos] == 'x':
  389. inc(pos)
  390. while true:
  391. case my.buf[pos]
  392. of '0'..'9': r = (r shl 4) or (ord(my.buf[pos]) - ord('0'))
  393. of 'a'..'f': r = (r shl 4) or (ord(my.buf[pos]) - ord('a') + 10)
  394. of 'A'..'F': r = (r shl 4) or (ord(my.buf[pos]) - ord('A') + 10)
  395. else: break
  396. inc(pos)
  397. else:
  398. while my.buf[pos] in {'0'..'9'}:
  399. r = r * 10 + (ord(my.buf[pos]) - ord('0'))
  400. inc(pos)
  401. add(dest, toUTF8(Rune(r)))
  402. elif my.buf[pos] == 'l' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  403. add(dest, '<')
  404. inc(pos, 2)
  405. elif my.buf[pos] == 'g' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  406. add(dest, '>')
  407. inc(pos, 2)
  408. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'm' and my.buf[pos+2] == 'p' and
  409. my.buf[pos+3] == ';':
  410. add(dest, '&')
  411. inc(pos, 3)
  412. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'p' and my.buf[pos+2] == 'o' and
  413. my.buf[pos+3] == 's' and my.buf[pos+4] == ';':
  414. add(dest, '\'')
  415. inc(pos, 4)
  416. elif my.buf[pos] == 'q' and my.buf[pos+1] == 'u' and my.buf[pos+2] == 'o' and
  417. my.buf[pos+3] == 't' and my.buf[pos+4] == ';':
  418. add(dest, '"')
  419. inc(pos, 4)
  420. else:
  421. my.bufpos = pos
  422. var name = ""
  423. parseName(my, name)
  424. pos = my.bufpos
  425. if my.err != errNameExpected and my.buf[pos] == ';':
  426. my.kind = xmlEntity
  427. else:
  428. add(dest, '&')
  429. add(dest, name)
  430. if my.buf[pos] == ';':
  431. inc(pos)
  432. else:
  433. my.err = errSemicolonExpected
  434. # do not overwrite 'my.state' here, it's a benign error
  435. my.bufpos = pos
  436. proc parsePI(my: var XmlParser) =
  437. inc(my.bufpos, "<?".len)
  438. parseName(my, my.a)
  439. var pos = my.bufpos
  440. setLen(my.b, 0)
  441. while true:
  442. case my.buf[pos]
  443. of '\0':
  444. markError(my, errQmGtExpected)
  445. break
  446. of '?':
  447. if my.buf[pos+1] == '>':
  448. inc(pos, 2)
  449. break
  450. add(my.b, '?')
  451. inc(pos)
  452. of '\c':
  453. # the specification says that CR-LF, CR are to be transformed to LF
  454. pos = lexbase.handleCR(my, pos)
  455. add(my.b, '\L')
  456. of '\L':
  457. pos = lexbase.handleLF(my, pos)
  458. add(my.b, '\L')
  459. of '/':
  460. pos = lexbase.handleRefillChar(my, pos)
  461. add(my.b, '/')
  462. else:
  463. add(my.b, my.buf[pos])
  464. inc(pos)
  465. my.bufpos = pos
  466. my.kind = xmlPI
  467. proc parseSpecial(my: var XmlParser) =
  468. # things that start with <!
  469. var pos = my.bufpos + 2
  470. var opentags = 0
  471. while true:
  472. case my.buf[pos]
  473. of '\0':
  474. markError(my, errGtExpected)
  475. break
  476. of '<':
  477. inc(opentags)
  478. inc(pos)
  479. add(my.a, '<')
  480. of '>':
  481. if opentags <= 0:
  482. inc(pos)
  483. break
  484. dec(opentags)
  485. inc(pos)
  486. add(my.a, '>')
  487. of '\c':
  488. pos = lexbase.handleCR(my, pos)
  489. add(my.a, '\L')
  490. of '\L':
  491. pos = lexbase.handleLF(my, pos)
  492. add(my.a, '\L')
  493. of '/':
  494. pos = lexbase.handleRefillChar(my, pos)
  495. add(my.b, '/')
  496. else:
  497. add(my.a, my.buf[pos])
  498. inc(pos)
  499. my.bufpos = pos
  500. my.kind = xmlSpecial
  501. proc parseTag(my: var XmlParser) =
  502. inc(my.bufpos)
  503. parseName(my, my.a)
  504. # if we have no name, do not interpret the '<':
  505. if my.a.len == 0:
  506. my.kind = xmlCharData
  507. add(my.a, '<')
  508. return
  509. parseWhitespace(my, skip = true)
  510. if my.buf[my.bufpos] in NameStartChar:
  511. # an attribute follows:
  512. my.kind = xmlElementOpen
  513. my.state = stateAttr
  514. my.c = my.a # save for later
  515. my.cIsEmpty = false
  516. else:
  517. my.kind = xmlElementStart
  518. let slash = my.buf[my.bufpos] == '/'
  519. if slash:
  520. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  521. if slash and my.buf[my.bufpos] == '>':
  522. inc(my.bufpos)
  523. my.state = stateEmptyElementTag
  524. my.c = ""
  525. my.cIsEmpty = true
  526. elif my.buf[my.bufpos] == '>':
  527. inc(my.bufpos)
  528. else:
  529. markError(my, errGtExpected)
  530. proc parseEndTag(my: var XmlParser) =
  531. my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
  532. #inc(my.bufpos, 2)
  533. parseName(my, my.a)
  534. parseWhitespace(my, skip = true)
  535. if my.buf[my.bufpos] == '>':
  536. inc(my.bufpos)
  537. else:
  538. markError(my, errGtExpected)
  539. my.kind = xmlElementEnd
  540. proc parseAttribute(my: var XmlParser) =
  541. my.kind = xmlAttribute
  542. setLen(my.a, 0)
  543. setLen(my.b, 0)
  544. parseName(my, my.a)
  545. # if we have no name, we have '<tag attr= key %&$$%':
  546. if my.a.len == 0:
  547. markError(my, errGtExpected)
  548. return
  549. let startPos = my.bufpos
  550. parseWhitespace(my, skip = true)
  551. if my.buf[my.bufpos] != '=':
  552. if allowEmptyAttribs notin my.options or
  553. (my.buf[my.bufpos] != '>' and my.bufpos == startPos):
  554. markError(my, errEqExpected)
  555. return
  556. inc(my.bufpos)
  557. parseWhitespace(my, skip = true)
  558. var pos = my.bufpos
  559. if my.buf[pos] in {'\'', '"'}:
  560. var quote = my.buf[pos]
  561. var pendingSpace = false
  562. inc(pos)
  563. while true:
  564. case my.buf[pos]
  565. of '\0':
  566. markError(my, errQuoteExpected)
  567. break
  568. of '&':
  569. if pendingSpace:
  570. add(my.b, ' ')
  571. pendingSpace = false
  572. my.bufpos = pos
  573. parseEntity(my, my.b)
  574. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  575. pos = my.bufpos
  576. of ' ', '\t':
  577. pendingSpace = true
  578. inc(pos)
  579. of '\c':
  580. pos = lexbase.handleCR(my, pos)
  581. pendingSpace = true
  582. of '\L':
  583. pos = lexbase.handleLF(my, pos)
  584. pendingSpace = true
  585. of '/':
  586. pos = lexbase.handleRefillChar(my, pos)
  587. add(my.b, '/')
  588. else:
  589. if my.buf[pos] == quote:
  590. inc(pos)
  591. break
  592. else:
  593. if pendingSpace:
  594. add(my.b, ' ')
  595. pendingSpace = false
  596. add(my.b, my.buf[pos])
  597. inc(pos)
  598. elif allowUnquotedAttribs in my.options:
  599. const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
  600. '\0', '\t', '\L', '\F', '\f'}
  601. let startPos = pos
  602. while (let c = my.buf[pos]; c notin disallowedChars):
  603. if c == '&':
  604. my.bufpos = pos
  605. parseEntity(my, my.b)
  606. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  607. pos = my.bufpos
  608. elif c == '/':
  609. pos = lexbase.handleRefillChar(my, pos)
  610. add(my.b, '/')
  611. else:
  612. add(my.b, c)
  613. inc(pos)
  614. if pos == startPos:
  615. markError(my, errAttributeValueExpected)
  616. else:
  617. markError(my, errQuoteExpected)
  618. # error corrections: guess what was meant
  619. while my.buf[pos] != '>' and my.buf[pos] > ' ':
  620. add(my.b, my.buf[pos])
  621. inc pos
  622. my.bufpos = pos
  623. parseWhitespace(my, skip = true)
  624. proc parseCharData(my: var XmlParser) =
  625. var pos = my.bufpos
  626. while true:
  627. case my.buf[pos]
  628. of '\0', '<', '&': break
  629. of '\c':
  630. # the specification says that CR-LF, CR are to be transformed to LF
  631. pos = lexbase.handleCR(my, pos)
  632. add(my.a, '\L')
  633. of '\L':
  634. pos = lexbase.handleLF(my, pos)
  635. add(my.a, '\L')
  636. of '/':
  637. pos = lexbase.handleRefillChar(my, pos)
  638. add(my.a, '/')
  639. else:
  640. add(my.a, my.buf[pos])
  641. inc(pos)
  642. my.bufpos = pos
  643. my.kind = xmlCharData
  644. proc rawGetTok(my: var XmlParser) =
  645. my.kind = xmlError
  646. setLen(my.a, 0)
  647. var pos = my.bufpos
  648. case my.buf[pos]
  649. of '<':
  650. case my.buf[pos+1]
  651. of '/':
  652. parseEndTag(my)
  653. of '!':
  654. if my.buf[pos+2] == '[' and my.buf[pos+3] == 'C' and
  655. my.buf[pos+4] == 'D' and my.buf[pos+5] == 'A' and
  656. my.buf[pos+6] == 'T' and my.buf[pos+7] == 'A' and
  657. my.buf[pos+8] == '[':
  658. parseCDATA(my)
  659. elif my.buf[pos+2] == '-' and my.buf[pos+3] == '-':
  660. parseComment(my)
  661. else:
  662. parseSpecial(my)
  663. of '?':
  664. parsePI(my)
  665. else:
  666. parseTag(my)
  667. of ' ', '\t', '\c', '\l':
  668. parseWhitespace(my)
  669. my.kind = xmlWhitespace
  670. of '\0':
  671. my.kind = xmlEof
  672. of '&':
  673. parseEntity(my, my.a)
  674. else:
  675. parseCharData(my)
  676. assert my.kind != xmlError
  677. proc getTok(my: var XmlParser) =
  678. while true:
  679. let lastKind = my.kind
  680. rawGetTok(my)
  681. case my.kind
  682. of xmlComment:
  683. if my.options.contains(reportComments): break
  684. of xmlWhitespace:
  685. if my.options.contains(reportWhitespace) or lastKind in {xmlCharData,
  686. xmlComment, xmlEntity}:
  687. break
  688. else: break
  689. proc next*(my: var XmlParser) =
  690. ## retrieves the first/next event. This controls the parser.
  691. case my.state
  692. of stateNormal:
  693. getTok(my)
  694. of stateStart:
  695. my.state = stateNormal
  696. getTok(my)
  697. if my.kind == xmlPI and my.a == "xml":
  698. # just skip the first ``<?xml >`` processing instruction
  699. getTok(my)
  700. of stateAttr:
  701. # parse an attribute key-value pair:
  702. if my.buf[my.bufpos] == '>':
  703. my.kind = xmlElementClose
  704. inc(my.bufpos)
  705. my.state = stateNormal
  706. elif my.buf[my.bufpos] == '/':
  707. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  708. if my.buf[my.bufpos] == '>':
  709. my.kind = xmlElementClose
  710. inc(my.bufpos)
  711. my.state = stateEmptyElementTag
  712. else:
  713. markError(my, errGtExpected)
  714. else:
  715. parseAttribute(my)
  716. # state remains the same
  717. of stateEmptyElementTag:
  718. my.state = stateNormal
  719. my.kind = xmlElementEnd
  720. if not my.cIsEmpty:
  721. my.a = my.c
  722. of stateError:
  723. my.kind = xmlError
  724. my.state = stateNormal
  725. when not defined(testing) and isMainModule:
  726. import std/os
  727. var s = newFileStream(paramStr(1), fmRead)
  728. if s == nil: quit("cannot open the file" & paramStr(1))
  729. var x: XmlParser
  730. open(x, s, paramStr(1))
  731. while true:
  732. next(x)
  733. case x.kind
  734. of xmlError: echo(x.errorMsg())
  735. of xmlEof: break
  736. of xmlCharData: echo(x.charData)
  737. of xmlWhitespace: echo("|$1|" % x.charData)
  738. of xmlComment: echo("<!-- $1 -->" % x.charData)
  739. of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
  740. of xmlElementStart: echo("<$1>" % x.elementName)
  741. of xmlElementEnd: echo("</$1>" % x.elementName)
  742. of xmlElementOpen: echo("<$1" % x.elementName)
  743. of xmlAttribute:
  744. echo("Key: " & x.attrKey)
  745. echo("Value: " & x.attrValue)
  746. of xmlElementClose: echo(">")
  747. of xmlCData:
  748. echo("<![CDATA[$1]]>" % x.charData)
  749. of xmlEntity:
  750. echo("&$1;" % x.entityName)
  751. of xmlSpecial:
  752. echo("SPECIAL: " & x.charData)
  753. close(x)