parsexml.nim 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2010 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
  10. ## parser.
  11. ## The only encoding that is supported is UTF-8. The parser has been designed
  12. ## to be somewhat error correcting, so that even most "wild HTML" found on the
  13. ## web can be parsed with it. **Note:** This parser does not check that each
  14. ## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
  15. ## implemented by the client code for various reasons:
  16. ##
  17. ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
  18. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
  19. ## library can parse both, only the client knows which comparison is to be
  20. ## used.
  21. ## * Thus the checks would have been very difficult to implement properly with
  22. ## little benefit, especially since they are simple to implement in the
  23. ## client. The client should use the `errorMsgExpected` proc to generate
  24. ## a nice error message that fits the other error messages this library
  25. ## creates.
  26. ##
  27. ##
  28. ##[
  29. Example 1: Retrieve HTML title
  30. ==============================
  31. The file ``examples/htmltitle.nim`` demonstrates how to use the
  32. XML parser to accomplish a simple task: To determine the title of an HTML
  33. document.
  34. .. code-block:: nim
  35. # Example program to show the parsexml module
  36. # This program reads an HTML file and writes its title to stdout.
  37. # Errors and whitespace are ignored.
  38. import os, streams, parsexml, strutils
  39. if paramCount() < 1:
  40. quit("Usage: htmltitle filename[.html]")
  41. var filename = addFileExt(paramStr(1), "html")
  42. var s = newFileStream(filename, fmRead)
  43. if s == nil: quit("cannot open the file " & filename)
  44. var x: XmlParser
  45. open(x, s, filename)
  46. while true:
  47. x.next()
  48. case x.kind
  49. of xmlElementStart:
  50. if cmpIgnoreCase(x.elementName, "title") == 0:
  51. var title = ""
  52. x.next() # skip "<title>"
  53. while x.kind == xmlCharData:
  54. title.add(x.charData)
  55. x.next()
  56. if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
  57. echo("Title: " & title)
  58. quit(0) # Success!
  59. else:
  60. echo(x.errorMsgExpected("/title"))
  61. of xmlEof: break # end of file reached
  62. else: discard # ignore other events
  63. x.close()
  64. quit("Could not determine title!")
  65. ]##
  66. ##[
  67. Example 2: Retrieve all HTML links
  68. ==================================
  69. The file ``examples/htmlrefs.nim`` demonstrates how to use the
  70. XML parser to accomplish another simple task: To determine all the links
  71. an HTML document contains.
  72. .. code-block:: nim
  73. # Example program to show the new parsexml module
  74. # This program reads an HTML file and writes all its used links to stdout.
  75. # Errors and whitespace are ignored.
  76. import os, streams, parsexml, strutils
  77. proc `=?=` (a, b: string): bool =
  78. # little trick: define our own comparator that ignores case
  79. return cmpIgnoreCase(a, b) == 0
  80. if paramCount() < 1:
  81. quit("Usage: htmlrefs filename[.html]")
  82. var links = 0 # count the number of links
  83. var filename = addFileExt(paramStr(1), "html")
  84. var s = newFileStream(filename, fmRead)
  85. if s == nil: quit("cannot open the file " & filename)
  86. var x: XmlParser
  87. open(x, s, filename)
  88. next(x) # get first event
  89. block mainLoop:
  90. while true:
  91. case x.kind
  92. of xmlElementOpen:
  93. # the <a href = "xyz"> tag we are interested in always has an attribute,
  94. # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
  95. if x.elementName =?= "a":
  96. x.next()
  97. if x.kind == xmlAttribute:
  98. if x.attrKey =?= "href":
  99. var link = x.attrValue
  100. inc(links)
  101. # skip until we have an ``xmlElementClose`` event
  102. while true:
  103. x.next()
  104. case x.kind
  105. of xmlEof: break mainLoop
  106. of xmlElementClose: break
  107. else: discard
  108. x.next() # skip ``xmlElementClose``
  109. # now we have the description for the ``a`` element
  110. var desc = ""
  111. while x.kind == xmlCharData:
  112. desc.add(x.charData)
  113. x.next()
  114. echo(desc & ": " & link)
  115. else:
  116. x.next()
  117. of xmlEof: break # end of file reached
  118. of xmlError:
  119. echo(errorMsg(x))
  120. x.next()
  121. else: x.next() # skip other events
  122. echo($links & " link(s) found!")
  123. x.close()
  124. ]##
  125. import
  126. strutils, lexbase, streams, unicode
  127. when defined(nimPreviewSlimSystem):
  128. import std/assertions
  129. # the parser treats ``<br />`` as ``<br></br>``
  130. # xmlElementCloseEnd, ## ``/>``
type
  XmlEventKind* = enum ## enumeration of all events that may occur when parsing
    xmlError,          ## an error occurred during parsing
    xmlEof,            ## end of file reached
    xmlCharData,       ## character data
    xmlWhitespace,     ## whitespace has been parsed
    xmlComment,        ## a comment has been parsed
    xmlPI,             ## processing instruction (``<?name something ?>``)
    xmlElementStart,   ## ``<elem>``
    xmlElementEnd,     ## ``</elem>``
    xmlElementOpen,    ## ``<elem
    xmlAttribute,      ## ``key = "value"`` pair
    xmlElementClose,   ## ``>``
    xmlCData,          ## ``<![CDATA[`` ... data ... ``]]>``
    xmlEntity,         ## &entity;
    xmlSpecial         ## ``<! ... data ... >``

  XmlErrorKind* = enum        ## enumeration that lists all errors that can occur
    errNone,                  ## no error
    errEndOfCDataExpected,    ## ``]]>`` expected
    errNameExpected,          ## name expected
    errSemicolonExpected,     ## ``;`` expected
    errQmGtExpected,          ## ``?>`` expected
    errGtExpected,            ## ``>`` expected
    errEqExpected,            ## ``=`` expected
    errQuoteExpected,         ## ``"`` or ``'`` expected
    errEndOfCommentExpected   ## ``-->`` expected
    errAttributeValueExpected ## non-empty attribute value expected

  # Internal state machine driven by `next`:
  # * stateStart           - set by `open`; the first `next` call skips a
  #                          leading ``<?xml ...?>`` processing instruction
  # * stateNormal          - tokens are fetched through `getTok`
  # * stateAttr            - inside an opening tag, attributes are being read
  # * stateEmptyElementTag - a ``/>`` was consumed; the following `next`
  #                          call reports ``xmlElementEnd``
  # * stateError           - set by `markError`; the following `next` call
  #                          reports ``xmlError`` and returns to stateNormal
  ParserState = enum
    stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError

  XmlParseOption* = enum ## options for the XML parser
    reportWhitespace,    ## report whitespace
    reportComments       ## report comments
    allowUnquotedAttribs ## allow unquoted attribute values (for HTML)
    allowEmptyAttribs    ## allow empty attributes (without explicit value)

  XmlParser* = object of BaseLexer ## the parser object.
    # `a` holds the primary payload of the current event (character data,
    # element/entity/PI name, attribute key); `b` holds the secondary
    # payload (attribute value, rest of a PI); `c` keeps the element name
    # saved by `parseTag` so it can be restored for an empty-element tag.
    a, b, c: string
    kind: XmlEventKind            # event produced by the last `next` call
    err: XmlErrorKind             # most recent error recorded by the parser
    state: ParserState            # see the notes on `ParserState` above
    cIsEmpty: bool                # true when `c` holds no saved element name
    filename: string              # only used when formatting error messages
    options: set[XmlParseOption]  # options passed to `open`
  173. const
  174. errorMessages: array[XmlErrorKind, string] = [
  175. "no error",
  176. "']]>' expected",
  177. "name expected",
  178. "';' expected",
  179. "'?>' expected",
  180. "'>' expected",
  181. "'=' expected",
  182. "'\"' or \"'\" expected",
  183. "'-->' expected",
  184. "attribute value expected"
  185. ]
  186. proc open*(my: var XmlParser, input: Stream, filename: string,
  187. options: set[XmlParseOption] = {}) =
  188. ## initializes the parser with an input stream. `Filename` is only used
  189. ## for nice error messages. The parser's behaviour can be controlled by
  190. ## the `options` parameter: If `options` contains ``reportWhitespace``
  191. ## a whitespace token is reported as an ``xmlWhitespace`` event.
  192. ## If `options` contains ``reportComments`` a comment token is reported as an
  193. ## ``xmlComment`` event.
  194. lexbase.open(my, input, 8192, {'\c', '\L', '/'})
  195. my.filename = filename
  196. my.state = stateStart
  197. my.kind = xmlError
  198. my.a = ""
  199. my.b = ""
  200. my.c = ""
  201. my.cIsEmpty = true
  202. my.options = options
  203. proc close*(my: var XmlParser) {.inline.} =
  204. ## closes the parser `my` and its associated input stream.
  205. lexbase.close(my)
  206. proc kind*(my: XmlParser): XmlEventKind {.inline.} =
  207. ## returns the current event type for the XML parser
  208. return my.kind
  209. template charData*(my: XmlParser): string =
  210. ## returns the character data for the events: ``xmlCharData``,
  211. ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  212. ## Raises an assertion in debug mode if ``my.kind`` is not one
  213. ## of those events. In release mode, this will not trigger an error
  214. ## but the value returned will not be valid.
  215. assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
  216. xmlSpecial})
  217. my.a
  218. template elementName*(my: XmlParser): string =
  219. ## returns the element name for the events: ``xmlElementStart``,
  220. ## ``xmlElementEnd``, ``xmlElementOpen``
  221. ## Raises an assertion in debug mode if ``my.kind`` is not one
  222. ## of those events. In release mode, this will not trigger an error
  223. ## but the value returned will not be valid.
  224. assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  225. my.a
  226. template entityName*(my: XmlParser): string =
  227. ## returns the entity name for the event: ``xmlEntity``
  228. ## Raises an assertion in debug mode if ``my.kind`` is not
  229. ## ``xmlEntity``. In release mode, this will not trigger an error
  230. ## but the value returned will not be valid.
  231. assert(my.kind == xmlEntity)
  232. my.a
  233. template attrKey*(my: XmlParser): string =
  234. ## returns the attribute key for the event ``xmlAttribute``
  235. ## Raises an assertion in debug mode if ``my.kind`` is not
  236. ## ``xmlAttribute``. In release mode, this will not trigger an error
  237. ## but the value returned will not be valid.
  238. assert(my.kind == xmlAttribute)
  239. my.a
  240. template attrValue*(my: XmlParser): string =
  241. ## returns the attribute value for the event ``xmlAttribute``
  242. ## Raises an assertion in debug mode if ``my.kind`` is not
  243. ## ``xmlAttribute``. In release mode, this will not trigger an error
  244. ## but the value returned will not be valid.
  245. assert(my.kind == xmlAttribute)
  246. my.b
  247. template piName*(my: XmlParser): string =
  248. ## returns the processing instruction name for the event ``xmlPI``
  249. ## Raises an assertion in debug mode if ``my.kind`` is not
  250. ## ``xmlPI``. In release mode, this will not trigger an error
  251. ## but the value returned will not be valid.
  252. assert(my.kind == xmlPI)
  253. my.a
  254. template piRest*(my: XmlParser): string =
  255. ## returns the rest of the processing instruction for the event ``xmlPI``
  256. ## Raises an assertion in debug mode if ``my.kind`` is not
  257. ## ``xmlPI``. In release mode, this will not trigger an error
  258. ## but the value returned will not be valid.
  259. assert(my.kind == xmlPI)
  260. my.b
  261. proc rawData*(my: var XmlParser): lent string {.inline.} =
  262. ## returns the underlying 'data' string by reference.
  263. ## This is only used for speed hacks.
  264. result = my.a
  265. proc rawData2*(my: var XmlParser): lent string {.inline.} =
  266. ## returns the underlying second 'data' string by reference.
  267. ## This is only used for speed hacks.
  268. result = my.b
  269. proc getColumn*(my: XmlParser): int {.inline.} =
  270. ## get the current column the parser has arrived at.
  271. result = getColNumber(my, my.bufpos)
  272. proc getLine*(my: XmlParser): int {.inline.} =
  273. ## get the current line the parser has arrived at.
  274. result = my.lineNumber
  275. proc getFilename*(my: XmlParser): string {.inline.} =
  276. ## get the filename of the file that the parser processes.
  277. result = my.filename
  278. proc errorMsg*(my: XmlParser): string =
  279. ## returns a helpful error message for the event ``xmlError``
  280. assert(my.kind == xmlError)
  281. result = "$1($2, $3) Error: $4" % [
  282. my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
  283. proc errorMsgExpected*(my: XmlParser, tag: string): string =
  284. ## returns an error message "<tag> expected" in the same format as the
  285. ## other error messages
  286. result = "$1($2, $3) Error: $4" % [
  287. my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
  288. proc errorMsg*(my: XmlParser, msg: string): string =
  289. ## returns an error message with text `msg` in the same format as the
  290. ## other error messages
  291. result = "$1($2, $3) Error: $4" % [
  292. my.filename, $getLine(my), $getColumn(my), msg]
  293. proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
  294. my.err = kind
  295. my.state = stateError
  296. proc parseCDATA(my: var XmlParser) =
  297. var pos = my.bufpos + len("<![CDATA[")
  298. while true:
  299. case my.buf[pos]
  300. of ']':
  301. if my.buf[pos+1] == ']' and my.buf[pos+2] == '>':
  302. inc(pos, 3)
  303. break
  304. add(my.a, ']')
  305. inc(pos)
  306. of '\0':
  307. markError(my, errEndOfCDataExpected)
  308. break
  309. of '\c':
  310. pos = lexbase.handleCR(my, pos)
  311. add(my.a, '\L')
  312. of '\L':
  313. pos = lexbase.handleLF(my, pos)
  314. add(my.a, '\L')
  315. of '/':
  316. pos = lexbase.handleRefillChar(my, pos)
  317. add(my.a, '/')
  318. else:
  319. add(my.a, my.buf[pos])
  320. inc(pos)
  321. my.bufpos = pos # store back
  322. my.kind = xmlCData
  323. proc parseComment(my: var XmlParser) =
  324. var pos = my.bufpos + len("<!--")
  325. while true:
  326. case my.buf[pos]
  327. of '-':
  328. if my.buf[pos+1] == '-' and my.buf[pos+2] == '>':
  329. inc(pos, 3)
  330. break
  331. if my.options.contains(reportComments): add(my.a, '-')
  332. inc(pos)
  333. of '\0':
  334. markError(my, errEndOfCommentExpected)
  335. break
  336. of '\c':
  337. pos = lexbase.handleCR(my, pos)
  338. if my.options.contains(reportComments): add(my.a, '\L')
  339. of '\L':
  340. pos = lexbase.handleLF(my, pos)
  341. if my.options.contains(reportComments): add(my.a, '\L')
  342. of '/':
  343. pos = lexbase.handleRefillChar(my, pos)
  344. if my.options.contains(reportComments): add(my.a, '/')
  345. else:
  346. if my.options.contains(reportComments): add(my.a, my.buf[pos])
  347. inc(pos)
  348. my.bufpos = pos
  349. my.kind = xmlComment
  350. proc parseWhitespace(my: var XmlParser, skip = false) =
  351. var pos = my.bufpos
  352. while true:
  353. case my.buf[pos]
  354. of ' ', '\t':
  355. if not skip: add(my.a, my.buf[pos])
  356. inc(pos)
  357. of '\c':
  358. # the specification says that CR-LF, CR are to be transformed to LF
  359. pos = lexbase.handleCR(my, pos)
  360. if not skip: add(my.a, '\L')
  361. of '\L':
  362. pos = lexbase.handleLF(my, pos)
  363. if not skip: add(my.a, '\L')
  364. else:
  365. break
  366. my.bufpos = pos
  367. const
  368. NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
  369. NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
  370. proc parseName(my: var XmlParser, dest: var string) =
  371. var pos = my.bufpos
  372. if my.buf[pos] in NameStartChar:
  373. while true:
  374. add(dest, my.buf[pos])
  375. inc(pos)
  376. if my.buf[pos] notin NameChar: break
  377. my.bufpos = pos
  378. else:
  379. markError(my, errNameExpected)
  380. proc parseEntity(my: var XmlParser, dest: var string) =
  381. var pos = my.bufpos+1
  382. my.kind = xmlCharData
  383. if my.buf[pos] == '#':
  384. var r: int
  385. inc(pos)
  386. if my.buf[pos] == 'x':
  387. inc(pos)
  388. while true:
  389. case my.buf[pos]
  390. of '0'..'9': r = (r shl 4) or (ord(my.buf[pos]) - ord('0'))
  391. of 'a'..'f': r = (r shl 4) or (ord(my.buf[pos]) - ord('a') + 10)
  392. of 'A'..'F': r = (r shl 4) or (ord(my.buf[pos]) - ord('A') + 10)
  393. else: break
  394. inc(pos)
  395. else:
  396. while my.buf[pos] in {'0'..'9'}:
  397. r = r * 10 + (ord(my.buf[pos]) - ord('0'))
  398. inc(pos)
  399. add(dest, toUTF8(Rune(r)))
  400. elif my.buf[pos] == 'l' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  401. add(dest, '<')
  402. inc(pos, 2)
  403. elif my.buf[pos] == 'g' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  404. add(dest, '>')
  405. inc(pos, 2)
  406. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'm' and my.buf[pos+2] == 'p' and
  407. my.buf[pos+3] == ';':
  408. add(dest, '&')
  409. inc(pos, 3)
  410. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'p' and my.buf[pos+2] == 'o' and
  411. my.buf[pos+3] == 's' and my.buf[pos+4] == ';':
  412. add(dest, '\'')
  413. inc(pos, 4)
  414. elif my.buf[pos] == 'q' and my.buf[pos+1] == 'u' and my.buf[pos+2] == 'o' and
  415. my.buf[pos+3] == 't' and my.buf[pos+4] == ';':
  416. add(dest, '"')
  417. inc(pos, 4)
  418. else:
  419. my.bufpos = pos
  420. var name = ""
  421. parseName(my, name)
  422. pos = my.bufpos
  423. if my.err != errNameExpected and my.buf[pos] == ';':
  424. my.kind = xmlEntity
  425. else:
  426. add(dest, '&')
  427. add(dest, name)
  428. if my.buf[pos] == ';':
  429. inc(pos)
  430. else:
  431. my.err = errSemicolonExpected
  432. # do not overwrite 'my.state' here, it's a benign error
  433. my.bufpos = pos
  434. proc parsePI(my: var XmlParser) =
  435. inc(my.bufpos, "<?".len)
  436. parseName(my, my.a)
  437. var pos = my.bufpos
  438. setLen(my.b, 0)
  439. while true:
  440. case my.buf[pos]
  441. of '\0':
  442. markError(my, errQmGtExpected)
  443. break
  444. of '?':
  445. if my.buf[pos+1] == '>':
  446. inc(pos, 2)
  447. break
  448. add(my.b, '?')
  449. inc(pos)
  450. of '\c':
  451. # the specification says that CR-LF, CR are to be transformed to LF
  452. pos = lexbase.handleCR(my, pos)
  453. add(my.b, '\L')
  454. of '\L':
  455. pos = lexbase.handleLF(my, pos)
  456. add(my.b, '\L')
  457. of '/':
  458. pos = lexbase.handleRefillChar(my, pos)
  459. add(my.b, '/')
  460. else:
  461. add(my.b, my.buf[pos])
  462. inc(pos)
  463. my.bufpos = pos
  464. my.kind = xmlPI
  465. proc parseSpecial(my: var XmlParser) =
  466. # things that start with <!
  467. var pos = my.bufpos + 2
  468. var opentags = 0
  469. while true:
  470. case my.buf[pos]
  471. of '\0':
  472. markError(my, errGtExpected)
  473. break
  474. of '<':
  475. inc(opentags)
  476. inc(pos)
  477. add(my.a, '<')
  478. of '>':
  479. if opentags <= 0:
  480. inc(pos)
  481. break
  482. dec(opentags)
  483. inc(pos)
  484. add(my.a, '>')
  485. of '\c':
  486. pos = lexbase.handleCR(my, pos)
  487. add(my.a, '\L')
  488. of '\L':
  489. pos = lexbase.handleLF(my, pos)
  490. add(my.a, '\L')
  491. of '/':
  492. pos = lexbase.handleRefillChar(my, pos)
  493. add(my.b, '/')
  494. else:
  495. add(my.a, my.buf[pos])
  496. inc(pos)
  497. my.bufpos = pos
  498. my.kind = xmlSpecial
  499. proc parseTag(my: var XmlParser) =
  500. inc(my.bufpos)
  501. parseName(my, my.a)
  502. # if we have no name, do not interpret the '<':
  503. if my.a.len == 0:
  504. my.kind = xmlCharData
  505. add(my.a, '<')
  506. return
  507. parseWhitespace(my, skip = true)
  508. if my.buf[my.bufpos] in NameStartChar:
  509. # an attribute follows:
  510. my.kind = xmlElementOpen
  511. my.state = stateAttr
  512. my.c = my.a # save for later
  513. my.cIsEmpty = false
  514. else:
  515. my.kind = xmlElementStart
  516. let slash = my.buf[my.bufpos] == '/'
  517. if slash:
  518. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  519. if slash and my.buf[my.bufpos] == '>':
  520. inc(my.bufpos)
  521. my.state = stateEmptyElementTag
  522. my.c = ""
  523. my.cIsEmpty = true
  524. elif my.buf[my.bufpos] == '>':
  525. inc(my.bufpos)
  526. else:
  527. markError(my, errGtExpected)
  528. proc parseEndTag(my: var XmlParser) =
  529. my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
  530. #inc(my.bufpos, 2)
  531. parseName(my, my.a)
  532. parseWhitespace(my, skip = true)
  533. if my.buf[my.bufpos] == '>':
  534. inc(my.bufpos)
  535. else:
  536. markError(my, errGtExpected)
  537. my.kind = xmlElementEnd
  538. proc parseAttribute(my: var XmlParser) =
  539. my.kind = xmlAttribute
  540. setLen(my.a, 0)
  541. setLen(my.b, 0)
  542. parseName(my, my.a)
  543. # if we have no name, we have '<tag attr= key %&$$%':
  544. if my.a.len == 0:
  545. markError(my, errGtExpected)
  546. return
  547. let startPos = my.bufpos
  548. parseWhitespace(my, skip = true)
  549. if my.buf[my.bufpos] != '=':
  550. if allowEmptyAttribs notin my.options or
  551. (my.buf[my.bufpos] != '>' and my.bufpos == startPos):
  552. markError(my, errEqExpected)
  553. return
  554. inc(my.bufpos)
  555. parseWhitespace(my, skip = true)
  556. var pos = my.bufpos
  557. if my.buf[pos] in {'\'', '"'}:
  558. var quote = my.buf[pos]
  559. var pendingSpace = false
  560. inc(pos)
  561. while true:
  562. case my.buf[pos]
  563. of '\0':
  564. markError(my, errQuoteExpected)
  565. break
  566. of '&':
  567. if pendingSpace:
  568. add(my.b, ' ')
  569. pendingSpace = false
  570. my.bufpos = pos
  571. parseEntity(my, my.b)
  572. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  573. pos = my.bufpos
  574. of ' ', '\t':
  575. pendingSpace = true
  576. inc(pos)
  577. of '\c':
  578. pos = lexbase.handleCR(my, pos)
  579. pendingSpace = true
  580. of '\L':
  581. pos = lexbase.handleLF(my, pos)
  582. pendingSpace = true
  583. of '/':
  584. pos = lexbase.handleRefillChar(my, pos)
  585. add(my.b, '/')
  586. else:
  587. if my.buf[pos] == quote:
  588. inc(pos)
  589. break
  590. else:
  591. if pendingSpace:
  592. add(my.b, ' ')
  593. pendingSpace = false
  594. add(my.b, my.buf[pos])
  595. inc(pos)
  596. elif allowUnquotedAttribs in my.options:
  597. const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
  598. '\0', '\t', '\L', '\F', '\f'}
  599. let startPos = pos
  600. while (let c = my.buf[pos]; c notin disallowedChars):
  601. if c == '&':
  602. my.bufpos = pos
  603. parseEntity(my, my.b)
  604. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  605. pos = my.bufpos
  606. elif c == '/':
  607. pos = lexbase.handleRefillChar(my, pos)
  608. add(my.b, '/')
  609. else:
  610. add(my.b, c)
  611. inc(pos)
  612. if pos == startPos:
  613. markError(my, errAttributeValueExpected)
  614. else:
  615. markError(my, errQuoteExpected)
  616. # error corrections: guess what was meant
  617. while my.buf[pos] != '>' and my.buf[pos] > ' ':
  618. add(my.b, my.buf[pos])
  619. inc pos
  620. my.bufpos = pos
  621. parseWhitespace(my, skip = true)
  622. proc parseCharData(my: var XmlParser) =
  623. var pos = my.bufpos
  624. while true:
  625. case my.buf[pos]
  626. of '\0', '<', '&': break
  627. of '\c':
  628. # the specification says that CR-LF, CR are to be transformed to LF
  629. pos = lexbase.handleCR(my, pos)
  630. add(my.a, '\L')
  631. of '\L':
  632. pos = lexbase.handleLF(my, pos)
  633. add(my.a, '\L')
  634. of '/':
  635. pos = lexbase.handleRefillChar(my, pos)
  636. add(my.a, '/')
  637. else:
  638. add(my.a, my.buf[pos])
  639. inc(pos)
  640. my.bufpos = pos
  641. my.kind = xmlCharData
  642. proc rawGetTok(my: var XmlParser) =
  643. my.kind = xmlError
  644. setLen(my.a, 0)
  645. var pos = my.bufpos
  646. case my.buf[pos]
  647. of '<':
  648. case my.buf[pos+1]
  649. of '/':
  650. parseEndTag(my)
  651. of '!':
  652. if my.buf[pos+2] == '[' and my.buf[pos+3] == 'C' and
  653. my.buf[pos+4] == 'D' and my.buf[pos+5] == 'A' and
  654. my.buf[pos+6] == 'T' and my.buf[pos+7] == 'A' and
  655. my.buf[pos+8] == '[':
  656. parseCDATA(my)
  657. elif my.buf[pos+2] == '-' and my.buf[pos+3] == '-':
  658. parseComment(my)
  659. else:
  660. parseSpecial(my)
  661. of '?':
  662. parsePI(my)
  663. else:
  664. parseTag(my)
  665. of ' ', '\t', '\c', '\l':
  666. parseWhitespace(my)
  667. my.kind = xmlWhitespace
  668. of '\0':
  669. my.kind = xmlEof
  670. of '&':
  671. parseEntity(my, my.a)
  672. else:
  673. parseCharData(my)
  674. assert my.kind != xmlError
  675. proc getTok(my: var XmlParser) =
  676. while true:
  677. let lastKind = my.kind
  678. rawGetTok(my)
  679. case my.kind
  680. of xmlComment:
  681. if my.options.contains(reportComments): break
  682. of xmlWhitespace:
  683. if my.options.contains(reportWhitespace) or lastKind in {xmlCharData,
  684. xmlComment, xmlEntity}:
  685. break
  686. else: break
  687. proc next*(my: var XmlParser) =
  688. ## retrieves the first/next event. This controls the parser.
  689. case my.state
  690. of stateNormal:
  691. getTok(my)
  692. of stateStart:
  693. my.state = stateNormal
  694. getTok(my)
  695. if my.kind == xmlPI and my.a == "xml":
  696. # just skip the first ``<?xml >`` processing instruction
  697. getTok(my)
  698. of stateAttr:
  699. # parse an attribute key-value pair:
  700. if my.buf[my.bufpos] == '>':
  701. my.kind = xmlElementClose
  702. inc(my.bufpos)
  703. my.state = stateNormal
  704. elif my.buf[my.bufpos] == '/':
  705. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  706. if my.buf[my.bufpos] == '>':
  707. my.kind = xmlElementClose
  708. inc(my.bufpos)
  709. my.state = stateEmptyElementTag
  710. else:
  711. markError(my, errGtExpected)
  712. else:
  713. parseAttribute(my)
  714. # state remains the same
  715. of stateEmptyElementTag:
  716. my.state = stateNormal
  717. my.kind = xmlElementEnd
  718. if not my.cIsEmpty:
  719. my.a = my.c
  720. of stateError:
  721. my.kind = xmlError
  722. my.state = stateNormal
  723. when not defined(testing) and isMainModule:
  724. import os
  725. var s = newFileStream(paramStr(1), fmRead)
  726. if s == nil: quit("cannot open the file" & paramStr(1))
  727. var x: XmlParser
  728. open(x, s, paramStr(1))
  729. while true:
  730. next(x)
  731. case x.kind
  732. of xmlError: echo(x.errorMsg())
  733. of xmlEof: break
  734. of xmlCharData: echo(x.charData)
  735. of xmlWhitespace: echo("|$1|" % x.charData)
  736. of xmlComment: echo("<!-- $1 -->" % x.charData)
  737. of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
  738. of xmlElementStart: echo("<$1>" % x.elementName)
  739. of xmlElementEnd: echo("</$1>" % x.elementName)
  740. of xmlElementOpen: echo("<$1" % x.elementName)
  741. of xmlAttribute:
  742. echo("Key: " & x.attrKey)
  743. echo("Value: " & x.attrValue)
  744. of xmlElementClose: echo(">")
  745. of xmlCData:
  746. echo("<![CDATA[$1]]>" % x.charData)
  747. of xmlEntity:
  748. echo("&$1;" % x.entityName)
  749. of xmlSpecial:
  750. echo("SPECIAL: " & x.charData)
  751. close(x)