parsexml.nim 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2010 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
  10. ## parser.
  11. ## The only encoding that is supported is UTF-8. The parser has been designed
  12. ## to be somewhat error correcting, so that even most "wild HTML" found on the
  13. ## web can be parsed with it. **Note:** This parser does not check that each
  14. ## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
  15. ## implemented by the client code for various reasons:
  16. ##
  17. ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
  18. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
  19. ## library can parse both, only the client knows which comparison is to be
  20. ## used.
  21. ## * Thus the checks would have been very difficult to implement properly with
  22. ## little benefit, especially since they are simple to implement in the
  23. ## client. The client should use the `errorMsgExpected` proc to generate
  24. ## a nice error message that fits the other error messages this library
  25. ## creates.
  26. ##
  27. ##
  28. ##[
  29. Example 1: Retrieve HTML title
  30. ==============================
  31. The file ``examples/htmltitle.nim`` demonstrates how to use the
  32. XML parser to accomplish a simple task: To determine the title of an HTML
  33. document.
  34. .. code-block:: nim
  35. # Example program to show the parsexml module
  36. # This program reads an HTML file and writes its title to stdout.
  37. # Errors and whitespace are ignored.
  38. import os, streams, parsexml, strutils
  39. if paramCount() < 1:
  40. quit("Usage: htmltitle filename[.html]")
  41. var filename = addFileExt(paramStr(1), "html")
  42. var s = newFileStream(filename, fmRead)
  43. if s == nil: quit("cannot open the file " & filename)
  44. var x: XmlParser
  45. open(x, s, filename)
  46. while true:
  47. x.next()
  48. case x.kind
  49. of xmlElementStart:
  50. if cmpIgnoreCase(x.elementName, "title") == 0:
  51. var title = ""
  52. x.next() # skip "<title>"
  53. while x.kind == xmlCharData:
  54. title.add(x.charData)
  55. x.next()
  56. if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
  57. echo("Title: " & title)
  58. quit(0) # Success!
  59. else:
  60. echo(x.errorMsgExpected("/title"))
  61. of xmlEof: break # end of file reached
  62. else: discard # ignore other events
  63. x.close()
  64. quit("Could not determine title!")
  65. ]##
  66. ##[
  67. Example 2: Retrieve all HTML links
  68. ==================================
  69. The file ``examples/htmlrefs.nim`` demonstrates how to use the
  70. XML parser to accomplish another simple task: To determine all the links
  71. an HTML document contains.
  72. .. code-block:: nim
  73. # Example program to show the new parsexml module
  74. # This program reads an HTML file and writes all its used links to stdout.
  75. # Errors and whitespace are ignored.
  76. import os, streams, parsexml, strutils
  77. proc `=?=` (a, b: string): bool =
  78. # little trick: define our own comparator that ignores case
  79. return cmpIgnoreCase(a, b) == 0
  80. if paramCount() < 1:
  81. quit("Usage: htmlrefs filename[.html]")
  82. var links = 0 # count the number of links
  83. var filename = addFileExt(paramStr(1), "html")
  84. var s = newFileStream(filename, fmRead)
  85. if s == nil: quit("cannot open the file " & filename)
  86. var x: XmlParser
  87. open(x, s, filename)
  88. next(x) # get first event
  89. block mainLoop:
  90. while true:
  91. case x.kind
  92. of xmlElementOpen:
  93. # the <a href = "xyz"> tag we are interested in always has an attribute,
  94. # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
  95. if x.elementName =?= "a":
  96. x.next()
  97. if x.kind == xmlAttribute:
  98. if x.attrKey =?= "href":
  99. var link = x.attrValue
  100. inc(links)
  101. # skip until we have an ``xmlElementClose`` event
  102. while true:
  103. x.next()
  104. case x.kind
  105. of xmlEof: break mainLoop
  106. of xmlElementClose: break
  107. else: discard
  108. x.next() # skip ``xmlElementClose``
  109. # now we have the description for the ``a`` element
  110. var desc = ""
  111. while x.kind == xmlCharData:
  112. desc.add(x.charData)
  113. x.next()
  114. echo(desc & ": " & link)
  115. else:
  116. x.next()
  117. of xmlEof: break # end of file reached
  118. of xmlError:
  119. echo(errorMsg(x))
  120. x.next()
  121. else: x.next() # skip other events
  122. echo($links & " link(s) found!")
  123. x.close()
  124. ]##
  125. import
  126. strutils, lexbase, streams, unicode
  127. # the parser treats ``<br />`` as ``<br></br>``
  128. # xmlElementCloseEnd, ## ``/>``
  129. type
  130. XmlEventKind* = enum ## enumeration of all events that may occur when parsing
  131. xmlError, ## an error occurred during parsing
  132. xmlEof, ## end of file reached
  133. xmlCharData, ## character data
  134. xmlWhitespace, ## whitespace has been parsed
  135. xmlComment, ## a comment has been parsed
  136. xmlPI, ## processing instruction (``<?name something ?>``)
  137. xmlElementStart, ## ``<elem>``
  138. xmlElementEnd, ## ``</elem>``
  139. xmlElementOpen, ## ``<elem
  140. xmlAttribute, ## ``key = "value"`` pair
  141. xmlElementClose, ## ``>``
  142. xmlCData, ## ``<![CDATA[`` ... data ... ``]]>``
  143. xmlEntity, ## &entity;
  144. xmlSpecial ## ``<! ... data ... >``
  145. XmlErrorKind* = enum ## enumeration that lists all errors that can occur
  146. errNone, ## no error
  147. errEndOfCDataExpected, ## ``]]>`` expected
  148. errNameExpected, ## name expected
  149. errSemicolonExpected, ## ``;`` expected
  150. errQmGtExpected, ## ``?>`` expected
  151. errGtExpected, ## ``>`` expected
  152. errEqExpected, ## ``=`` expected
  153. errQuoteExpected, ## ``"`` or ``'`` expected
  154. errEndOfCommentExpected ## ``-->`` expected
  155. errAttributeValueExpected ## non-empty attribute value expected
  156. ParserState = enum
  157. stateStart, stateNormal, stateAttr, stateEmptyElementTag, stateError
  158. XmlParseOption* = enum ## options for the XML parser
  159. reportWhitespace, ## report whitespace
  160. reportComments ## report comments
  161. allowUnquotedAttribs ## allow unquoted attribute values (for HTML)
  162. allowEmptyAttribs ## allow empty attributes (without explicit value)
  163. XmlParser* = object of BaseLexer ## the parser object.
  164. a, b, c: string
  165. kind: XmlEventKind
  166. err: XmlErrorKind
  167. state: ParserState
  168. cIsEmpty: bool
  169. filename: string
  170. options: set[XmlParseOption]
  171. const
  172. errorMessages: array[XmlErrorKind, string] = [
  173. "no error",
  174. "']]>' expected",
  175. "name expected",
  176. "';' expected",
  177. "'?>' expected",
  178. "'>' expected",
  179. "'=' expected",
  180. "'\"' or \"'\" expected",
  181. "'-->' expected",
  182. "attribute value expected"
  183. ]
  184. proc open*(my: var XmlParser, input: Stream, filename: string,
  185. options: set[XmlParseOption] = {}) =
  186. ## initializes the parser with an input stream. `Filename` is only used
  187. ## for nice error messages. The parser's behaviour can be controlled by
  188. ## the `options` parameter: If `options` contains ``reportWhitespace``
  189. ## a whitespace token is reported as an ``xmlWhitespace`` event.
  190. ## If `options` contains ``reportComments`` a comment token is reported as an
  191. ## ``xmlComment`` event.
  192. lexbase.open(my, input, 8192, {'\c', '\L', '/'})
  193. my.filename = filename
  194. my.state = stateStart
  195. my.kind = xmlError
  196. my.a = ""
  197. my.b = ""
  198. my.c = ""
  199. my.cIsEmpty = true
  200. my.options = options
  201. proc close*(my: var XmlParser) {.inline.} =
  202. ## closes the parser `my` and its associated input stream.
  203. lexbase.close(my)
  204. proc kind*(my: XmlParser): XmlEventKind {.inline.} =
  205. ## returns the current event type for the XML parser
  206. return my.kind
  207. template charData*(my: XmlParser): string =
  208. ## returns the character data for the events: ``xmlCharData``,
  209. ## ``xmlWhitespace``, ``xmlComment``, ``xmlCData``, ``xmlSpecial``
  210. ## Raises an assertion in debug mode if ``my.kind`` is not one
  211. ## of those events. In release mode, this will not trigger an error
  212. ## but the value returned will not be valid.
  213. assert(my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
  214. xmlSpecial})
  215. my.a
  216. template elementName*(my: XmlParser): string =
  217. ## returns the element name for the events: ``xmlElementStart``,
  218. ## ``xmlElementEnd``, ``xmlElementOpen``
  219. ## Raises an assertion in debug mode if ``my.kind`` is not one
  220. ## of those events. In release mode, this will not trigger an error
  221. ## but the value returned will not be valid.
  222. assert(my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen})
  223. my.a
  224. template entityName*(my: XmlParser): string =
  225. ## returns the entity name for the event: ``xmlEntity``
  226. ## Raises an assertion in debug mode if ``my.kind`` is not
  227. ## ``xmlEntity``. In release mode, this will not trigger an error
  228. ## but the value returned will not be valid.
  229. assert(my.kind == xmlEntity)
  230. my.a
  231. template attrKey*(my: XmlParser): string =
  232. ## returns the attribute key for the event ``xmlAttribute``
  233. ## Raises an assertion in debug mode if ``my.kind`` is not
  234. ## ``xmlAttribute``. In release mode, this will not trigger an error
  235. ## but the value returned will not be valid.
  236. assert(my.kind == xmlAttribute)
  237. my.a
  238. template attrValue*(my: XmlParser): string =
  239. ## returns the attribute value for the event ``xmlAttribute``
  240. ## Raises an assertion in debug mode if ``my.kind`` is not
  241. ## ``xmlAttribute``. In release mode, this will not trigger an error
  242. ## but the value returned will not be valid.
  243. assert(my.kind == xmlAttribute)
  244. my.b
  245. template piName*(my: XmlParser): string =
  246. ## returns the processing instruction name for the event ``xmlPI``
  247. ## Raises an assertion in debug mode if ``my.kind`` is not
  248. ## ``xmlPI``. In release mode, this will not trigger an error
  249. ## but the value returned will not be valid.
  250. assert(my.kind == xmlPI)
  251. my.a
  252. template piRest*(my: XmlParser): string =
  253. ## returns the rest of the processing instruction for the event ``xmlPI``
  254. ## Raises an assertion in debug mode if ``my.kind`` is not
  255. ## ``xmlPI``. In release mode, this will not trigger an error
  256. ## but the value returned will not be valid.
  257. assert(my.kind == xmlPI)
  258. my.b
  259. proc rawData*(my: XmlParser): string {.inline.} =
  260. ## returns the underlying 'data' string by reference.
  261. ## This is only used for speed hacks.
  262. shallowCopy(result, my.a)
  263. proc rawData2*(my: XmlParser): string {.inline.} =
  264. ## returns the underlying second 'data' string by reference.
  265. ## This is only used for speed hacks.
  266. shallowCopy(result, my.b)
  267. proc getColumn*(my: XmlParser): int {.inline.} =
  268. ## get the current column the parser has arrived at.
  269. result = getColNumber(my, my.bufpos)
  270. proc getLine*(my: XmlParser): int {.inline.} =
  271. ## get the current line the parser has arrived at.
  272. result = my.lineNumber
  273. proc getFilename*(my: XmlParser): string {.inline.} =
  274. ## get the filename of the file that the parser processes.
  275. result = my.filename
  276. proc errorMsg*(my: XmlParser): string =
  277. ## returns a helpful error message for the event ``xmlError``
  278. assert(my.kind == xmlError)
  279. result = "$1($2, $3) Error: $4" % [
  280. my.filename, $getLine(my), $getColumn(my), errorMessages[my.err]]
  281. proc errorMsgExpected*(my: XmlParser, tag: string): string =
  282. ## returns an error message "<tag> expected" in the same format as the
  283. ## other error messages
  284. result = "$1($2, $3) Error: $4" % [
  285. my.filename, $getLine(my), $getColumn(my), "<$1> expected" % tag]
  286. proc errorMsg*(my: XmlParser, msg: string): string =
  287. ## returns an error message with text `msg` in the same format as the
  288. ## other error messages
  289. result = "$1($2, $3) Error: $4" % [
  290. my.filename, $getLine(my), $getColumn(my), msg]
  291. proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
  292. my.err = kind
  293. my.state = stateError
  294. proc parseCDATA(my: var XmlParser) =
  295. var pos = my.bufpos + len("<![CDATA[")
  296. while true:
  297. case my.buf[pos]
  298. of ']':
  299. if my.buf[pos+1] == ']' and my.buf[pos+2] == '>':
  300. inc(pos, 3)
  301. break
  302. add(my.a, ']')
  303. inc(pos)
  304. of '\0':
  305. markError(my, errEndOfCDataExpected)
  306. break
  307. of '\c':
  308. pos = lexbase.handleCR(my, pos)
  309. add(my.a, '\L')
  310. of '\L':
  311. pos = lexbase.handleLF(my, pos)
  312. add(my.a, '\L')
  313. of '/':
  314. pos = lexbase.handleRefillChar(my, pos)
  315. add(my.a, '/')
  316. else:
  317. add(my.a, my.buf[pos])
  318. inc(pos)
  319. my.bufpos = pos # store back
  320. my.kind = xmlCData
  321. proc parseComment(my: var XmlParser) =
  322. var pos = my.bufpos + len("<!--")
  323. while true:
  324. case my.buf[pos]
  325. of '-':
  326. if my.buf[pos+1] == '-' and my.buf[pos+2] == '>':
  327. inc(pos, 3)
  328. break
  329. if my.options.contains(reportComments): add(my.a, '-')
  330. inc(pos)
  331. of '\0':
  332. markError(my, errEndOfCommentExpected)
  333. break
  334. of '\c':
  335. pos = lexbase.handleCR(my, pos)
  336. if my.options.contains(reportComments): add(my.a, '\L')
  337. of '\L':
  338. pos = lexbase.handleLF(my, pos)
  339. if my.options.contains(reportComments): add(my.a, '\L')
  340. of '/':
  341. pos = lexbase.handleRefillChar(my, pos)
  342. if my.options.contains(reportComments): add(my.a, '/')
  343. else:
  344. if my.options.contains(reportComments): add(my.a, my.buf[pos])
  345. inc(pos)
  346. my.bufpos = pos
  347. my.kind = xmlComment
  348. proc parseWhitespace(my: var XmlParser, skip = false) =
  349. var pos = my.bufpos
  350. while true:
  351. case my.buf[pos]
  352. of ' ', '\t':
  353. if not skip: add(my.a, my.buf[pos])
  354. inc(pos)
  355. of '\c':
  356. # the specification says that CR-LF, CR are to be transformed to LF
  357. pos = lexbase.handleCR(my, pos)
  358. if not skip: add(my.a, '\L')
  359. of '\L':
  360. pos = lexbase.handleLF(my, pos)
  361. if not skip: add(my.a, '\L')
  362. else:
  363. break
  364. my.bufpos = pos
  365. const
  366. NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
  367. NameChar = {'A'..'Z', 'a'..'z', '0'..'9', '.', '-', '_', ':', '\128'..'\255'}
  368. proc parseName(my: var XmlParser, dest: var string) =
  369. var pos = my.bufpos
  370. if my.buf[pos] in NameStartChar:
  371. while true:
  372. add(dest, my.buf[pos])
  373. inc(pos)
  374. if my.buf[pos] notin NameChar: break
  375. my.bufpos = pos
  376. else:
  377. markError(my, errNameExpected)
  378. proc parseEntity(my: var XmlParser, dest: var string) =
  379. var pos = my.bufpos+1
  380. my.kind = xmlCharData
  381. if my.buf[pos] == '#':
  382. var r: int
  383. inc(pos)
  384. if my.buf[pos] == 'x':
  385. inc(pos)
  386. while true:
  387. case my.buf[pos]
  388. of '0'..'9': r = (r shl 4) or (ord(my.buf[pos]) - ord('0'))
  389. of 'a'..'f': r = (r shl 4) or (ord(my.buf[pos]) - ord('a') + 10)
  390. of 'A'..'F': r = (r shl 4) or (ord(my.buf[pos]) - ord('A') + 10)
  391. else: break
  392. inc(pos)
  393. else:
  394. while my.buf[pos] in {'0'..'9'}:
  395. r = r * 10 + (ord(my.buf[pos]) - ord('0'))
  396. inc(pos)
  397. add(dest, toUTF8(Rune(r)))
  398. elif my.buf[pos] == 'l' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  399. add(dest, '<')
  400. inc(pos, 2)
  401. elif my.buf[pos] == 'g' and my.buf[pos+1] == 't' and my.buf[pos+2] == ';':
  402. add(dest, '>')
  403. inc(pos, 2)
  404. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'm' and my.buf[pos+2] == 'p' and
  405. my.buf[pos+3] == ';':
  406. add(dest, '&')
  407. inc(pos, 3)
  408. elif my.buf[pos] == 'a' and my.buf[pos+1] == 'p' and my.buf[pos+2] == 'o' and
  409. my.buf[pos+3] == 's' and my.buf[pos+4] == ';':
  410. add(dest, '\'')
  411. inc(pos, 4)
  412. elif my.buf[pos] == 'q' and my.buf[pos+1] == 'u' and my.buf[pos+2] == 'o' and
  413. my.buf[pos+3] == 't' and my.buf[pos+4] == ';':
  414. add(dest, '"')
  415. inc(pos, 4)
  416. else:
  417. my.bufpos = pos
  418. var name = ""
  419. parseName(my, name)
  420. pos = my.bufpos
  421. if my.err != errNameExpected and my.buf[pos] == ';':
  422. my.kind = xmlEntity
  423. else:
  424. add(dest, '&')
  425. add(dest, name)
  426. if my.buf[pos] == ';':
  427. inc(pos)
  428. else:
  429. my.err = errSemicolonExpected
  430. # do not overwrite 'my.state' here, it's a benign error
  431. my.bufpos = pos
  432. proc parsePI(my: var XmlParser) =
  433. inc(my.bufpos, "<?".len)
  434. parseName(my, my.a)
  435. var pos = my.bufpos
  436. setLen(my.b, 0)
  437. while true:
  438. case my.buf[pos]
  439. of '\0':
  440. markError(my, errQmGtExpected)
  441. break
  442. of '?':
  443. if my.buf[pos+1] == '>':
  444. inc(pos, 2)
  445. break
  446. add(my.b, '?')
  447. inc(pos)
  448. of '\c':
  449. # the specification says that CR-LF, CR are to be transformed to LF
  450. pos = lexbase.handleCR(my, pos)
  451. add(my.b, '\L')
  452. of '\L':
  453. pos = lexbase.handleLF(my, pos)
  454. add(my.b, '\L')
  455. of '/':
  456. pos = lexbase.handleRefillChar(my, pos)
  457. add(my.b, '/')
  458. else:
  459. add(my.b, my.buf[pos])
  460. inc(pos)
  461. my.bufpos = pos
  462. my.kind = xmlPI
  463. proc parseSpecial(my: var XmlParser) =
  464. # things that start with <!
  465. var pos = my.bufpos + 2
  466. var opentags = 0
  467. while true:
  468. case my.buf[pos]
  469. of '\0':
  470. markError(my, errGtExpected)
  471. break
  472. of '<':
  473. inc(opentags)
  474. inc(pos)
  475. add(my.a, '<')
  476. of '>':
  477. if opentags <= 0:
  478. inc(pos)
  479. break
  480. dec(opentags)
  481. inc(pos)
  482. add(my.a, '>')
  483. of '\c':
  484. pos = lexbase.handleCR(my, pos)
  485. add(my.a, '\L')
  486. of '\L':
  487. pos = lexbase.handleLF(my, pos)
  488. add(my.a, '\L')
  489. of '/':
  490. pos = lexbase.handleRefillChar(my, pos)
  491. add(my.b, '/')
  492. else:
  493. add(my.a, my.buf[pos])
  494. inc(pos)
  495. my.bufpos = pos
  496. my.kind = xmlSpecial
  497. proc parseTag(my: var XmlParser) =
  498. inc(my.bufpos)
  499. parseName(my, my.a)
  500. # if we have no name, do not interpret the '<':
  501. if my.a.len == 0:
  502. my.kind = xmlCharData
  503. add(my.a, '<')
  504. return
  505. parseWhitespace(my, skip = true)
  506. if my.buf[my.bufpos] in NameStartChar:
  507. # an attribute follows:
  508. my.kind = xmlElementOpen
  509. my.state = stateAttr
  510. my.c = my.a # save for later
  511. my.cIsEmpty = false
  512. else:
  513. my.kind = xmlElementStart
  514. let slash = my.buf[my.bufpos] == '/'
  515. if slash:
  516. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  517. if slash and my.buf[my.bufpos] == '>':
  518. inc(my.bufpos)
  519. my.state = stateEmptyElementTag
  520. my.c = ""
  521. my.cIsEmpty = true
  522. elif my.buf[my.bufpos] == '>':
  523. inc(my.bufpos)
  524. else:
  525. markError(my, errGtExpected)
  526. proc parseEndTag(my: var XmlParser) =
  527. my.bufpos = lexbase.handleRefillChar(my, my.bufpos+1)
  528. #inc(my.bufpos, 2)
  529. parseName(my, my.a)
  530. parseWhitespace(my, skip = true)
  531. if my.buf[my.bufpos] == '>':
  532. inc(my.bufpos)
  533. else:
  534. markError(my, errGtExpected)
  535. my.kind = xmlElementEnd
  536. proc parseAttribute(my: var XmlParser) =
  537. my.kind = xmlAttribute
  538. setLen(my.a, 0)
  539. setLen(my.b, 0)
  540. parseName(my, my.a)
  541. # if we have no name, we have '<tag attr= key %&$$%':
  542. if my.a.len == 0:
  543. markError(my, errGtExpected)
  544. return
  545. let startPos = my.bufpos
  546. parseWhitespace(my, skip = true)
  547. if my.buf[my.bufpos] != '=':
  548. if allowEmptyAttribs notin my.options or
  549. (my.buf[my.bufpos] != '>' and my.bufpos == startPos):
  550. markError(my, errEqExpected)
  551. return
  552. inc(my.bufpos)
  553. parseWhitespace(my, skip = true)
  554. var pos = my.bufpos
  555. if my.buf[pos] in {'\'', '"'}:
  556. var quote = my.buf[pos]
  557. var pendingSpace = false
  558. inc(pos)
  559. while true:
  560. case my.buf[pos]
  561. of '\0':
  562. markError(my, errQuoteExpected)
  563. break
  564. of '&':
  565. if pendingSpace:
  566. add(my.b, ' ')
  567. pendingSpace = false
  568. my.bufpos = pos
  569. parseEntity(my, my.b)
  570. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  571. pos = my.bufpos
  572. of ' ', '\t':
  573. pendingSpace = true
  574. inc(pos)
  575. of '\c':
  576. pos = lexbase.handleCR(my, pos)
  577. pendingSpace = true
  578. of '\L':
  579. pos = lexbase.handleLF(my, pos)
  580. pendingSpace = true
  581. of '/':
  582. pos = lexbase.handleRefillChar(my, pos)
  583. add(my.b, '/')
  584. else:
  585. if my.buf[pos] == quote:
  586. inc(pos)
  587. break
  588. else:
  589. if pendingSpace:
  590. add(my.b, ' ')
  591. pendingSpace = false
  592. add(my.b, my.buf[pos])
  593. inc(pos)
  594. elif allowUnquotedAttribs in my.options:
  595. const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
  596. '\0', '\t', '\L', '\F', '\f'}
  597. let startPos = pos
  598. while (let c = my.buf[pos]; c notin disallowedChars):
  599. if c == '&':
  600. my.bufpos = pos
  601. parseEntity(my, my.b)
  602. my.kind = xmlAttribute # parseEntity overwrites my.kind!
  603. pos = my.bufpos
  604. elif c == '/':
  605. pos = lexbase.handleRefillChar(my, pos)
  606. add(my.b, '/')
  607. else:
  608. add(my.b, c)
  609. inc(pos)
  610. if pos == startPos:
  611. markError(my, errAttributeValueExpected)
  612. else:
  613. markError(my, errQuoteExpected)
  614. # error corrections: guess what was meant
  615. while my.buf[pos] != '>' and my.buf[pos] > ' ':
  616. add(my.b, my.buf[pos])
  617. inc pos
  618. my.bufpos = pos
  619. parseWhitespace(my, skip = true)
  620. proc parseCharData(my: var XmlParser) =
  621. var pos = my.bufpos
  622. while true:
  623. case my.buf[pos]
  624. of '\0', '<', '&': break
  625. of '\c':
  626. # the specification says that CR-LF, CR are to be transformed to LF
  627. pos = lexbase.handleCR(my, pos)
  628. add(my.a, '\L')
  629. of '\L':
  630. pos = lexbase.handleLF(my, pos)
  631. add(my.a, '\L')
  632. of '/':
  633. pos = lexbase.handleRefillChar(my, pos)
  634. add(my.a, '/')
  635. else:
  636. add(my.a, my.buf[pos])
  637. inc(pos)
  638. my.bufpos = pos
  639. my.kind = xmlCharData
  640. proc rawGetTok(my: var XmlParser) =
  641. my.kind = xmlError
  642. setLen(my.a, 0)
  643. var pos = my.bufpos
  644. case my.buf[pos]
  645. of '<':
  646. case my.buf[pos+1]
  647. of '/':
  648. parseEndTag(my)
  649. of '!':
  650. if my.buf[pos+2] == '[' and my.buf[pos+3] == 'C' and
  651. my.buf[pos+4] == 'D' and my.buf[pos+5] == 'A' and
  652. my.buf[pos+6] == 'T' and my.buf[pos+7] == 'A' and
  653. my.buf[pos+8] == '[':
  654. parseCDATA(my)
  655. elif my.buf[pos+2] == '-' and my.buf[pos+3] == '-':
  656. parseComment(my)
  657. else:
  658. parseSpecial(my)
  659. of '?':
  660. parsePI(my)
  661. else:
  662. parseTag(my)
  663. of ' ', '\t', '\c', '\l':
  664. parseWhitespace(my)
  665. my.kind = xmlWhitespace
  666. of '\0':
  667. my.kind = xmlEof
  668. of '&':
  669. parseEntity(my, my.a)
  670. else:
  671. parseCharData(my)
  672. assert my.kind != xmlError
  673. proc getTok(my: var XmlParser) =
  674. while true:
  675. let lastKind = my.kind
  676. rawGetTok(my)
  677. case my.kind
  678. of xmlComment:
  679. if my.options.contains(reportComments): break
  680. of xmlWhitespace:
  681. if my.options.contains(reportWhitespace) or lastKind in {xmlCharData,
  682. xmlComment, xmlEntity}:
  683. break
  684. else: break
  685. proc next*(my: var XmlParser) =
  686. ## retrieves the first/next event. This controls the parser.
  687. case my.state
  688. of stateNormal:
  689. getTok(my)
  690. of stateStart:
  691. my.state = stateNormal
  692. getTok(my)
  693. if my.kind == xmlPI and my.a == "xml":
  694. # just skip the first ``<?xml >`` processing instruction
  695. getTok(my)
  696. of stateAttr:
  697. # parse an attribute key-value pair:
  698. if my.buf[my.bufpos] == '>':
  699. my.kind = xmlElementClose
  700. inc(my.bufpos)
  701. my.state = stateNormal
  702. elif my.buf[my.bufpos] == '/':
  703. my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
  704. if my.buf[my.bufpos] == '>':
  705. my.kind = xmlElementClose
  706. inc(my.bufpos)
  707. my.state = stateEmptyElementTag
  708. else:
  709. markError(my, errGtExpected)
  710. else:
  711. parseAttribute(my)
  712. # state remains the same
  713. of stateEmptyElementTag:
  714. my.state = stateNormal
  715. my.kind = xmlElementEnd
  716. if not my.cIsEmpty:
  717. my.a = my.c
  718. of stateError:
  719. my.kind = xmlError
  720. my.state = stateNormal
  721. when not defined(testing) and isMainModule:
  722. import os
  723. var s = newFileStream(paramStr(1), fmRead)
  724. if s == nil: quit("cannot open the file" & paramStr(1))
  725. var x: XmlParser
  726. open(x, s, paramStr(1))
  727. while true:
  728. next(x)
  729. case x.kind
  730. of xmlError: echo(x.errorMsg())
  731. of xmlEof: break
  732. of xmlCharData: echo(x.charData)
  733. of xmlWhitespace: echo("|$1|" % x.charData)
  734. of xmlComment: echo("<!-- $1 -->" % x.charData)
  735. of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
  736. of xmlElementStart: echo("<$1>" % x.elementName)
  737. of xmlElementEnd: echo("</$1>" % x.elementName)
  738. of xmlElementOpen: echo("<$1" % x.elementName)
  739. of xmlAttribute:
  740. echo("Key: " & x.attrKey)
  741. echo("Value: " & x.attrValue)
  742. of xmlElementClose: echo(">")
  743. of xmlCData:
  744. echo("<![CDATA[$1]]>" % x.charData)
  745. of xmlEntity:
  746. echo("&$1;" % x.entityName)
  747. of xmlSpecial:
  748. echo("SPECIAL: " & x.charData)
  749. close(x)