parsexml.nim 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. #
  2. #
  3. # Nim's Runtime Library
  4. # (c) Copyright 2010 Andreas Rumpf
  5. #
  6. # See the file "copying.txt", included in this
  7. # distribution, for details about the copyright.
  8. #
  9. ## This module implements a simple high performance `XML`:idx: / `HTML`:idx:
  10. ## parser.
  11. ## The only encoding that is supported is UTF-8. The parser has been designed
  12. ## to be somewhat error correcting, so that even most "wild HTML" found on the
  13. ## web can be parsed with it. **Note:** This parser does not check that each
  14. ## ``<tag>`` has a corresponding ``</tag>``! These checks have to be
  15. ## implemented by the client code for various reasons:
  16. ##
  17. ## * Old HTML contains tags that have no end tag: ``<br>`` for example.
  18. ## * HTML tags are case insensitive, XML tags are case sensitive. Since this
  19. ## library can parse both, only the client knows which comparison is to be
  20. ## used.
  21. ## * Thus the checks would have been very difficult to implement properly with
  22. ## little benefit, especially since they are simple to implement in the
  23. ## client. The client should use the `errorMsgExpected` proc to generate
  24. ## a nice error message that fits the other error messages this library
  25. ## creates.
  26. ##
  27. ##
  28. ##[
  29. Example 1: Retrieve HTML title
  30. ==============================
  31. The file ``examples/htmltitle.nim`` demonstrates how to use the
  32. XML parser to accomplish a simple task: To determine the title of an HTML
  33. document.
  34. .. code-block:: nim
  35. # Example program to show the parsexml module
  36. # This program reads an HTML file and writes its title to stdout.
  37. # Errors and whitespace are ignored.
  38. import os, streams, parsexml, strutils
  39. if paramCount() < 1:
  40. quit("Usage: htmltitle filename[.html]")
  41. var filename = addFileExt(paramStr(1), "html")
  42. var s = newFileStream(filename, fmRead)
  43. if s == nil: quit("cannot open the file " & filename)
  44. var x: XmlParser
  45. open(x, s, filename)
  46. while true:
  47. x.next()
  48. case x.kind
  49. of xmlElementStart:
  50. if cmpIgnoreCase(x.elementName, "title") == 0:
  51. var title = ""
  52. x.next() # skip "<title>"
  53. while x.kind == xmlCharData:
  54. title.add(x.charData)
  55. x.next()
  56. if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
  57. echo("Title: " & title)
  58. quit(0) # Success!
  59. else:
  60. echo(x.errorMsgExpected("/title"))
  61. of xmlEof: break # end of file reached
  62. else: discard # ignore other events
  63. x.close()
  64. quit("Could not determine title!")
  65. ]##
  66. ##[
  67. Example 2: Retrieve all HTML links
  68. ==================================
  69. The file ``examples/htmlrefs.nim`` demonstrates how to use the
  70. XML parser to accomplish another simple task: To determine all the links
  71. an HTML document contains.
  72. .. code-block:: nim
  73. # Example program to show the new parsexml module
  74. # This program reads an HTML file and writes all its used links to stdout.
  75. # Errors and whitespace are ignored.
  76. import os, streams, parsexml, strutils
  77. proc `=?=` (a, b: string): bool =
  78. # little trick: define our own comparator that ignores case
  79. return cmpIgnoreCase(a, b) == 0
  80. if paramCount() < 1:
  81. quit("Usage: htmlrefs filename[.html]")
  82. var links = 0 # count the number of links
  83. var filename = addFileExt(paramStr(1), "html")
  84. var s = newFileStream(filename, fmRead)
  85. if s == nil: quit("cannot open the file " & filename)
  86. var x: XmlParser
  87. open(x, s, filename)
  88. next(x) # get first event
  89. block mainLoop:
  90. while true:
  91. case x.kind
  92. of xmlElementOpen:
  93. # the <a href = "xyz"> tag we are interested in always has an attribute,
  94. # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
  95. if x.elementName =?= "a":
  96. x.next()
  97. if x.kind == xmlAttribute:
  98. if x.attrKey =?= "href":
  99. var link = x.attrValue
  100. inc(links)
  101. # skip until we have an ``xmlElementClose`` event
  102. while true:
  103. x.next()
  104. case x.kind
  105. of xmlEof: break mainLoop
  106. of xmlElementClose: break
  107. else: discard
  108. x.next() # skip ``xmlElementClose``
  109. # now we have the description for the ``a`` element
  110. var desc = ""
  111. while x.kind == xmlCharData:
  112. desc.add(x.charData)
  113. x.next()
  114. echo(desc & ": " & link)
  115. else:
  116. x.next()
  117. of xmlEof: break # end of file reached
  118. of xmlError:
  119. echo(errorMsg(x))
  120. x.next()
  121. else: x.next() # skip other events
  122. echo($links & " link(s) found!")
  123. x.close()
  124. ]##
  125. import
  126. hashes, strutils, lexbase, streams, unicode
  127. # the parser treats ``<br />`` as ``<br></br>``
  128. # xmlElementCloseEnd, ## ``/>``
  type
    XmlEventKind* = enum ## enumeration of all events that may occur when parsing
      xmlError,          ## an error occurred during parsing
      xmlEof,            ## end of file reached
      xmlCharData,       ## character data
      xmlWhitespace,     ## whitespace has been parsed
      xmlComment,        ## a comment has been parsed
      xmlPI,             ## processing instruction (``<?name something ?>``)
      xmlElementStart,   ## ``<elem>``
      xmlElementEnd,     ## ``</elem>``
      xmlElementOpen,    ## ``<elem
      xmlAttribute,      ## ``key = "value"`` pair
      xmlElementClose,   ## ``>``
      xmlCData,          ## ``<![CDATA[`` ... data ... ``]]>``
      xmlEntity,         ## &entity;
      xmlSpecial         ## ``<! ... data ... >``

    XmlErrorKind* = enum        ## enumeration that lists all errors that can occur
      errNone,                  ## no error
      errEndOfCDataExpected,    ## ``]]>`` expected
      errNameExpected,          ## name expected
      errSemicolonExpected,     ## ``;`` expected
      errQmGtExpected,          ## ``?>`` expected
      errGtExpected,            ## ``>`` expected
      errEqExpected,            ## ``=`` expected
      errQuoteExpected,         ## ``"`` or ``'`` expected
      errEndOfCommentExpected,  ## ``-->`` expected
      errAttributeValueExpected ## non-empty attribute value expected

    ParserState = enum
      # Internal state machine driven by ``next``.
      stateStart,           # nothing parsed yet
      stateNormal,          # ordinary tokenizing via ``getTok``
      stateAttr,            # inside an opened tag, reading attributes
      stateEmptyElementTag, # ``/>`` was seen; an end event is still owed
      stateError            # an error was marked; report it on the next call

    XmlParseOption* = enum  ## options for the XML parser
      reportWhitespace,     ## report whitespace
      reportComments,       ## report comments
      allowUnquotedAttribs, ## allow unquoted attribute values (for HTML)
      allowEmptyAttribs     ## allow empty attributes (without explicit value)

    XmlParser* = object of BaseLexer ## the parser object.
      a: string         # primary text of the current event (char data, names,
                        # attribute key, PI name)
      b: string         # secondary text (attribute value, rest of a PI)
      c: string         # element name remembered by ``parseTag`` so the end
                        # event of an empty-element tag can report it
      kind: XmlEventKind
      err: XmlErrorKind
      state: ParserState
      cIsEmpty: bool    # true when ``c`` holds no remembered element name
      filename: string  # only used when formatting error messages
      options: set[XmlParseOption]
  const
    # Human readable text for every ``XmlErrorKind``; consumed by ``errorMsg``.
    errorMessages: array[XmlErrorKind, string] = [
      errNone: "no error",
      errEndOfCDataExpected: "']]>' expected",
      errNameExpected: "name expected",
      errSemicolonExpected: "';' expected",
      errQmGtExpected: "'?>' expected",
      errGtExpected: "'>' expected",
      errEqExpected: "'=' expected",
      errQuoteExpected: "'\"' or \"'\" expected",
      errEndOfCommentExpected: "'-->' expected",
      errAttributeValueExpected: "attribute value expected"
    ]
  proc open*(my: var XmlParser, input: Stream, filename: string,
             options: set[XmlParseOption] = {}) =
    ## Initializes the parser `my` so that it reads from `input`.
    ##
    ## `filename` is stored purely for building error messages.
    ## `options` tunes the parser: with ``reportWhitespace`` whitespace tokens
    ## are delivered as ``xmlWhitespace`` events, with ``reportComments``
    ## comments are delivered as ``xmlComment`` events.
    # CR, LF and '/' are the characters for which the parsing procs call
    # lexbase's handleCR / handleLF / handleRefillChar.
    lexbase.open(my, input, 8192, {'\c', '\L', '/'})
    my.options = options
    my.filename = filename
    my.kind = xmlError
    my.state = stateStart
    my.a = ""
    my.b = ""
    my.c = ""
    my.cIsEmpty = true
  proc close*(my: var XmlParser) {.inline.} =
    ## Closes the parser `my` together with the input stream it reads from.
    my.close()  # overload resolution picks BaseLexer's close for lexbase.close
  proc kind*(my: XmlParser): XmlEventKind {.inline.} =
    ## Returns the type of the event the XML parser is currently positioned on.
    result = my.kind
  template charData*(my: XmlParser): string =
    ## Yields the character data of the current event. Valid for
    ## ``xmlCharData``, ``xmlWhitespace``, ``xmlComment``, ``xmlCData`` and
    ## ``xmlSpecial``.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind in {xmlCharData, xmlWhitespace, xmlComment, xmlCData,
                       xmlSpecial}
    my.a
  template elementName*(my: XmlParser): string =
    ## Yields the element name of the current event. Valid for
    ## ``xmlElementStart``, ``xmlElementEnd`` and ``xmlElementOpen``.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind in {xmlElementStart, xmlElementEnd, xmlElementOpen}
    my.a
  template entityName*(my: XmlParser): string =
    ## Yields the entity name of an ``xmlEntity`` event.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind == xmlEntity
    my.a
  template attrKey*(my: XmlParser): string =
    ## Yields the key of an ``xmlAttribute`` event.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind == xmlAttribute
    my.a
  template attrValue*(my: XmlParser): string =
    ## Yields the value of an ``xmlAttribute`` event.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind == xmlAttribute
    my.b
  template piName*(my: XmlParser): string =
    ## Yields the name of the processing instruction of an ``xmlPI`` event.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind == xmlPI
    my.a
  template piRest*(my: XmlParser): string =
    ## Yields everything after the name of the processing instruction of an
    ## ``xmlPI`` event.
    ## In debug mode an assertion fires for any other event kind; in release
    ## mode no error is raised but the returned value is not meaningful.
    assert my.kind == xmlPI
    my.b
  proc rawData*(my: XmlParser): string {.inline.} =
    ## Hands out the parser's primary data string via ``shallowCopy``, i.e.
    ## without a deep copy. Intended for speed hacks only.
    result.shallowCopy(my.a)
  proc rawData2*(my: XmlParser): string {.inline.} =
    ## Hands out the parser's second data string via ``shallowCopy``, i.e.
    ## without a deep copy. Intended for speed hacks only.
    result.shallowCopy(my.b)
  proc getColumn*(my: XmlParser): int {.inline.} =
    ## Returns the column the parser has currently arrived at.
    my.getColNumber(my.bufpos)
  proc getLine*(my: XmlParser): int {.inline.} =
    ## Returns the line the parser has currently arrived at.
    my.lineNumber
  proc getFilename*(my: XmlParser): string {.inline.} =
    ## Returns the filename that was passed to ``open``.
    my.filename
  proc errorMsg*(my: XmlParser): string =
    ## Builds a readable error message for an ``xmlError`` event in the form
    ## ``filename(line, column) Error: text``.
    assert my.kind == xmlError
    let
      line = $getLine(my)
      column = $getColumn(my)
      text = errorMessages[my.err]
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, text]
  proc errorMsgExpected*(my: XmlParser, tag: string): string =
    ## Builds the message "<tag> expected", laid out like the parser's own
    ## error messages (``filename(line, column) Error: ...``).
    let
      line = $getLine(my)
      column = $getColumn(my)
      text = "<$1> expected" % tag
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, text]
  proc errorMsg*(my: XmlParser, msg: string): string =
    ## Wraps the caller supplied text `msg` in the same
    ## ``filename(line, column) Error: ...`` layout the parser's own error
    ## messages use.
    let
      line = $getLine(my)
      column = $getColumn(my)
    result = "$1($2, $3) Error: $4" % [my.filename, line, column, msg]
  proc markError(my: var XmlParser, kind: XmlErrorKind) {.inline.} =
    # Records `kind` as the pending error and switches the state machine to
    # ``stateError`` so that the following ``next`` call reports ``xmlError``.
    my.state = stateError
    my.err = kind
  proc parseCDATA(my: var XmlParser) =
    # Consumes a ``<![CDATA[ ... ]]>`` section and collects its content in
    # ``my.a``. CR and CR-LF are stored as a single LF. Reaching the end of
    # input first marks ``errEndOfCDataExpected``.
    var cur = my.bufpos + len("<![CDATA[")
    while true:
      let ch = my.buf[cur]
      case ch
      of '\0':
        markError(my, errEndOfCDataExpected)
        break
      of ']':
        if my.buf[cur+1] == ']' and my.buf[cur+2] == '>':
          inc(cur, 3)  # step over the terminator
          break
        my.a.add(']')
        inc cur
      of '\c':
        cur = lexbase.handleCR(my, cur)
        my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        my.a.add('\L')
      of '/':
        cur = lexbase.handleRefillChar(my, cur)
        my.a.add('/')
      else:
        my.a.add(ch)
        inc cur
    my.bufpos = cur  # store back
    my.kind = xmlCData
  proc parseComment(my: var XmlParser) =
    # Consumes a ``<!-- ... -->`` comment. Its text is only collected in
    # ``my.a`` when ``reportComments`` is set; otherwise it is just skipped.
    # Reaching the end of input first marks ``errEndOfCommentExpected``.
    let keepText = reportComments in my.options
    var cur = my.bufpos + len("<!--")
    while true:
      let ch = my.buf[cur]
      case ch
      of '\0':
        markError(my, errEndOfCommentExpected)
        break
      of '-':
        if my.buf[cur+1] == '-' and my.buf[cur+2] == '>':
          inc(cur, 3)  # step over the terminator
          break
        if keepText: my.a.add('-')
        inc cur
      of '\c':
        cur = lexbase.handleCR(my, cur)
        if keepText: my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        if keepText: my.a.add('\L')
      of '/':
        cur = lexbase.handleRefillChar(my, cur)
        if keepText: my.a.add('/')
      else:
        if keepText: my.a.add(ch)
        inc cur
    my.bufpos = cur
    my.kind = xmlComment
  proc parseWhitespace(my: var XmlParser, skip=false) =
    # Advances over blanks, tabs and line ends. Unless `skip` is true the
    # whitespace is appended to ``my.a``; every line end is stored as LF.
    let collect = not skip
    var cur = my.bufpos
    while true:
      let ch = my.buf[cur]
      case ch
      of ' ', '\t':
        if collect: my.a.add(ch)
        inc cur
      of '\c':
        # the specification says that CR-LF, CR are to be transformed to LF
        cur = lexbase.handleCR(my, cur)
        if collect: my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        if collect: my.a.add('\L')
      else:
        break
    my.bufpos = cur
  const
    # Characters that may begin a name; every byte >= 128 is accepted.
    NameStartChar = {'A'..'Z', 'a'..'z', '_', ':', '\128'..'\255'}
    # Characters that may continue a name: the start characters plus digits,
    # '.' and '-'.
    NameChar = NameStartChar + {'0'..'9', '.', '-'}
  proc parseName(my: var XmlParser, dest: var string) =
    # Appends the name found at the current position to `dest` and moves the
    # position behind it. If no name starts here, ``errNameExpected`` is
    # marked and neither `dest` nor the position is touched.
    var cur = my.bufpos
    if my.buf[cur] notin NameStartChar:
      markError(my, errNameExpected)
      return
    dest.add(my.buf[cur])
    inc cur
    while my.buf[cur] in NameChar:
      dest.add(my.buf[cur])
      inc cur
    my.bufpos = cur
  proc parseEntity(my: var XmlParser, dest: var string) =
    # Handles the ``&...;`` construct at the current position.
    #
    # * ``&#N;`` / ``&#xH;`` append the UTF-8 encoding of the code point.
    # * ``&lt;`` ``&gt;`` ``&amp;`` ``&apos;`` ``&quot;`` append the character
    #   they stand for.
    # * Any other name is appended verbatim to `dest` and reported as
    #   ``xmlEntity``; if no name follows, a literal '&' is appended.
    #
    # In every case but the last one ``my.kind`` ends up as ``xmlCharData``.
    # A missing ';' marks ``errSemicolonExpected``.
    const predefined = [("lt", '<'), ("gt", '>'), ("amp", '&'),
                        ("apos", '\''), ("quot", '"')]
    var cur = my.bufpos + 1  # step over the '&'
    my.kind = xmlCharData
    if my.buf[cur] == '#':
      var codepoint: int
      inc cur
      if my.buf[cur] == 'x':
        inc cur
        while true:
          let ch = my.buf[cur]
          case ch
          of '0'..'9': codepoint = (codepoint shl 4) or (ord(ch) - ord('0'))
          of 'a'..'f': codepoint = (codepoint shl 4) or (ord(ch) - ord('a') + 10)
          of 'A'..'F': codepoint = (codepoint shl 4) or (ord(ch) - ord('A') + 10)
          else: break
          inc cur
      else:
        while my.buf[cur] in {'0'..'9'}:
          codepoint = codepoint * 10 + (ord(my.buf[cur]) - ord('0'))
          inc cur
      dest.add(toUTF8(Rune(codepoint)))
    else:
      var recognized = false
      for entry in predefined:
        let name = entry[0]
        var i = 0
        while i < name.len and my.buf[cur+i] == name[i]: inc i
        if i == name.len and my.buf[cur+i] == ';':
          dest.add(entry[1])
          inc(cur, i)  # leaves `cur` on the ';'
          recognized = true
          break
      if not recognized:
        my.bufpos = cur
        parseName(my, dest)
        cur = my.bufpos
        if my.err != errNameExpected:
          my.kind = xmlEntity
        else:
          dest.add('&')
    if my.buf[cur] == ';':
      inc cur
    else:
      markError(my, errSemicolonExpected)
    my.bufpos = cur
  proc parsePI(my: var XmlParser) =
    # Consumes a processing instruction ``<?name rest?>``. The name goes to
    # ``my.a``, everything up to the closing ``?>`` to ``my.b`` (line ends
    # stored as LF). Reaching the end of input first marks
    # ``errQmGtExpected``.
    inc(my.bufpos, "<?".len)
    parseName(my, my.a)
    my.b.setLen(0)
    var cur = my.bufpos
    while true:
      let ch = my.buf[cur]
      case ch
      of '\0':
        markError(my, errQmGtExpected)
        break
      of '?':
        if my.buf[cur+1] == '>':
          inc(cur, 2)  # step over the terminator
          break
        my.b.add('?')
        inc cur
      of '\c':
        # the specification says that CR-LF, CR are to be transformed to LF
        cur = lexbase.handleCR(my, cur)
        my.b.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        my.b.add('\L')
      of '/':
        cur = lexbase.handleRefillChar(my, cur)
        my.b.add('/')
      else:
        my.b.add(ch)
        inc cur
    my.bufpos = cur
    my.kind = xmlPI
  proc parseSpecial(my: var XmlParser) =
    # Consumes a ``<! ... >`` construct (anything starting with ``<!`` that
    # is neither a comment nor a CDATA section) and collects its content in
    # ``my.a``. Nested ``<`` / ``>`` pairs are tracked so that only the
    # matching ``>`` terminates the construct. Line ends are stored as LF.
    # Reaching the end of input first marks ``errGtExpected``.
    var pos = my.bufpos + 2  # step over "<!"
    var opentags = 0
    while true:
      case my.buf[pos]
      of '\0':
        markError(my, errGtExpected)
        break
      of '<':
        inc(opentags)
        inc(pos)
        add(my.a, '<')
      of '>':
        if opentags <= 0:
          inc(pos)
          break
        dec(opentags)
        inc(pos)
        add(my.a, '>')
      of '\c':
        pos = lexbase.handleCR(my, pos)
        add(my.a, '\L')
      of '\L':
        pos = lexbase.handleLF(my, pos)
        add(my.a, '\L')
      of '/':
        pos = lexbase.handleRefillChar(my, pos)
        # The slash belongs to the content like every other character. It
        # used to be appended to ``my.b``, which dropped it from ``charData``
        # (e.g. the URL of a DOCTYPE) and polluted the second buffer.
        add(my.a, '/')
      else:
        add(my.a, my.buf[pos])
        inc(pos)
    my.bufpos = pos
    my.kind = xmlSpecial
  proc parseTag(my: var XmlParser) =
    # Handles ``<name``. Results:
    # * no name after '<'   -> the '<' is reported as character data
    # * an attribute follows -> ``xmlElementOpen`` and the parser switches to
    #   ``stateAttr``; the name is remembered in ``my.c``
    # * otherwise           -> ``xmlElementStart``; ``/>`` additionally
    #   schedules the end event via ``stateEmptyElementTag``, a missing '>'
    #   marks ``errGtExpected``
    inc my.bufpos  # step over '<'
    parseName(my, my.a)
    if my.a.len == 0:
      # if we have no name, do not interpret the '<':
      my.kind = xmlCharData
      my.a.add('<')
      return
    parseWhitespace(my, skip = true)
    if my.buf[my.bufpos] in NameStartChar:
      # an attribute follows:
      my.kind = xmlElementOpen
      my.state = stateAttr
      my.c = my.a  # save for later
      my.cIsEmpty = false
      return
    my.kind = xmlElementStart
    if my.buf[my.bufpos] == '/':
      my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
      if my.buf[my.bufpos] == '>':
        inc my.bufpos
        my.state = stateEmptyElementTag
        my.c = ""
        my.cIsEmpty = true
        return
    if my.buf[my.bufpos] == '>':
      inc my.bufpos
    else:
      markError(my, errGtExpected)
  proc parseEndTag(my: var XmlParser) =
    # Handles ``</name>``: the name goes to ``my.a`` and the event becomes
    # ``xmlElementEnd``. A missing '>' marks ``errGtExpected``.
    let slashPos = my.bufpos + 1  # the '/' right after '<'
    my.bufpos = lexbase.handleRefillChar(my, slashPos)
    parseName(my, my.a)
    parseWhitespace(my, skip = true)
    if my.buf[my.bufpos] != '>':
      markError(my, errGtExpected)
    else:
      inc my.bufpos
    my.kind = xmlElementEnd
  proc parseAttribute(my: var XmlParser) =
    # Parses one ``key = value`` pair inside an opened tag. The key goes to
    # ``my.a``, the value to ``my.b``.
    #
    # * Quoted values: entities are resolved; runs of blanks, tabs and line
    #   ends are collapsed to a single space, and such a run directly in
    #   front of the closing quote is dropped.
    # * Unquoted values are only accepted with ``allowUnquotedAttribs``.
    # * Without '=' the attribute ends after the key; this is an error unless
    #   ``allowEmptyAttribs`` is set and the key is followed by whitespace
    #   or '>'.
    my.kind = xmlAttribute
    my.a.setLen(0)
    my.b.setLen(0)
    parseName(my, my.a)
    # if we have no name, we have '<tag attr= key %&$$%':
    if my.a.len == 0:
      markError(my, errGtExpected)
      return

    let afterKey = my.bufpos
    parseWhitespace(my, skip = true)
    if my.buf[my.bufpos] != '=':
      let gluedToJunk = my.buf[my.bufpos] != '>' and my.bufpos == afterKey
      if allowEmptyAttribs notin my.options or gluedToJunk:
        markError(my, errEqExpected)
      return

    inc my.bufpos  # step over '='
    parseWhitespace(my, skip = true)

    var cur = my.bufpos
    if my.buf[cur] in {'\'', '"'}:
      let quote = my.buf[cur]
      var spacePending = false
      inc cur
      while true:
        let ch = my.buf[cur]
        case ch
        of '\0':
          markError(my, errQuoteExpected)
          break
        of '&':
          if spacePending:
            my.b.add(' ')
            spacePending = false
          my.bufpos = cur
          parseEntity(my, my.b)
          my.kind = xmlAttribute  # parseEntity overwrites my.kind!
          cur = my.bufpos
        of ' ', '\t':
          spacePending = true
          inc cur
        of '\c':
          cur = lexbase.handleCR(my, cur)
          spacePending = true
        of '\L':
          cur = lexbase.handleLF(my, cur)
          spacePending = true
        of '/':
          cur = lexbase.handleRefillChar(my, cur)
          my.b.add('/')
        else:
          if ch == quote:
            inc cur
            break
          if spacePending:
            my.b.add(' ')
            spacePending = false
          my.b.add(ch)
          inc cur
    elif allowUnquotedAttribs in my.options:
      const disallowedChars = {'"', '\'', '`', '=', '<', '>', ' ',
                               '\0', '\t', '\L', '\F', '\f'}
      let valueStart = cur
      while true:
        let c = my.buf[cur]
        if c in disallowedChars: break
        if c == '&':
          my.bufpos = cur
          parseEntity(my, my.b)
          my.kind = xmlAttribute  # parseEntity overwrites my.kind!
          cur = my.bufpos
        else:
          my.b.add(c)
          inc cur
      if cur == valueStart:
        markError(my, errAttributeValueExpected)
    else:
      markError(my, errQuoteExpected)
      # error corrections: guess what was meant
      while my.buf[cur] != '>' and my.buf[cur] > ' ':
        my.b.add(my.buf[cur])
        inc cur
    my.bufpos = cur
    parseWhitespace(my, skip = true)
  proc parseCharData(my: var XmlParser) =
    # Collects plain character data in ``my.a`` up to (not including) the
    # next '<', '&' or the end of input. Line ends are stored as LF.
    var cur = my.bufpos
    while true:
      let ch = my.buf[cur]
      case ch
      of '\0', '<', '&':
        break
      of '\c':
        # the specification says that CR-LF, CR are to be transformed to LF
        cur = lexbase.handleCR(my, cur)
        my.a.add('\L')
      of '\L':
        cur = lexbase.handleLF(my, cur)
        my.a.add('\L')
      of '/':
        cur = lexbase.handleRefillChar(my, cur)
        my.a.add('/')
      else:
        my.a.add(ch)
        inc cur
    my.bufpos = cur
    my.kind = xmlCharData
  proc rawGetTok(my: var XmlParser) =
    # Reads exactly one token, dispatching on the first one or two characters
    # at the current position. ``my.a`` is cleared beforehand; every branch
    # is expected to replace the preliminary ``xmlError`` kind.
    my.kind = xmlError
    my.a.setLen(0)
    let start = my.bufpos
    case my.buf[start]
    of '\0':
      my.kind = xmlEof
    of '&':
      parseEntity(my, my.a)
    of ' ', '\t', '\c', '\l':
      parseWhitespace(my)
      my.kind = xmlWhitespace
    of '<':
      case my.buf[start+1]
      of '/':
        parseEndTag(my)
      of '?':
        parsePI(my)
      of '!':
        let isCData =
          my.buf[start+2] == '[' and my.buf[start+3] == 'C' and
          my.buf[start+4] == 'D' and my.buf[start+5] == 'A' and
          my.buf[start+6] == 'T' and my.buf[start+7] == 'A' and
          my.buf[start+8] == '['
        if isCData:
          parseCDATA(my)
        elif my.buf[start+2] == '-' and my.buf[start+3] == '-':
          parseComment(my)
        else:
          parseSpecial(my)
      else:
        parseTag(my)
    else:
      parseCharData(my)
    assert my.kind != xmlError
  proc getTok(my: var XmlParser) =
    # Reads tokens until one is found that has to be reported:
    # * comments only with ``reportComments``
    # * whitespace only with ``reportWhitespace`` or when the kind in effect
    #   before the read was character data, a comment or an entity
    # * everything else always
    while true:
      let previous = my.kind
      rawGetTok(my)
      let suppressed =
        case my.kind
        of xmlComment:
          reportComments notin my.options
        of xmlWhitespace:
          reportWhitespace notin my.options and
            previous notin {xmlCharData, xmlComment, xmlEntity}
        else:
          false
      if not suppressed: break
  proc next*(my: var XmlParser) =
    ## Advances to the first/next event; calling this repeatedly drives the
    ## parser. Inspect the result with ``kind`` and the accessor templates.
    case my.state
    of stateStart:
      my.state = stateNormal
      getTok(my)
      # just skip the first ``<?xml >`` processing instruction
      if my.kind == xmlPI and my.a == "xml":
        getTok(my)
    of stateNormal:
      getTok(my)
    of stateError:
      # deliver the error recorded by ``markError``, then resume normally
      my.kind = xmlError
      my.state = stateNormal
    of stateEmptyElementTag:
      # ``<elem/>`` is reported like ``<elem></elem>``: emit the end event,
      # using the element name remembered in ``my.c`` when there is one
      my.state = stateNormal
      my.kind = xmlElementEnd
      if not my.cIsEmpty:
        my.a = my.c
    of stateAttr:
      # parse an attribute key-value pair, or the end of the opened tag:
      case my.buf[my.bufpos]
      of '>':
        my.kind = xmlElementClose
        inc my.bufpos
        my.state = stateNormal
      of '/':
        my.bufpos = lexbase.handleRefillChar(my, my.bufpos)
        if my.buf[my.bufpos] != '>':
          markError(my, errGtExpected)
        else:
          my.kind = xmlElementClose
          inc my.bufpos
          my.state = stateEmptyElementTag
      else:
        parseAttribute(my)
        # state remains the same
  when not defined(testing) and isMainModule:
    # Small driver: parses the file named by the first command line argument
    # and echoes every event the parser produces.
    import os
    # paramStr(1) is read below, so bail out early when it is missing
    if paramCount() < 1:
      quit("Usage: parsexml filename")
    var s = newFileStream(paramStr(1), fmRead)
    # the separating space was missing, gluing the filename to the message
    if s == nil: quit("cannot open the file " & paramStr(1))
    var x: XmlParser
    open(x, s, paramStr(1))
    while true:
      next(x)
      case x.kind
      of xmlError: echo(x.errorMsg())
      of xmlEof: break
      of xmlCharData: echo(x.charData)
      of xmlWhitespace: echo("|$1|" % x.charData)
      of xmlComment: echo("<!-- $1 -->" % x.charData)
      of xmlPI: echo("<? $1 ## $2 ?>" % [x.piName, x.piRest])
      of xmlElementStart: echo("<$1>" % x.elementName)
      of xmlElementEnd: echo("</$1>" % x.elementName)
      of xmlElementOpen: echo("<$1" % x.elementName)
      of xmlAttribute:
        echo("Key: " & x.attrKey)
        echo("Value: " & x.attrValue)
      of xmlElementClose: echo(">")
      of xmlCData:
        echo("<![CDATA[$1]]>" % x.charData)
      of xmlEntity:
        echo("&$1;" % x.entityName)
      of xmlSpecial:
        echo("SPECIAL: " & x.charData)
    close(x)