thtmlparser.nim 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. discard """
  2. matrix: "--mm:refc; --mm:orc"
  3. targets: "c js"
  4. output: '''
  5. true
  6. https://example.com/test?format=jpg&name=orig##
  7. https://example.com/test?format=jpg&name=orig##text
  8. https://example.com/test?format=jpg##text
  9. '''
  10. """
  11. import htmlparser
  12. import xmltree
  13. import strutils
  14. from streams import newStringStream
  15. import std/assertions
  16. block t2813:
  17. const
  18. html = """
  19. <html>
  20. <head>
  21. <title>Test</title>
  22. </head>
  23. <body>
  24. <table>
  25. <thead>
  26. <tr><td>A</td></tr>
  27. <tr><td>B</td></tr>
  28. </thead>
  29. <tbody>
  30. <tr><td></td>A<td></td></tr>
  31. <tr><td></td>B<td></td></tr>
  32. <tr><td></td>C<td></td></tr>
  33. </tbody>
  34. <tfoot>
  35. <tr><td>A</td></tr>
  36. </tfoot>
  37. </table>
  38. </body>
  39. </html>
  40. """
  41. var errors: seq[string] = @[]
  42. let tree = parseHtml(newStringStream(html), "test.html", errors)
  43. doAssert errors.len == 0 # Errors: </thead> expected,...
  44. var len = tree.findAll("tr").len # len = 6
  45. var rows: seq[XmlNode] = @[]
  46. for n in tree.findAll("table"):
  47. n.findAll("tr", rows) # len = 2
  48. break
  49. doAssert tree.findAll("tr").len == rows.len
  50. block t2814:
  51. ## builds the two cases below and test that
  52. ## ``//[dd,li]`` has "<p>that</p>" as children
  53. ##
  54. ## <dl>
  55. ## <dt>this</dt>
  56. ## <dd>
  57. ## <p>that</p>
  58. ## </dd>
  59. ## </dl>
  60. ##
  61. ## <ul>
  62. ## <li>
  63. ## <p>that</p>
  64. ## </li>
  65. ## </ul>
  66. for ltype in [["dl","dd"], ["ul","li"]]:
  67. let desc_item = if ltype[0]=="dl": "<dt>this</dt>" else: ""
  68. let item = "$1<$2><p>that</p></$2>" % [desc_item, ltype[1]]
  69. let list = """ <$1>
  70. $2
  71. </$1> """ % [ltype[0], item]
  72. var errors : seq[string] = @[]
  73. let parseH = parseHtml(newStringStream(list),"statichtml", errors =errors)
  74. if $parseH.findAll(ltype[1])[0].child("p") != "<p>that</p>":
  75. echo "case " & ltype[0] & " failed !"
  76. quit(2)
  77. echo "true"
  78. block t6154:
  79. let foo = """
  80. <!DOCTYPE html>
  81. <html>
  82. <head>
  83. <title> foobar </title>
  84. </head>
  85. <body>
  86. <p class=foo id=bar></p>
  87. <p something=&#9;foo&#9;bar&#178;></p>
  88. <p something= &#9;foo&#9;bar&#178; foo =bloo></p>
  89. <p class="foo2" id="bar2"></p>
  90. <p wrong= ></p>
  91. <p data-foo data-bar="correct!" enabled ></p>
  92. <p quux whatever></p>
  93. </body>
  94. </html>
  95. """
  96. var errors: seq[string] = @[]
  97. let html = parseHtml(newStringStream(foo), "statichtml", errors=errors)
  98. doAssert "statichtml(11, 18) Error: attribute value expected" in errors
  99. let ps = html.findAll("p")
  100. doAssert ps.len == 7
  101. doAssert ps[0].attrsLen == 2
  102. doAssert ps[0].attr("class") == "foo"
  103. doAssert ps[0].attr("id") == "bar"
  104. doAssert ps[0].len == 0
  105. doAssert ps[1].attrsLen == 1
  106. doAssert ps[1].attr("something") == "\tfoo\tbar²"
  107. doAssert ps[1].len == 0
  108. doAssert ps[2].attrsLen == 2
  109. doAssert ps[2].attr("something") == "\tfoo\tbar²"
  110. doAssert ps[2].attr("foo") == "bloo"
  111. doAssert ps[2].len == 0
  112. doAssert ps[3].attrsLen == 2
  113. doAssert ps[3].attr("class") == "foo2"
  114. doAssert ps[3].attr("id") == "bar2"
  115. doAssert ps[3].len == 0
  116. doAssert ps[4].attrsLen == 1
  117. doAssert ps[4].attr("wrong") == ""
  118. doAssert ps[5].attrsLen == 3
  119. doAssert ps[5].attr("data-foo") == ""
  120. doAssert ps[5].attr("data-bar") == "correct!"
  121. doAssert ps[5].attr("enabled") == ""
  122. doAssert ps[5].len == 0
  123. doAssert ps[6].attrsLen == 2
  124. doAssert ps[6].attr("quux") == ""
  125. doAssert ps[6].attr("whatever") == ""
  126. doAssert ps[6].len == 0
  127. # bug #11713, #1034
  128. var content = """
  129. # with &
  130. <img src="https://example.com/test?format=jpg&name=orig" alt="">
  131. <img src="https://example.com/test?format=jpg&name=orig" alt="text">
  132. # without &
  133. <img src="https://example.com/test?format=jpg" alt="text">
  134. """
  135. var
  136. stream = newStringStream(content)
  137. body = parseHtml(stream)
  138. for y in body.findAll("img"):
  139. echo y.attr("src"), "##", y.attr("alt")