# thtmlparser.nim
# Test specification block: lists the targets ("c js") and the exact text the
# program is expected to print (presumably consumed by the test runner --
# confirm against the runner's documentation before editing).
discard """
  targets: "c js"
  output: '''
true
https://example.com/test?format=jpg&name=orig##
https://example.com/test?format=jpg&name=orig##text
https://example.com/test?format=jpg##text
'''
"""
import htmlparser
import xmltree
import strutils
from streams import newStringStream
  14. block t2813:
  15. const
  16. html = """
  17. <html>
  18. <head>
  19. <title>Test</title>
  20. </head>
  21. <body>
  22. <table>
  23. <thead>
  24. <tr><td>A</td></tr>
  25. <tr><td>B</td></tr>
  26. </thead>
  27. <tbody>
  28. <tr><td></td>A<td></td></tr>
  29. <tr><td></td>B<td></td></tr>
  30. <tr><td></td>C<td></td></tr>
  31. </tbody>
  32. <tfoot>
  33. <tr><td>A</td></tr>
  34. </tfoot>
  35. </table>
  36. </body>
  37. </html>
  38. """
  39. var errors: seq[string] = @[]
  40. let tree = parseHtml(newStringStream(html), "test.html", errors)
  41. doAssert errors.len == 0 # Errors: </thead> expected,...
  42. var len = tree.findAll("tr").len # len = 6
  43. var rows: seq[XmlNode] = @[]
  44. for n in tree.findAll("table"):
  45. n.findAll("tr", rows) # len = 2
  46. break
  47. doAssert tree.findAll("tr").len == rows.len
  48. block t2814:
  49. ## builds the two cases below and test that
  50. ## ``//[dd,li]`` has "<p>that</p>" as children
  51. ##
  52. ## <dl>
  53. ## <dt>this</dt>
  54. ## <dd>
  55. ## <p>that</p>
  56. ## </dd>
  57. ## </dl>
  58. ##
  59. ## <ul>
  60. ## <li>
  61. ## <p>that</p>
  62. ## </li>
  63. ## </ul>
  64. for ltype in [["dl","dd"], ["ul","li"]]:
  65. let desc_item = if ltype[0]=="dl": "<dt>this</dt>" else: ""
  66. let item = "$1<$2><p>that</p></$2>" % [desc_item, ltype[1]]
  67. let list = """ <$1>
  68. $2
  69. </$1> """ % [ltype[0], item]
  70. var errors : seq[string] = @[]
  71. let parseH = parseHtml(newStringStream(list),"statichtml", errors =errors)
  72. if $parseH.findAll(ltype[1])[0].child("p") != "<p>that</p>":
  73. echo "case " & ltype[0] & " failed !"
  74. quit(2)
  75. echo "true"
  76. block t6154:
  77. let foo = """
  78. <!DOCTYPE html>
  79. <html>
  80. <head>
  81. <title> foobar </title>
  82. </head>
  83. <body>
  84. <p class=foo id=bar></p>
  85. <p something=&#9;foo&#9;bar&#178;></p>
  86. <p something= &#9;foo&#9;bar&#178; foo =bloo></p>
  87. <p class="foo2" id="bar2"></p>
  88. <p wrong= ></p>
  89. <p data-foo data-bar="correct!" enabled ></p>
  90. <p quux whatever></p>
  91. </body>
  92. </html>
  93. """
  94. var errors: seq[string] = @[]
  95. let html = parseHtml(newStringStream(foo), "statichtml", errors=errors)
  96. doAssert "statichtml(11, 18) Error: attribute value expected" in errors
  97. let ps = html.findAll("p")
  98. doAssert ps.len == 7
  99. doAssert ps[0].attrsLen == 2
  100. doAssert ps[0].attr("class") == "foo"
  101. doAssert ps[0].attr("id") == "bar"
  102. doAssert ps[0].len == 0
  103. doAssert ps[1].attrsLen == 1
  104. doAssert ps[1].attr("something") == "\tfoo\tbar²"
  105. doAssert ps[1].len == 0
  106. doAssert ps[2].attrsLen == 2
  107. doAssert ps[2].attr("something") == "\tfoo\tbar²"
  108. doAssert ps[2].attr("foo") == "bloo"
  109. doAssert ps[2].len == 0
  110. doAssert ps[3].attrsLen == 2
  111. doAssert ps[3].attr("class") == "foo2"
  112. doAssert ps[3].attr("id") == "bar2"
  113. doAssert ps[3].len == 0
  114. doAssert ps[4].attrsLen == 1
  115. doAssert ps[4].attr("wrong") == ""
  116. doAssert ps[5].attrsLen == 3
  117. doAssert ps[5].attr("data-foo") == ""
  118. doAssert ps[5].attr("data-bar") == "correct!"
  119. doAssert ps[5].attr("enabled") == ""
  120. doAssert ps[5].len == 0
  121. doAssert ps[6].attrsLen == 2
  122. doAssert ps[6].attr("quux") == ""
  123. doAssert ps[6].attr("whatever") == ""
  124. doAssert ps[6].len == 0
  125. # bug #11713, #1034
  126. var content = """
  127. # with &
  128. <img src="https://example.com/test?format=jpg&name=orig" alt="">
  129. <img src="https://example.com/test?format=jpg&name=orig" alt="text">
  130. # without &
  131. <img src="https://example.com/test?format=jpg" alt="text">
  132. """
  133. var
  134. stream = newStringStream(content)
  135. body = parseHtml(stream)
  136. for y in body.findAll("img"):
  137. echo y.attr("src"), "##", y.attr("alt")