htmlrefs.nim 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
# Example program to show the new parsexml module.
# This program reads an HTML file and writes every link it contains to stdout.
# Parse errors are reported and then skipped; whitespace is ignored.
import os, streams, parsexml, strutils
  5. proc `=?=` (a, b: string): bool =
  6. # little trick: define our own comparator that ignores case
  7. return cmpIgnoreCase(a, b) == 0
  8. if paramCount() < 1:
  9. quit("Usage: htmlrefs filename[.html]")
  10. var links = 0 # count the number of links
  11. var filename = addFileExt(paramStr(1), "html")
  12. var s = newFileStream(filename, fmRead)
  13. if s == nil: quit("cannot open the file " & filename)
  14. var x: XmlParser
  15. open(x, s, filename)
  16. next(x) # get first event
  17. block mainLoop:
  18. while true:
  19. case x.kind
  20. of xmlElementOpen:
  21. # the <a href = "xyz"> tag we are interested in always has an attribute,
  22. # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
  23. if x.elementName =?= "a":
  24. x.next()
  25. if x.kind == xmlAttribute:
  26. if x.attrKey =?= "href":
  27. var link = x.attrValue
  28. inc(links)
  29. # skip until we have an ``xmlElementClose`` event
  30. while true:
  31. x.next()
  32. case x.kind
  33. of xmlEof: break mainLoop
  34. of xmlElementClose: break
  35. else: discard
  36. x.next() # skip ``xmlElementClose``
  37. # now we have the description for the ``a`` element
  38. var desc = ""
  39. while x.kind == xmlCharData:
  40. desc.add(x.charData)
  41. x.next()
  42. echo(desc & ": " & link)
  43. else:
  44. x.next()
  45. of xmlEof: break # end of file reached
  46. of xmlError:
  47. echo(errorMsg(x))
  48. x.next()
  49. else: x.next() # skip other events
  50. echo($links & " link(s) found!")
  51. x.close()