extract_toc.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. # -*- coding: utf-8 -*-
  2. """
  3. Extract Table of Content
  4. ========================
  5. A Pelican plugin to extract table of contents (ToC) from `article.content` and
  6. place it in its own `article.toc` variable for use in templates.
  7. """
  8. from os import path
  9. from bs4 import BeautifulSoup
  10. from pelican import signals, readers, contents
  11. import logging
  12. logger = logging.getLogger(__name__)
  13. def extract_toc(content):
  14. if isinstance(content, contents.Static):
  15. return
  16. soup = BeautifulSoup(content._content, 'html.parser')
  17. filename = content.source_path
  18. extension = path.splitext(filename)[1][1:]
  19. toc = None
  20. # default Markdown reader
  21. if not toc and readers.MarkdownReader.enabled and extension in readers.MarkdownReader.file_extensions:
  22. toc = soup.find('div', class_='toc')
  23. if toc:
  24. toc.extract()
  25. if len(toc.find_next('ul').find_all('li')) == 0:
  26. toc = None
  27. # default reStructuredText reader
  28. if not toc and readers.RstReader.enabled and extension in readers.RstReader.file_extensions:
  29. toc = soup.find('div', class_='contents topic')
  30. if toc:
  31. toc.extract()
  32. tag = BeautifulSoup(str(toc), 'html.parser')
  33. tag.div['class'] = 'toc'
  34. tag.div['id'] = ''
  35. p = tag.find('p', class_='topic-title first')
  36. if p:
  37. p.extract()
  38. toc = tag
  39. # Pandoc reader (markdown and other formats)
  40. if 'pandoc_reader' in content.settings['PLUGINS']:
  41. try:
  42. from pandoc_reader import PandocReader
  43. except ImportError:
  44. PandocReader = False
  45. if not toc and PandocReader and PandocReader.enabled and extension in PandocReader.file_extensions:
  46. toc = soup.find('nav', id='TOC')
  47. if toc:
  48. toc.extract()
  49. content._content = soup.decode()
  50. content.toc = toc.decode()
  51. if content.toc.startswith('<html>'):
  52. content.toc = content.toc[12:-14]
  53. def register():
  54. signals.content_object_init.connect(extract_toc)