# tipue_search.py (6.3 KB)
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tipue Search
  4. ============
  5. A Pelican plugin to serialize generated HTML to JSON
  6. that can be used by jQuery plugin - Tipue Search.
  7. Copyright (c) Talha Mansoor
  8. """
  9. from __future__ import unicode_literals
  10. import os.path
  11. import json
  12. import re
  13. from bs4 import BeautifulSoup
  14. from codecs import open
  15. try:
  16. from urlparse import urljoin
  17. except ImportError:
  18. from urllib.parse import urljoin
  19. from pelican import signals
  20. class Tipue_Search_JSON_Generator(object):
  21. def __init__(self, context, settings, path, theme, output_path, *null):
  22. self.output_path = output_path
  23. self.context = context
  24. self.siteurl = settings.get('SITEURL')
  25. self.relative_urls = settings.get('RELATIVE_URLS')
  26. self.tpages = settings.get('TEMPLATE_PAGES')
  27. self.tstatic = settings.get('THEME_STATIC_DIR')
  28. self.output_path = output_path
  29. self.json_nodes = []
  30. def normalize(self, s):
  31. replacements = (
  32. ("á", "a"),
  33. ("é", "e"),
  34. ("í", "i"),
  35. ("ó", "o"),
  36. ("ú", "u"),
  37. (".", ""),
  38. )
  39. s = s.lower()
  40. for a, b in replacements:
  41. s = s.replace(a, b).replace(a.lower(), b.lower())
  42. s = re.sub(r"([a-z]) ([a-z])", r"\1-\2", s, 0,
  43. re.IGNORECASE | re.DOTALL)
  44. return s
  45. def create_json_node(self, article):
  46. if getattr(article, 'status', 'published') != 'published':
  47. return
  48. soup_title = BeautifulSoup(
  49. article.title.replace(' ', ' '), 'html.parser')
  50. video_title = soup_title.get_text(' ', strip=True).replace(
  51. '“', '"').replace(
  52. '”', '"').replace(
  53. '’', "'").replace('^', '^')
  54. # description
  55. art_desc = BeautifulSoup(article.content, 'html.parser')
  56. # fix ignore <h1> inside <figure> description
  57. try:
  58. art_desc = art_desc.find('figure').find_all_next('p')
  59. art_desc_html = ''.join(map(str, art_desc))
  60. art_desc = BeautifulSoup(art_desc_html, 'html.parser')
  61. video_desc_html = art_desc_html.replace('\n', '&#32;')
  62. except:
  63. video_desc_html = ''.join(
  64. map(str, art_desc)).replace('\n', '&#32;')
  65. pass
  66. video_desc_text = art_desc.get_text(' ', strip=True).replace(
  67. '“', '"').replace(
  68. '”', '"').replace(
  69. '’', "'").replace(
  70. '¶', ' ').replace('^', '&#94;')
  71. video_desc_text = ' '.join(video_desc_text.split())
  72. # base url
  73. if self.relative_urls:
  74. base_url = '.'
  75. else:
  76. base_url = self.siteurl
  77. # videoid
  78. video_id = str(article.videoid) if getattr(
  79. article, 'videoid', 'None') != 'None' else ''
  80. # thumbnail
  81. video_image = article.image if getattr(
  82. article, 'image', 'None') != 'None' else ''
  83. url_image = "%s/%s/../wp-content/uploads/article/thumbnail/%s" % (
  84. base_url, self.tstatic, video_image
  85. )
  86. # publish
  87. video_publish = article.date.isoformat() if getattr(
  88. article, 'date', 'None') != 'None' else ''
  89. # publish_text
  90. video_publish_text = article.date.strftime("%a, %d %B, %Y") if getattr(
  91. article, 'date', 'None') != 'None' else ''
  92. # author
  93. video_author = str(article.author) if getattr(
  94. article, 'author', 'None') != 'None' else ''
  95. # author url
  96. video_author_url = "%s/author/%s/" % (
  97. base_url, self.normalize(video_author)
  98. )
  99. # time
  100. video_time = article.time if getattr(
  101. article, 'time', 'None') != 'None' else ''
  102. video_url = '.'
  103. if article.url:
  104. video_url = article.url if self.relative_urls else (
  105. self.siteurl + '/' + article.url)
  106. video_src = article.og_video if getattr(
  107. article, 'og_video', 'None') != 'None' else ''
  108. # category
  109. video_category = article.category.name if getattr(
  110. article, 'category', 'None') != 'None' else ''
  111. # tags
  112. data_tags = ['%s' % (tag) for tag in article.tags]
  113. video_tags = dict((num, tag) for num, tag in enumerate(data_tags))
  114. node = {
  115. 'videoId': video_id,
  116. 'title': video_title,
  117. 'description': video_desc_text,
  118. 'descriptionHtml': video_desc_html,
  119. 'videoThumbnail': url_image,
  120. 'formatStreams': {
  121. 'url': video_src,
  122. },
  123. 'author': video_author,
  124. 'authorUrl': video_author_url,
  125. 'published': video_publish,
  126. 'publishedText': video_publish_text,
  127. 'time': video_time,
  128. 'category': video_category,
  129. 'keywords': video_tags,
  130. 'url': video_url
  131. }
  132. self.json_nodes.append(node)
  133. def create_tpage_node(self, srclink):
  134. srcfile = open(os.path.join(self.output_path,
  135. self.tpages[srclink]),
  136. encoding='utf-8')
  137. soup = BeautifulSoup(srcfile, 'html.parser')
  138. video_title = soup.title.string if soup.title is not None else ''
  139. video_text = soup.get_text()
  140. # Should set default category
  141. video_category = ''
  142. video_url = urljoin(self.siteurl, self.tpages[srclink])
  143. node = {'title': video_title,
  144. 'text': video_text,
  145. 'tags': video_category,
  146. 'url': video_url}
  147. self.json_nodes.append(node)
  148. def generate_output(self, writer):
  149. path = os.path.join(self.output_path, 'tipuesearch_content.json')
  150. articles = self.context['articles']
  151. for article in self.context['articles']:
  152. articles += article.translations
  153. for srclink in self.tpages:
  154. self.create_tpage_node(srclink)
  155. for article in articles:
  156. self.create_json_node(article)
  157. root_node = {'videos': self.json_nodes}
  158. with open(path, 'w', encoding='utf-8') as fd:
  159. json.dump(root_node, fd, separators=(',', ':'), ensure_ascii=False)
  160. def get_generators(generators):
  161. return Tipue_Search_JSON_Generator
  162. def register():
  163. signals.get_generators.connect(get_generators)