- # -*- coding: utf-8 -*-
- """
- Tipue Search
- ============
- A Pelican plugin to serialize generated HTML to JSON
- that can be used by jQuery plugin - Tipue Search.
- Copyright (c) Talha Mansoor
- """
- from __future__ import unicode_literals
- import os.path
- import json
- import re
- from bs4 import BeautifulSoup
- from codecs import open
- try:
- from urlparse import urljoin
- except ImportError:
- from urllib.parse import urljoin
- from pelican import signals
class Tipue_Search_JSON_Generator(object):
    """Pelican generator that serializes articles and template pages into
    a Tipue-Search-compatible JSON index (``tipuesearch_content.json``)."""

    def __init__(self, context, settings, path, theme, output_path, *null):
        """Capture the settings and output location Pelican hands us.

        The trailing ``*null`` absorbs extra positional arguments that
        different Pelican versions pass to generators.
        """
        self.output_path = output_path
        self.context = context
        self.siteurl = settings.get('SITEURL')
        self.relative_urls = settings.get('RELATIVE_URLS')
        self.tpages = settings.get('TEMPLATE_PAGES')
        self.tstatic = settings.get('THEME_STATIC_DIR')
        # Per-article/per-page JSON nodes; filled by the create_*_node
        # methods and dumped once by generate_output().
        self.json_nodes = []

    def normalize(self, s):
        """Slugify *s*: lower-case it, strip Spanish accents and dots,
        and hyphenate letter-space-letter runs (used for author URLs)."""
        replacements = (
            ("á", "a"),
            ("é", "e"),
            ("í", "i"),
            ("ó", "o"),
            ("ú", "u"),
            (".", ""),
        )
        s = s.lower()
        for accented, plain in replacements:
            # s is already lower-cased and every key is lower-case, so a
            # single replace suffices (the old extra .lower() pass was a
            # no-op).
            s = s.replace(accented, plain)
        s = re.sub(r"([a-z]) ([a-z])", r"\1-\2", s, 0,
                   re.IGNORECASE | re.DOTALL)
        return s

    def create_json_node(self, article):
        """Build the JSON node for one published article and queue it.

        Drafts (``status != 'published'``) are skipped.  Metadata values
        are compared against the string ``'None'`` because Pelican
        metadata may literally carry that string when a field is unset —
        keep that convention.
        """
        if getattr(article, 'status', 'published') != 'published':
            return

        soup_title = BeautifulSoup(
            article.title.replace(' ', ' '), 'html.parser')
        video_title = soup_title.get_text(' ', strip=True).replace(
            '“', '"').replace(
            '”', '"').replace(
            '’', "'").replace('^', '^')

        # Description: prefer the <p> tags that follow the first <figure>
        # so an <h1> inside the figure is not swept into the excerpt.
        art_desc = BeautifulSoup(article.content, 'html.parser')
        try:
            art_desc = art_desc.find('figure').find_all_next('p')
            art_desc_html = ''.join(map(str, art_desc))
            art_desc = BeautifulSoup(art_desc_html, 'html.parser')
            video_desc_html = art_desc_html.replace('\n', ' ')
        except AttributeError:
            # No <figure> in the content (find() returned None): fall
            # back to serializing the whole parsed body.  Narrowed from
            # a bare ``except:`` that hid unrelated failures.
            video_desc_html = ''.join(
                map(str, art_desc)).replace('\n', ' ')

        video_desc_text = art_desc.get_text(' ', strip=True).replace(
            '“', '"').replace(
            '”', '"').replace(
            '’', "'").replace(
            '¶', ' ').replace('^', '^')
        # Collapse all runs of whitespace to single spaces.
        video_desc_text = ' '.join(video_desc_text.split())

        # Base URL: '.' keeps links relative when RELATIVE_URLS is set.
        base_url = '.' if self.relative_urls else self.siteurl

        # Video id.
        video_id = str(article.videoid) if getattr(
            article, 'videoid', 'None') != 'None' else ''

        # Thumbnail.
        video_image = article.image if getattr(
            article, 'image', 'None') != 'None' else ''
        url_image = "%s/%s/../wp-content/uploads/article/thumbnail/%s" % (
            base_url, self.tstatic, video_image
        )

        # Publication date (machine-readable and display forms).
        video_publish = article.date.isoformat() if getattr(
            article, 'date', 'None') != 'None' else ''
        video_publish_text = article.date.strftime("%a, %d %B, %Y") if getattr(
            article, 'date', 'None') != 'None' else ''

        # Author and author page URL.
        video_author = str(article.author) if getattr(
            article, 'author', 'None') != 'None' else ''
        video_author_url = "%s/author/%s/" % (
            base_url, self.normalize(video_author)
        )

        # Duration.
        video_time = article.time if getattr(
            article, 'time', 'None') != 'None' else ''

        # Article URL.
        video_url = '.'
        if article.url:
            video_url = article.url if self.relative_urls else (
                self.siteurl + '/' + article.url)

        # Stream source.
        video_src = article.og_video if getattr(
            article, 'og_video', 'None') != 'None' else ''

        # Category.
        video_category = article.category.name if getattr(
            article, 'category', 'None') != 'None' else ''

        # Tags as an {index: tag-name} object (Tipue expects a mapping).
        video_tags = {num: str(tag)
                      for num, tag in enumerate(article.tags)}

        node = {
            'videoId': video_id,
            'title': video_title,
            'description': video_desc_text,
            'descriptionHtml': video_desc_html,
            'videoThumbnail': url_image,
            'formatStreams': {
                'url': video_src,
            },
            'author': video_author,
            'authorUrl': video_author_url,
            'published': video_publish,
            'publishedText': video_publish_text,
            'time': video_time,
            'category': video_category,
            'keywords': video_tags,
            'url': video_url
        }
        self.json_nodes.append(node)

    def create_tpage_node(self, srclink):
        """Build the JSON node for one TEMPLATE_PAGES entry and queue it."""
        srcpath = os.path.join(self.output_path, self.tpages[srclink])
        # ``with`` closes the handle (the original leaked it).
        with open(srcpath, encoding='utf-8') as srcfile:
            soup = BeautifulSoup(srcfile, 'html.parser')
        video_title = soup.title.string if soup.title is not None else ''
        video_text = soup.get_text()
        # Template pages carry no category; use an empty default.
        video_category = ''
        video_url = urljoin(self.siteurl, self.tpages[srclink])
        node = {'title': video_title,
                'text': video_text,
                'tags': video_category,
                'url': video_url}
        self.json_nodes.append(node)

    def generate_output(self, writer):
        """Serialize template pages plus all articles (and their
        translations) to ``tipuesearch_content.json`` in the output dir.

        ``writer`` is required by Pelican's generator protocol but unused.
        """
        path = os.path.join(self.output_path, 'tipuesearch_content.json')
        # Copy the list: the original bound ``articles`` to the shared
        # context list and extended it in place while iterating it,
        # mutating Pelican's context as a side effect.
        articles = list(self.context['articles'])
        for article in self.context['articles']:
            articles += article.translations
        for srclink in self.tpages:
            self.create_tpage_node(srclink)
        for article in articles:
            self.create_json_node(article)
        root_node = {'videos': self.json_nodes}
        with open(path, 'w', encoding='utf-8') as fd:
            json.dump(root_node, fd, separators=(',', ':'),
                      ensure_ascii=False)
def get_generators(generators):
    """Hand Pelican our generator class when the get_generators
    signal fires (the *generators* argument is unused)."""
    return Tipue_Search_JSON_Generator
def register():
    """Plugin entry point: wire get_generators into Pelican's signals."""
    signals.get_generators.connect(get_generators)