12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- #!/bin/python
- import sys
- import os
- import bs4
- from bs4 import BeautifulSoup
# Host name of the original web site. dive() rewrites links containing this
# text so they point at new_root instead; the script entry point overrides it
# with the second command-line argument when one is given.
- root = 'strlst.myogaya.jp'
# Host name used when building gemini:// links for the converted pages; the
# script entry point overrides it with the third command-line argument when
# one is given.
- new_root = 'localhost'
def verify(filepath):
    """Return whether *filepath* names an existing file or directory.

    Args:
        filepath: Path to check.

    Returns:
        bool: The result of ``os.path.exists(filepath)``.
    """
    # The previous body referenced the undefined name ``file_path`` and
    # therefore raised NameError on every call.
    return os.path.exists(filepath)
def convert(filepath):
    """Mirror the HTML tree under *filepath* into the current directory as gemtext.

    Every sub-directory of *filepath* is recreated below the current working
    directory, and every file whose name ends in ``.html`` (case-insensitive)
    is passed to ``process_file`` together with an open output file and the
    ``gemini://`` URL of the directory it lives in.

    Output names are derived by rewriting every ``html`` substring of the
    relative path to ``gmi``; this is the same rewrite ``dive`` applies to
    link targets, so generated links and generated files stay in agreement.

    Args:
        filepath: Root directory of the HTML site. If it is not a directory
            the function returns without doing anything.

    Raises:
        OSError: If an output directory or file cannot be created.
    """
    if not os.path.isdir(filepath):
        return

    for source_dir, _dirs, files in os.walk(filepath):
        # relpath gives the position below *filepath* regardless of trailing
        # slashes; the former str.replace() removed the argument text
        # wherever it occurred inside the path.
        rel_dir = os.path.relpath(source_dir, filepath)
        if rel_dir == os.curdir:
            target_dir = './'
        else:
            rel_posix = rel_dir.replace(os.sep, '/')
            # Create the directory under the rewritten name, because that is
            # the name the output files below are opened with.
            target_dir = './' + rel_posix.replace('html', 'gmi') + '/'
            os.makedirs(target_dir, exist_ok=True)

        for file in files:
            if not file.lower().endswith('.html'):
                continue
            # NOTE: the rewrite is case-sensitive, so a file named "X.HTML"
            # keeps its name in the output tree.
            target_file = target_dir + file.replace('html', 'gmi')
            # Strip only the leading "./" to obtain the URL path component.
            target_link = 'gemini://{}/{}'.format(new_root, target_dir[2:])
            with open(target_file, 'w+') as opened_file:
                process_file(os.path.join(source_dir, file), opened_file, target_link)
def process_file(source, target_file, target_link):
    """Parse the HTML file *source* and write its gemtext form to *target_file*.

    Args:
        source: Path of the HTML file to read.
        target_file: Writable text file object that receives the output.
        target_link: ``gemini://`` URL prefix handed on to ``dive`` for
            resolving relative links.
    """
    with open(source, 'r') as file:
        # Newline characters are removed (not replaced by spaces) before
        # parsing, so no parsed string contains a line break. Words that were
        # separated only by a newline end up joined.
        # NOTE(review): presumably done to avoid whitespace-only text nodes
        # between tags - confirm before changing.
        html = file.read().replace('\n', '')
    soup = BeautifulSoup(html, features='lxml')

    # A document without <html>/<body> (e.g. an empty file) has nothing to
    # convert; previously this raised AttributeError on ``None.body``.
    html_tag = soup.html
    body = html_tag.body if html_tag is not None else None
    if body is None:
        return
    dive(body, target_file, target_link)
# Output line template for each tag that is rendered from its ``.string``.
_LINE_FORMATS = {
    'h1': '# {}\n',
    'h2': '## {}\n',
    'h3': '### {}\n',
    'li': '* {}\n',
    'p': '{}\n\n',
    'tr': '{}\n\n',
}


def _gemini_link(href, target_link):
    """Translate an HTML ``href`` into the URL written on a gemtext link line."""
    if href.startswith(('http://', 'https://')):
        if root not in href:
            # Link to some other site: leave it untouched.
            return href
        # Link to the old site: switch scheme, page extension and host.
        rest = href.partition('://')[2]
        return 'gemini://' + rest.replace('html', 'gmi').replace(root, new_root)
    # Relative link: resolve against the directory URL of the current page.
    # The host is not substituted here because target_link already uses
    # new_root (substituting again corrupted it when new_root contains root).
    return (target_link + href).replace('html', 'gmi')


def dive(tag, target_file, target_link):
    """Recursively write the gemtext rendering of *tag* to *target_file*.

    A tag is rendered only when its ``.string`` is non-empty, i.e. when it
    holds a single string (possibly through a chain of single children):
    ``h1``-``h3`` become heading lines, ``li`` a list item, ``a`` a link line,
    ``p`` and ``tr`` a text line followed by a blank line. Text that sits
    directly inside an ``li`` alongside other nodes is written as a list item
    of its own. After rendering, the function descends into the tag's
    children.

    Args:
        tag: A BeautifulSoup ``Tag`` or ``NavigableString``.
        target_file: Writable text file object receiving the output.
        target_link: ``gemini://`` URL prefix used to resolve relative hrefs.
    """
    is_text = isinstance(tag, bs4.element.NavigableString)
    text = tag.string

    if text:
        if is_text:
            parent = tag.parent
            # When the text is the only content of its <li>, the <li> tag
            # itself has already written it (its .string is this text), so
            # writing it here again would duplicate the list item.
            if parent.name == 'li' and parent.string is None:
                target_file.write('* {}\n'.format(text))
        elif tag.name == 'a':
            href = tag.get('href')
            # An anchor without href has no target to link to.
            if href:
                # Gemtext link line: "=> URL LABEL" (the label used to be
                # passed to format() but was missing from the template).
                target_file.write(
                    '=> {} {}\n'.format(_gemini_link(href, target_link), text)
                )
        elif tag.name in _LINE_FORMATS:
            target_file.write(_LINE_FORMATS[tag.name].format(text))

    if is_text:
        # Strings have no children to descend into.
        return
    for child in tag.children:
        dive(child, target_file, target_link)
if __name__ == "__main__":
    # Command line: html-root is required; the two host names are optional
    # and replace the module-level ``root`` / ``new_root`` defaults.
    usage = """usage: ./main.py html-root [root url] [new root url]"""
    if len(sys.argv) < 2:
        print('supply filepath')
        print(usage)
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. when run with -S).
        sys.exit(1)
    # convert() only acts on directories, so reject anything else here
    # rather than exiting successfully without producing output.
    if not os.path.isdir(sys.argv[1]):
        print('supply valid filepath')
        print(usage)
        sys.exit(1)
    if len(sys.argv) > 2:
        root = sys.argv[2]
    if len(sys.argv) > 3:
        new_root = sys.argv[3]
    convert(sys.argv[1])
|