main.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. #!/bin/python
  2. import sys
  3. import os
  4. import bs4
  5. from bs4 import BeautifulSoup
  6. root = 'strlst.myogaya.jp'
  7. new_root = 'localhost'
  8. def verify(filepath):
  9. return os.path.exists(file_path)
  10. def convert(filepath):
  11. if os.path.isdir(filepath):
  12. for root, dirs, files in os.walk(filepath):
  13. path = root.split(os.sep)
  14. target_root = './' + '/'.join(path).replace(filepath, '')
  15. if target_root != './':
  16. try:
  17. os.mkdir(target_root)
  18. except OSError as e:
  19. pass
  20. for file in files:
  21. if not file.lower().endswith(('.html')):
  22. continue
  23. target_dir = target_root + '/' if target_root != './' else target_root
  24. target_file = (target_dir + file).replace('html', 'gmi')
  25. target_link = 'gemini://{}/{}'.format(new_root, target_dir.replace('./', ''))
  26. with open(target_file, 'w+') as opened_file:
  27. process_file(root + '/' + file, opened_file, target_link)
  28. def process_file(source, target_file, target_link):
  29. with open(source, 'r') as file:
  30. html = file.read().replace('\n', '')
  31. soup = BeautifulSoup(html, features='lxml')
  32. dive(soup.html.body, target_file, target_link)
  33. def dive(tag, target_file, target_link):
  34. name = tag.name
  35. if isinstance(tag, bs4.element.NavigableString):
  36. name = tag.parent.name
  37. if tag.string:
  38. if tag.name == 'h1':
  39. target_file.write('# {}\n'.format(tag.string))
  40. elif tag.name == 'h2':
  41. target_file.write('## {}\n'.format(tag.string))
  42. elif tag.name == 'h3':
  43. target_file.write('### {}\n'.format(tag.string))
  44. elif tag.name == 'li' or name == 'li':
  45. target_file.write('* {}\n'.format(tag.string))
  46. elif tag.name == 'a':
  47. absolute_href = tag['href'] if 'http' in tag['href'] else (target_link + tag['href']).replace('html', 'gmi')
  48. final_link = absolute_href.replace('https', 'gemini').replace('html', 'gmi').replace(root, new_root) if root in absolute_href else absolute_href
  49. target_file.write('=> {}\n'.format(final_link, tag.string))
  50. elif tag.name == 'p':
  51. target_file.write('{}\n\n'.format(tag.string))
  52. elif tag.name == 'tr':
  53. target_file.write('{}\n\n'.format(tag.string))
  54. if isinstance(tag, bs4.element.NavigableString):
  55. return
  56. if len(list(tag.children)) == 0:
  57. return
  58. for c in tag.children:
  59. dive(c, target_file, target_link)
  60. if __name__ == "__main__":
  61. usage = """usage: ./main.py html-root [root url] [new root url]"""
  62. if len(sys.argv) < 2:
  63. print('supply filepath')
  64. exit(1)
  65. if not os.path.exists(sys.argv[1]):
  66. print('supply valid filepath')
  67. exit(1)
  68. if len(sys.argv) > 2:
  69. root = str(sys.argv[2])
  70. if len(sys.argv) > 3:
  71. new_root = str(sys.argv[3])
  72. convert(sys.argv[1])