
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File : htmlParser.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date : 2022/8/25
# upDate : 2022/11/17 added support for -- element exclusion, multiple exclusions
import ujson
from pyquery import PyQuery as pq
from urllib.parse import urljoin
import re
from jsonpath import jsonpath

PARSE_CACHE = True  # parse-cache switch
NOADD_INDEX = ':eq|:lt|:gt|:first|:last|^body$|^#'  # selectors exempt from auto :eq indexing
URLJOIN_ATTR = '(url|src|href|-original|-src|-play|-url|style)$'  # attributes that need an automatic urljoin
SPECIAL_URL = '^(ftp|magnet|thunder|ws):'  # special links that are filtered out and skip urljoin


class jsoup:
    def __init__(self, MY_URL=''):
        self.MY_URL = MY_URL
        self.pdfh_html = ''
        self.pdfa_html = ''
        self.pdfh_doc = None
        self.pdfa_doc = None
    def test(self, text: str, string: str):
        """
        Regex check for whether a string contains a pattern, mimicking JS /pattern/.test()
        :param text: regex pattern
        :param string: string to search in
        :return: True if the pattern matches anywhere
        """
        searchObj = re.search(rf'{text}', string, re.M | re.I)
        return bool(searchObj)
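    # A minimal usage sketch (values are illustrative, not from the original source):
    #   jsoup().test('^body$', 'body')          -> True
    #   jsoup().test(':eq|:lt|:gt', 'div.item') -> False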
    def contains(self, text: str, match: str):
        # return match in text
        return text.find(match) > -1
    def parseHikerToJq(self, parse, first=False):
        """
        Convert a Hiker (海阔) parse expression to a native jQuery-style expression,
        auto-appending :eq; if first is passed, the last segment also gets :eq(0)
        :param parse: Hiker expression with segments joined by &&
        :param first: take only the first match of the final segment
        :return: native expression with segments joined by spaces
        """
        if self.contains(parse, '&&'):
            parse = parse.split('&&')  # split on && and rebuild
            new_parses = []  # build the new parse-expression list
            for i in range(len(parse)):
                ps = parse[i].split(' ')[-1]  # if a segment contains spaces, test only its last element
                if not self.test(NOADD_INDEX, ps):
                    if not first and i >= len(parse) - 1:  # without first, the last segment needs no :eq(0)
                        new_parses.append(parse[i])
                    else:
                        new_parses.append(f'{parse[i]}:eq(0)')
                else:
                    new_parses.append(parse[i])
            parse = ' '.join(new_parses)
        else:
            ps = parse.split(' ')[-1]  # if it contains spaces, test only the last element
            if not self.test(NOADD_INDEX, ps) and first:
                parse = f'{parse}:eq(0)'
        return parse
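    # A minimal usage sketch (selectors are illustrative, not from the original source):
    #   jsoup().parseHikerToJq('body&&.list&&a')             -> 'body .list:eq(0) a'
    #   jsoup().parseHikerToJq('body&&.list&&a', first=True) -> 'body .list:eq(0) a:eq(0)'
    # 'body' stays untouched because NOADD_INDEX matches ^body$.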
    def getParseInfo(self, nparse):
        """
        From the given single rule, extract the parse rule, the index position and the
        exclusion list -- usable for stripping elements; supports multiple exclusions,
        by tag, by id, and so on
        :param nparse: a single parse rule
        :return: (rule, index, excludes)
        """
        excludes = []  # exclusion list, empty by default
        nparse_index = 0  # position index, 0 by default
        nparse_rule = nparse  # rule, defaults to the input itself
        if self.contains(nparse, ':eq'):
            nparse_rule = nparse.split(':eq')[0]
            nparse_pos = nparse.split(':eq')[1]
            # print(nparse_rule)
            if self.contains(nparse_rule, '--'):
                excludes = nparse_rule.split('--')[1:]
                nparse_rule = nparse_rule.split('--')[0]
            elif self.contains(nparse_pos, '--'):
                excludes = nparse_pos.split('--')[1:]
                nparse_pos = nparse_pos.split('--')[0]
            try:
                nparse_index = int(nparse_pos.split('(')[1].split(')')[0])
            except Exception:
                pass
        elif self.contains(nparse, '--'):
            nparse_rule = nparse.split('--')[0]
            excludes = nparse.split('--')[1:]
        # if nparse_index > 0:
        #     print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes}')
        return nparse_rule, nparse_index, excludes
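    # A minimal usage sketch (the rule is illustrative, not from the original source):
    #   jsoup().getParseInfo('div.item:eq(2)--span--a')  -> ('div.item', 2, ['span', 'a'])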
    def parseOneRule(self, doc, nparse, ret=None):
        """
        Parse one record of the space-separated native expression, handle the :eq
        index correctly, and return the processed ret
        :param doc: PyQuery object loaded via pq(html)
        :param nparse: the current single parse expression
        :param ret: PyQuery result object
        :return: processed PyQuery result
        """
        nparse_rule, nparse_index, excludes = self.getParseInfo(nparse)
        if not ret:
            ret = doc(nparse_rule)
        else:
            ret = ret(nparse_rule)
        # print(f'nparse_rule:{nparse_rule},nparse_index:{nparse_index},excludes:{excludes},ret:{ret}')
        if self.contains(nparse, ':eq'):
            ret = ret.eq(nparse_index)
        # if nparse_index > 4:
        #     print('nparse_index', ret, not ret)
        if excludes and ret:
            # print(excludes)
            ret = ret.clone()  # clone first, so remove() doesn't corrupt doc's cache
            for exclude in excludes:
                # ret.remove(exclude)
                ret(exclude).remove()
        return ret
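    # A minimal usage sketch (HTML is illustrative, not from the original source):
    #   doc = pq('<div><p>a</p><p>b</p></div>')
    #   jsoup().parseOneRule(doc, 'p:eq(1)').text()  -> 'b'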
    def pdfa(self, html, parse: str):
        # Reading the official docs is what solved this problem!!!
        # https://pyquery.readthedocs.io/en/latest/api.html
        if not all([html, parse]):
            return []
        parse = self.parseHikerToJq(parse)
        print(f'pdfa:{parse}')
        if PARSE_CACHE:
            if self.pdfa_html != html:
                self.pdfa_html = html
                self.pdfa_doc = pq(html)
            doc = self.pdfa_doc
        else:
            doc = pq(html)
        parses = parse.split(' ')
        # print(parses)
        ret = None
        for nparse in parses:
            ret = self.parseOneRule(doc, nparse, ret)
            if not ret:  # looping :eq lookups may leave ret empty; pdfa then returns an empty list
                return []
        res = [item.outerHtml() for item in ret.items()]
        return res
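    # A minimal usage sketch (HTML and rule are illustrative, not from the original source):
    #   jsoup().pdfa('<ul><li>a</li><li>b</li></ul>', 'ul&&li')
    #   -> ['<li>a</li>', '<li>b</li>']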
    def pdfh(self, html, parse: str, base_url: str = ''):
        if not all([html, parse]):
            return ''
        if PARSE_CACHE:
            if self.pdfh_html != html:
                self.pdfh_html = html
                self.pdfh_doc = pq(html)
            doc = self.pdfh_doc
        else:
            doc = pq(html)
        if parse == 'body&&Text' or parse == 'Text':
            text = doc.text()
            return text
        elif parse == 'body&&Html' or parse == 'Html':
            return doc.html()
        option = None
        if self.contains(parse, '&&'):
            option = parse.split('&&')[-1]
            parse = '&&'.join(parse.split('&&')[:-1])
        parse = self.parseHikerToJq(parse, True)
        # print(f'pdfh:{parse},option:{option}')
        parses = parse.split(' ')
        # print(parses)
        ret = None
        for nparse in parses:
            ret = self.parseOneRule(doc, nparse, ret)
            # print(nparse, ret)
            if not ret:  # looping :eq lookups may leave ret empty; pdfh then returns an empty string
                return ''
        if option:
            if option == 'Text':
                ret = ret.text()
            elif option == 'Html':
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
                if self.contains(option.lower(), 'style') and self.contains(ret, 'url('):
                    try:
                        ret = re.search(r'url\((.*?)\)', ret, re.M | re.S).groups()[0]
                        # Added 2023/07/28: auto-strip the leading/trailing single or double
                        # quotes around inner links taken from a style attribute
                        ret = re.sub(r"^['\"]|['\"]$", '', ret)
                    except Exception:
                        pass
                if ret and base_url:
                    # need_add = re.search(URLJOIN_ATTR, option, re.M | re.I)
                    need_add = self.test(URLJOIN_ATTR, option) and not self.test(SPECIAL_URL, ret)
                    if need_add:
                        if 'http' in ret:
                            ret = ret[ret.find('http'):]
                        else:
                            ret = urljoin(base_url, ret)
        else:
            ret = ret.outerHtml()
        return ret
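    # A minimal usage sketch (HTML, rule and base_url are illustrative, not from the original source):
    #   jsoup().pdfh('<div class="title"><a href="/play/1">hi</a></div>',
    #                '.title&&a&&href', 'https://example.com/')
    #   -> 'https://example.com/play/1'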
    def pd(self, html, parse: str, base_url: str = ''):
        if not base_url:
            base_url = self.MY_URL
        return self.pdfh(html, parse, base_url)
    def pq(self, html: str):
        return pq(html)
    def pjfh(self, html, parse: str, add_url=False):
        if not all([html, parse]):
            return ''
        if isinstance(html, str):
            # print(html)
            try:
                html = ujson.loads(html)
                # html = eval(html)
            except Exception:
                print('failed to convert the string to JSON')
                return ''
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        ret = ''
        for ps in parse.split('||'):
            ret = jsonpath(html, ps)
            if isinstance(ret, list):
                ret = str(ret[0]) if ret[0] else ''
            else:
                ret = str(ret) if ret else ''
            if add_url and ret:
                ret = urljoin(self.MY_URL, ret)
            if ret:
                break
        # print(ret)
        return ret
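    # A minimal usage sketch (JSON and path are illustrative, not from the original source):
    #   jsoup('https://example.com/').pjfh('{"data":{"pic":"/img/1.jpg"}}', 'data.pic', add_url=True)
    #   -> 'https://example.com/img/1.jpg'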
    def pj(self, html, parse: str):
        return self.pjfh(html, parse, True)
    def pjfa(self, html, parse: str):
        if not all([html, parse]):
            return []
        if isinstance(html, str):
            try:
                html = ujson.loads(html)
            except Exception:
                return []
        if not parse.startswith('$.'):
            parse = f'$.{parse}'
        # print(html)
        # print(parse)
        ret = jsonpath(html, parse)
        # print(ret)
        # print(type(ret))
        # print(type(ret[0]))
        # print(len(ret))
        if isinstance(ret, list) and len(ret) == 1 and isinstance(ret[0], list):
            # print('auto-unwrap')
            ret = ret[0]  # auto-unwrap a single nested list
        return ret or []
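    # A minimal usage sketch (JSON and path are illustrative, not from the original source):
    #   jsoup().pjfa('{"list":[{"id":1},{"id":2}]}', 'list')
    #   -> [{'id': 1}, {'id': 2}]  (jsonpath yields [[...]]; the single nested list is unwrapped)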


if __name__ == '__main__':
    pass
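    # A short demo sketch; the HTML, JSON and rules below are illustrative examples,
    # not fixtures from the original project.
    jsp = jsoup('https://example.com/')
    demo_html = '<div class="list"><a href="/play/1">first</a><a href="/play/2">second</a></div>'
    print(jsp.pdfa(demo_html, '.list&&a'))      # -> ['<a href="/play/1">first</a>', '<a href="/play/2">second</a>']
    print(jsp.pd(demo_html, '.list&&a&&href'))  # -> 'https://example.com/play/1'
    print(jsp.pjfh('{"url":"/api/1"}', 'url'))  # -> '/api/1'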