# htmlParserOld.py (~7.6 KB) — extraction-artifact header removed
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. # File : htmlParser.py
  4. # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
  5. # Date : 2022/8/25
  6. import json
  7. from pyquery import PyQuery as pq
  8. from lxml import etree
  9. from urllib.parse import urljoin
  10. import re
  11. from jsonpath import jsonpath
PARSE_CACHE = True  # enable per-instance caching of the parsed pq() document (see pdfh/pdfa)
class jsoup:
    """drpy-style scraping helper: pdfh/pdfa extract from HTML via pyquery
    selector rules ('sel&&sel&&option'), pjfh/pjfa extract from JSON via
    jsonpath. Relative URLs are completed against MY_URL."""

    def __init__(self, MY_URL=''):
        # base URL used by urljoin() when completing relative links
        self.MY_URL = MY_URL
        # caches of the last html text seen by pdfh/pdfa and their parsed
        # pq documents, used only when PARSE_CACHE is on
        self.pdfh_html = ''
        self.pdfa_html = ''
        self.pdfh_doc = None
        self.pdfa_doc = None
  20. def test(self, text:str, string:str):
  21. searchObj = re.search(rf'{text}', string, re.M | re.I)
  22. test_ret = True if searchObj else False
  23. return test_ret
    def pdfh(self, html, parse: str, add_url=False):
        """Extract ONE value from *html* using a '&&'-separated rule.

        Rule shape: 'sel&&sel&&option' — the last '&&' part may be an option
        (Text / Html / an attribute name); the selector parts are joined with
        ':eq(0)' appended to any part lacking an :eq/:lt/:gt/# qualifier.
        With add_url=True, url-like attribute values are completed against
        self.MY_URL. Returns '' for an empty rule.
        """
        if not parse:
            return ''
        if PARSE_CACHE:
            # reparse only when the html text actually changed
            if self.pdfh_html != html:
                self.pdfh_html = html
                self.pdfh_doc = pq(html)
            doc = self.pdfh_doc
        else:
            doc = pq(html)
        if parse == 'body&&Text' or parse == 'Text':
            text = doc.text()
            return text
        elif parse == 'body&&Html' or parse == 'Html':
            return doc.html()
        option = None
        if parse.find('&&') > -1:
            option = parse.split('&&')[-1]
            parse = parse.split('&&')[:-1]  # with a single '&&' this leaves exactly one selector at index 0
            if len(parse) > 1:  # a single remaining part may just be the option target — no :eq joining needed
                parse = ' '.join([i if self.test(':eq|:lt|:gt|#', i) else f'{i}:eq(0)' for i in parse])
            else:
                parse = parse[0] if self.test(':eq|:lt|:gt|#', parse[0]) else f'{parse[0]}:eq(0)'
        # FIXME: jsonpath-style '||' alternatives for attributes are not supported yet
        if option:
            if ':eq(-1)' in parse:
                # handle :eq(-1) specially; limited compatibility — only one eq level supported for now
                ret = doc(parse.replace(':eq(-1)', '')).eq(-1)
            else:
                ret = doc(parse)
            # FIXME: when several nodes match, the first should be taken automatically
            if option == 'Text':
                ret = ret.text()
            elif option == 'Html':
                ret = ret.html()
            else:
                ret = ret.attr(option) or ''
                # for style-like attributes, pull the target out of 'url(...)'
                if option.lower().find('style') > -1 and ret.find('url(') > -1:
                    try:
                        ret = re.search('url\((.*?)\)', ret, re.M | re.S).groups()[0]
                    except:
                        pass
                if ret and add_url:
                    # only complete values of url-like attributes (url/src/href/data-*)
                    need_add = re.search('(url|src|href|-original|-src|-play|-url)$', option, re.M | re.I)
                    if need_add:
                        if 'http' in ret:
                            # keep everything from the embedded absolute URL onward
                            ret = ret[ret.find('http'):]
                        else:
                            ret = urljoin(self.MY_URL, ret)
        else:
            ret = doc(parse)  # pq result: serializing directly yields the first match, no next() needed
            ret = ret.outerHtml()
        return ret
    def pdfa(self, html, parse: str):
        """Extract ALL matches of a '&&'-separated selector rule from *html*.

        Returns a list of outerHtml strings, one per matched node; [] for an
        empty rule. Every non-final part lacking an :eq/:lt/:gt qualifier
        gets ':eq(0)' appended before the parts are joined.
        See https://pyquery.readthedocs.io/en/latest/api.html
        """
        if not parse:
            return []
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # re-join the '&&'-separated parts
            parse = ' '.join([parse[i] if self.test(':eq|:lt|:gt', parse[i]) or i >= len(parse) - 1 else f'{parse[i]}:eq(0)' for i in range(len(parse))])
        print(f'pdfa:{parse}')
        if PARSE_CACHE:
            # reparse only when the html text actually changed
            if self.pdfa_html != html:
                self.pdfa_html = html
                self.pdfa_doc = pq(html)
            doc = self.pdfa_doc
        else:
            doc = pq(html)
        result = doc(parse)
        # outerHtml() per item is the correct serialization — str(item) and
        # etree.tostring both produce wrong output here
        res = [item.outerHtml() for item in result.items()]
        return res
  119. def pd(self,html,parse:str):
  120. return self.pdfh(html,parse,True)
  121. def pq(self,html:str):
  122. return pq(html)
  123. def pjfh(self,html,parse:str,add_url=False):
  124. if not parse:
  125. return ''
  126. if isinstance(html,str):
  127. # print(html)
  128. try:
  129. html = json.loads(html)
  130. # html = eval(html)
  131. except:
  132. print('字符串转json失败')
  133. return ''
  134. if not parse.startswith('$.'):
  135. parse = f'$.{parse}'
  136. ret = ''
  137. for ps in parse.split('||'):
  138. ret = jsonpath(html,ps)
  139. if isinstance(ret,list):
  140. ret = str(ret[0]) if ret[0] else ''
  141. else:
  142. ret = str(ret) if ret else ''
  143. if add_url and ret:
  144. ret = urljoin(self.MY_URL, ret)
  145. if ret:
  146. break
  147. # print(ret)
  148. return ret
  149. def pj(self, html, parse:str):
  150. return self.pjfh(html, parse, True)
  151. def pjfa(self,html,parse:str):
  152. if not parse:
  153. return []
  154. if isinstance(html,str):
  155. try:
  156. html = json.loads(html)
  157. except:
  158. return ''
  159. if not parse.startswith('$.'):
  160. parse = f'$.{parse}'
  161. # print(html)
  162. # print(parse)
  163. ret = jsonpath(html,parse)
  164. # print(ret)
  165. # print(type(ret))
  166. # print(type(ret[0]))
  167. # print(len(ret))
  168. if isinstance(ret,list) and isinstance(ret[0],list) and len(ret) == 1:
  169. # print('自动解包')
  170. ret = ret[0] # 自动解包
  171. return ret or []
if __name__ == '__main__':
    # ad-hoc comparison of the pyquery-based jsoup against a parsel version
    import requests
    from parsel import Selector
    url = 'http://360yy.cn'
    jsp = jsoup(url)

    def pdfa2(html, parse):
        # parsel-based re-implementation of jsoup.pdfa: css selectors with
        # :nth-child(1) standing in for pyquery's :eq(0)
        if not parse:
            return []
        if parse.find('&&') > -1:
            parse = parse.split('&&')  # re-join the '&&'-separated parts
            parse = ' '.join([parse[i] if jsoup().test(':eq|:lt|:gt', parse[i]) or i >= len(parse) - 1 else f'{parse[i]}:nth-child(1)' for i in range(len(parse))])
        selector = Selector(text=html)
        print(parse)
        items = selector.css(parse)
        return [str(item) for item in items]

    r = requests.get(url)
    html = r.text
    # NOTE: parsel turned out awkward for wrapping pdfa-style helpers
    items = pdfa2(html, '.fed-pops-navbar&&ul.fed-part-rows&&a')
    print(items)