# py_xiguazx.py (14 KB)
  1. #coding=utf-8
  2. #!/usr/bin/python
  3. import sys
  4. sys.path.append('..')
  5. from base.spider import Spider
  6. import json
  7. import re
  8. from urllib import request, parse
  9. import urllib
  10. import urllib.request
  11. class Spider(Spider): # 元类 默认的元类 type
  12. def getName(self):
  13. return "西瓜影视"#此网站的大部分资源都是盗的那几个正规视频网站的资源
  14. def init(self,extend=""):
  15. print("============{0}============".format(extend))
  16. pass
  17. def isVideoFormat(self,url):
  18. pass
  19. def manualVideoCheck(self):
  20. pass
  21. def homeContent(self,filter):
  22. result = {}
  23. cateManual = {
  24. "电影片库": "20",
  25. "连续剧片库": "37",
  26. "动漫片库": "43",
  27. "综艺片库": "45",
  28. "B站片库": "47"
  29. }
  30. classes = []
  31. for k in cateManual:
  32. classes.append({
  33. 'type_name':k,
  34. 'type_id':cateManual[k]
  35. })
  36. result['class'] = classes
  37. if(filter):
  38. result['filters'] = self.config['filter']
  39. return result
  40. def homeVideoContent(self):
  41. htmlTxt=self.webReadFile(urlStr='http://www.xiguazx.com/',header=self.header)
  42. videos=self.get_list(html=htmlTxt,patternTxt=r'<div class="module-item-pic">\s*\n*\s*<a href="(?P<url>.+?)" title="(?P<title>.+?)" >\s*\n*\s*<i class="icon-play"></i>\s*\n*\s*</a>\s*\n*\s*<img class="lazy lazyloaded"\s*\n*\s*data-src="(?P<img>.+?)"')
  43. result = {
  44. 'list':videos
  45. }
  46. return result
  47. def categoryContent(self,tid,pg,filter,extend):
  48. result = {}
  49. videos=[]
  50. types=tid
  51. letter=''#字母='/letter/'+extend['captions']
  52. # if 'cat' in extend.keys():
  53. # types=extend['cat']
  54. # if 'letter' in extend.keys():
  55. # types='/letter/'+extend['letter']
  56. url='http://www.xiguazx.com/index.php/vod/show/id/{0}{2}/page/{1}.html'.format(types,pg,letter)
  57. #http://www.xiguazx.com/index.php/vod/show/id/20/page/2.html
  58. #http://www.xiguazx.com/index.php/vod/show/id/guoyu/page/20.html
  59. htmlTxt=self.webReadFile(urlStr=url,header=self.header)
  60. videos=self.get_list(html=htmlTxt,patternTxt=r'<div class="module-item-pic">\s*\n*\s*<a href="(?P<url>.+?)" title="(?P<title>.+?)" >\s*\n*\s*<i class="icon-play"></i>\s*\n*\s*</a>\s*\n*\s*<img class="lazy lazyloaded"\s*\n*\s*data-src="(?P<img>.+?)"')
  61. listCount=len(videos)
  62. pagecount=self.custom_RegexGetText(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/show/id/'+types+'/page/(\d+?)\.html" class="page-number page-next display" title="尾页">尾页</a>',Index=1)
  63. # print(pagecount)
  64. if pagecount=='':
  65. pagecount=999
  66. result['list'] = videos
  67. result['page'] = pg
  68. result['pagecount'] =pg if listCount<1 else int(pagecount)
  69. result['limit'] = listCount
  70. result['total'] = 99999
  71. return result
  72. def detailContent(self,array):
  73. result = {}
  74. aid = array[0].split('###')
  75. title = aid[0]#片名
  76. urlId = ''+aid[1]#URL
  77. logo = aid[2]#封面
  78. year=''#年份
  79. area=''
  80. actor=''
  81. director=''
  82. content=''
  83. vodItems=[]
  84. typeName=''
  85. vod_play_from=[]#线路
  86. vod_play_url=[]#剧集
  87. htmlTxt=self.webReadFile(urlStr=urlId,header=self.header)
  88. vod_play_from=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a class="titleName cr3 swiper-slide active">(.+?)</a>',Index=1)
  89. print(vod_play_from)
  90. if len(vod_play_from)<1:
  91. return {'list': []}
  92. circuit=self.custom_lineList(Txt=htmlTxt,mark=r'<ul class="content_playlist flex wrap">',after='</ul>')
  93. #print(circuit)
  94. for v in circuit:
  95. vodItems = self.get_EpisodesList(html=v,RegexText='<li><a class="btn" href="(?P<url>.+?)">(?P<title>.+?)</a></li>')
  96. joinStr = "#".join(vodItems)
  97. vod_play_url.append(joinStr)
  98. #print(vod_play_url)
  99. temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/class/.+?.html" target="_blank">(.+?)</a>',Index=1)
  100. typeName= "/".join(temporary)
  101. year=self.custom_RegexGetText(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/year/.+?.html" target="_blank">(.+?)</a>',Index=1)
  102. area=''
  103. temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/actor/.+?.html" target="_blank">(.+?)</a>',Index=1)
  104. actor="/".join(temporary)
  105. temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/director/.+?.html" target="_blank">(.+?)</a>',Index=1)
  106. director="/".join(temporary)
  107. content=self.custom_RegexGetText(Text=htmlTxt,RegexText=r'<div class="summary detailsTxt">(.+?)<',Index=1)
  108. vod = {
  109. "vod_id":array[0],
  110. "vod_name":title,
  111. "vod_pic":logo,
  112. "type_name":typeName,
  113. "vod_year":self.removeHtml(txt=year),
  114. "vod_area":self.removeHtml(txt=area),
  115. "vod_remarks":"",
  116. "vod_actor":self.removeHtml(txt=actor),
  117. "vod_director":self.removeHtml(txt=director),
  118. "vod_content":self.removeHtml(txt=content)
  119. }
  120. vod['vod_play_from'] = "$$$".join(vod_play_from)
  121. vod['vod_play_url'] = "$$$".join(vod_play_url)
  122. result = {
  123. 'list':[
  124. vod
  125. ]
  126. }
  127. return result
  128. def searchContent(self,key,quick):
  129. url='http://www.xiguazx.com/index.php/vod/search.html?wd='+urllib.parse.quote(key)
  130. htmlTxt=self.webReadFile(urlStr=url,header=self.header)
  131. videos = self.custom_list_search(html=htmlTxt)
  132. result = {
  133. 'list':videos
  134. }
  135. return result
  136. def playerContent(self,flag,id,vipFlags):
  137. result={}
  138. parse=1
  139. jx=1
  140. url=id
  141. htmlTxt=self.webReadFile(urlStr=url,header=self.header)
  142. temporary=self.custom_lineList(Txt=htmlTxt,mark=r'var player_aaaa=',after='</script>')
  143. if len(temporary)>0:
  144. jRoot=json.loads(temporary[0][16:])
  145. url=jRoot['url']
  146. if len(url)<5:
  147. url=id
  148. jx=self.ifJx(url=url)
  149. result["parse"] = parse#1=嗅探,0=播放
  150. result["playUrl"] = ''
  151. result["url"] = url
  152. result['jx'] = jx#1=VIP解析,0=不解析
  153. result["header"] = ''
  154. return result
  155. def localProxy(self,param):
  156. pass
  157. config = {
  158. "player": {},
  159. "filter": {
  160. "20":[
  161. {"key":"cat","name":"类型","value":[{"n":"全部","v":"20"},{"n":"动作片","v":"56"},{"n":"喜剧片","v":"57"},{"n":"爱情片","v":"58"},{"n":"科幻片","v":"59"},{"n":"剧情片","v":"61"},{"n":"战争片","v":"62"},{"n":"惊悚片","v":"63"},{"n":"犯罪片","v":"64"},{"n":"冒险片","v":"65"},{"n":"动画片","v":"66"},{"n":"悬疑片","v":"67"},{"n":"武侠片","v":"68"},{"n":"奇幻片","v":"69"},{"n":"记录片","v":"70"},{"n":"其它片","v":"71"}]},
  162. {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
  163. ],
  164. "72":[
  165. {"key":"cat","name":"类型","value":[{"n":"全部","v":"72"},{"n":"国产剧","v":"73"},{"n":"港台剧","v":"74"},{"n":"欧美剧","v":"75"},{"n":"日韩剧","v":"76"},{"n":"其他剧","v":"77"}]},
  166. {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
  167. ],
  168. "78":[
  169. {"key":"cat","name":"类型","value":[{"n":"全部","v":"78"},{"n":"动漫","v":"79"}]},
  170. {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
  171. ],
  172. "80":[
  173. {"key":"cat","name":"类型","value":[{"n":"全部","v":"80"},{"n":"综艺","v":"81"}]},
  174. {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
  175. ],
  176. "82":[
  177. {"key":"cat","name":"类型","value":[{"n":"全部","v":"82"},{"n":"番剧(B站)","v":"83"},{"n":"国创(B站)","v":"84"},{"n":"电影(B站)","v":"85"},{"n":"电视剧(B站)","v":"86"}]},
  178. {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
  179. ]
  180. }
  181. }
  182. header = {
  183. "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
  184. "Host": "www.xiguazx.com",
  185. "Referer": "http://www.xiguazx.com/"
  186. }
  187. #-----------------------------------------------自定义函数-----------------------------------------------
  188. #访问网页
  189. def webReadFile(self,urlStr,header):
  190. html=''
  191. req=urllib.request.Request(url=urlStr,headers=header)#,headers=header
  192. with urllib.request.urlopen(req) as response:
  193. html = response.read().decode('utf-8')
  194. return html
  195. #正则取文本
  196. def custom_RegexGetText(self,Text,RegexText,Index):
  197. returnTxt=""
  198. Regex=re.search(RegexText, Text, re.M|re.S)
  199. if Regex is None:
  200. returnTxt=""
  201. else:
  202. returnTxt=Regex.group(Index)
  203. return returnTxt
  204. #取集数
  205. def get_EpisodesList(self,html,RegexText):
  206. ListRe=re.finditer(RegexText, html, re.M|re.S)
  207. videos = []
  208. head="http://www.xiguazx.com"
  209. for vod in ListRe:
  210. url = vod.group('url')
  211. title =vod.group('title')
  212. if len(url) == 0:
  213. continue
  214. videos.append(self.removeHtml(txt=title)+"$"+head+url)
  215. #print(title)
  216. return videos
  217. #取剧集区
  218. def custom_lineList(self,Txt,mark,after):
  219. circuit=[]
  220. origin=Txt.find(mark)
  221. while origin>8:
  222. end=Txt.find(after,origin)
  223. circuit.append(Txt[origin:end])
  224. origin=Txt.find(mark,end)
  225. return circuit
  226. #正则取文本,返回数组
  227. def get_RegexGetTextLine(self,Text,RegexText,Index):
  228. returnTxt=[]
  229. pattern = re.compile(RegexText, re.M|re.S)
  230. ListRe=pattern.findall(Text)
  231. if len(ListRe)<1:
  232. return returnTxt
  233. for value in ListRe:
  234. returnTxt.append(value)
  235. return returnTxt
  236. #分类取结果
  237. def get_list(self,html,patternTxt):
  238. videos = []
  239. head="http://www.xiguazx.com"
  240. temporaryList=self.custom_lineList(Txt=html,mark='<a target="_top"',after='</a>')
  241. # print(temporaryList[0])
  242. # return videos
  243. for vod in temporaryList:
  244. img=self.custom_RegexGetText(Text=vod,RegexText=r"url\('(.+?)'\)",Index=1)
  245. title=self.custom_RegexGetText(Text=vod,RegexText=r'title=".+?">(.+?)</div>',Index=1)
  246. url=self.custom_RegexGetText(Text=vod,RegexText=r"href='(.+?)'",Index=1)
  247. renew=self.custom_RegexGetText(Text=vod,RegexText=r'<a href="/vodsearch/-------------(\d{4}).html" target="_blank">\1</a>',Index=1)
  248. if title=='' or url=='':
  249. continue
  250. if len(img)<5:
  251. img='https://agit.ai/lanhaidixingren/Tvbox/raw/branch/master/CoverError.png'
  252. videos.append({
  253. "vod_id":"{0}###{1}###{2}".format(title,head+url,img),
  254. "vod_name":title,
  255. "vod_pic":img,
  256. "vod_remarks":renew
  257. })
  258. res = [i for n, i in enumerate(videos) if i not in videos[:n]]
  259. videos = res
  260. return videos
  261. #删除html标签
  262. def removeHtml(self,txt):
  263. soup = re.compile(r'<[^>]+>',re.S)
  264. txt =soup.sub('', txt)
  265. return txt.replace("&nbsp;"," ")
  266. #是否是vip解析
  267. def ifJx(self,url):
  268. Isjiexi=0
  269. RegexTxt=r'(youku.com|v.qq|bilibili|iqiyi.com|tv.cctv|c(c|n)tv|v.pptv|mgtv.com)'
  270. if self.custom_RegexGetText(Text=url,RegexText=RegexTxt,Index=1)!='':
  271. Isjiexi=1
  272. return Isjiexi
  273. def custom_list_search(self,html):
  274. videos = []
  275. head="http://www.xiguazx.com"
  276. temporaryList=self.custom_lineList(Txt=html,mark='<div class="vod-search-list">',after=' <div class="movie-rating cor4">')
  277. for vod in temporaryList:
  278. img=self.custom_RegexGetText(Text=vod,RegexText=r'data-original="(.+?)"',Index=1)
  279. title=self.custom_RegexGetText(Text=vod,RegexText=r'<div class="movie-title txtHide txtHide" title="(.+?)">',Index=1)
  280. url=self.custom_RegexGetText(Text=vod,RegexText=r'href="(.+?)"',Index=1)
  281. if title=='' or url=='':
  282. continue
  283. if len(img)<5:
  284. img='https://agit.ai/lanhaidixingren/Tvbox/raw/branch/master/CoverError.png'
  285. videos.append({
  286. "vod_id":"{0}###{1}###{2}".format(title,head+url,img),
  287. "vod_name":title,
  288. "vod_pic":img,
  289. "vod_remarks":''
  290. })
  291. return videos
  292. # T=Spider()
  293. # l=T.homeVideoContent()
  294. # l=T.searchContent(key='柯南',quick='')
  295. # l=T.categoryContent(tid='37',pg='1',filter=False,extend='')
  296. # for x in l['list']:
  297. # print(x['vod_id'])
  298. # mubiao= l['list'][2]['vod_id']
  299. # playTabulation=T.detailContent(array=[mubiao,])
  300. # # print(playTabulation)
  301. # vod_play_from=playTabulation['list'][0]['vod_play_from']
  302. # vod_play_url=playTabulation['list'][0]['vod_play_url']
  303. # url=vod_play_url.split('$$$')
  304. # vod_play_from=vod_play_from.split('$$$')[0]
  305. # url=url[0].split('$')
  306. # url=url[1].split('#')[0]
  307. # # print(vod_play_from)
  308. # m3u8=T.playerContent(flag=vod_play_from,id=url,vipFlags=True)
  309. # print(m3u8)