123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324 |
- #coding=utf-8
- #!/usr/bin/python
- import sys
- sys.path.append('..')
- from base.spider import Spider
- import json
- import re
- from urllib import request, parse
- import urllib
- import urllib.request
- class Spider(Spider): # 元类 默认的元类 type
- def getName(self):
- return "西瓜影视"#此网站的大部分资源都是盗的那几个正规视频网站的资源
- def init(self,extend=""):
- print("============{0}============".format(extend))
- pass
- def isVideoFormat(self,url):
- pass
- def manualVideoCheck(self):
- pass
- def homeContent(self,filter):
- result = {}
- cateManual = {
- "电影片库": "20",
- "连续剧片库": "37",
- "动漫片库": "43",
- "综艺片库": "45",
- "B站片库": "47"
- }
- classes = []
- for k in cateManual:
- classes.append({
- 'type_name':k,
- 'type_id':cateManual[k]
- })
- result['class'] = classes
- if(filter):
- result['filters'] = self.config['filter']
- return result
- def homeVideoContent(self):
- htmlTxt=self.webReadFile(urlStr='http://www.xiguazx.com/',header=self.header)
- videos=self.get_list(html=htmlTxt,patternTxt=r'<div class="module-item-pic">\s*\n*\s*<a href="(?P<url>.+?)" title="(?P<title>.+?)" >\s*\n*\s*<i class="icon-play"></i>\s*\n*\s*</a>\s*\n*\s*<img class="lazy lazyloaded"\s*\n*\s*data-src="(?P<img>.+?)"')
- result = {
- 'list':videos
- }
- return result
- def categoryContent(self,tid,pg,filter,extend):
- result = {}
- videos=[]
- types=tid
- letter=''#字母='/letter/'+extend['captions']
- # if 'cat' in extend.keys():
- # types=extend['cat']
- # if 'letter' in extend.keys():
- # types='/letter/'+extend['letter']
-
- url='http://www.xiguazx.com/index.php/vod/show/id/{0}{2}/page/{1}.html'.format(types,pg,letter)
- #http://www.xiguazx.com/index.php/vod/show/id/20/page/2.html
- #http://www.xiguazx.com/index.php/vod/show/id/guoyu/page/20.html
- htmlTxt=self.webReadFile(urlStr=url,header=self.header)
-
- videos=self.get_list(html=htmlTxt,patternTxt=r'<div class="module-item-pic">\s*\n*\s*<a href="(?P<url>.+?)" title="(?P<title>.+?)" >\s*\n*\s*<i class="icon-play"></i>\s*\n*\s*</a>\s*\n*\s*<img class="lazy lazyloaded"\s*\n*\s*data-src="(?P<img>.+?)"')
-
- listCount=len(videos)
- pagecount=self.custom_RegexGetText(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/show/id/'+types+'/page/(\d+?)\.html" class="page-number page-next display" title="尾页">尾页</a>',Index=1)
- # print(pagecount)
- if pagecount=='':
- pagecount=999
- result['list'] = videos
- result['page'] = pg
- result['pagecount'] =pg if listCount<1 else int(pagecount)
- result['limit'] = listCount
- result['total'] = 99999
- return result
- def detailContent(self,array):
- result = {}
- aid = array[0].split('###')
- title = aid[0]#片名
- urlId = ''+aid[1]#URL
- logo = aid[2]#封面
- year=''#年份
- area=''
- actor=''
- director=''
- content=''
- vodItems=[]
- typeName=''
- vod_play_from=[]#线路
- vod_play_url=[]#剧集
- htmlTxt=self.webReadFile(urlStr=urlId,header=self.header)
- vod_play_from=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a class="titleName cr3 swiper-slide active">(.+?)</a>',Index=1)
- print(vod_play_from)
- if len(vod_play_from)<1:
- return {'list': []}
- circuit=self.custom_lineList(Txt=htmlTxt,mark=r'<ul class="content_playlist flex wrap">',after='</ul>')
- #print(circuit)
- for v in circuit:
- vodItems = self.get_EpisodesList(html=v,RegexText='<li><a class="btn" href="(?P<url>.+?)">(?P<title>.+?)</a></li>')
- joinStr = "#".join(vodItems)
- vod_play_url.append(joinStr)
- #print(vod_play_url)
-
- temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/class/.+?.html" target="_blank">(.+?)</a>',Index=1)
- typeName= "/".join(temporary)
- year=self.custom_RegexGetText(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/year/.+?.html" target="_blank">(.+?)</a>',Index=1)
- area=''
- temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/actor/.+?.html" target="_blank">(.+?)</a>',Index=1)
- actor="/".join(temporary)
- temporary=self.get_RegexGetTextLine(Text=htmlTxt,RegexText=r'<a href="/index.php/vod/search/director/.+?.html" target="_blank">(.+?)</a>',Index=1)
- director="/".join(temporary)
- content=self.custom_RegexGetText(Text=htmlTxt,RegexText=r'<div class="summary detailsTxt">(.+?)<',Index=1)
- vod = {
- "vod_id":array[0],
- "vod_name":title,
- "vod_pic":logo,
- "type_name":typeName,
- "vod_year":self.removeHtml(txt=year),
- "vod_area":self.removeHtml(txt=area),
- "vod_remarks":"",
- "vod_actor":self.removeHtml(txt=actor),
- "vod_director":self.removeHtml(txt=director),
- "vod_content":self.removeHtml(txt=content)
- }
- vod['vod_play_from'] = "$$$".join(vod_play_from)
- vod['vod_play_url'] = "$$$".join(vod_play_url)
- result = {
- 'list':[
- vod
- ]
- }
- return result
-
-
- def searchContent(self,key,quick):
- url='http://www.xiguazx.com/index.php/vod/search.html?wd='+urllib.parse.quote(key)
- htmlTxt=self.webReadFile(urlStr=url,header=self.header)
- videos = self.custom_list_search(html=htmlTxt)
- result = {
- 'list':videos
- }
- return result
-
- def playerContent(self,flag,id,vipFlags):
- result={}
- parse=1
- jx=1
- url=id
- htmlTxt=self.webReadFile(urlStr=url,header=self.header)
- temporary=self.custom_lineList(Txt=htmlTxt,mark=r'var player_aaaa=',after='</script>')
- if len(temporary)>0:
- jRoot=json.loads(temporary[0][16:])
- url=jRoot['url']
- if len(url)<5:
- url=id
- jx=self.ifJx(url=url)
- result["parse"] = parse#1=嗅探,0=播放
- result["playUrl"] = ''
- result["url"] = url
- result['jx'] = jx#1=VIP解析,0=不解析
- result["header"] = ''
- return result
- def localProxy(self,param):
- pass
- config = {
- "player": {},
- "filter": {
- "20":[
- {"key":"cat","name":"类型","value":[{"n":"全部","v":"20"},{"n":"动作片","v":"56"},{"n":"喜剧片","v":"57"},{"n":"爱情片","v":"58"},{"n":"科幻片","v":"59"},{"n":"剧情片","v":"61"},{"n":"战争片","v":"62"},{"n":"惊悚片","v":"63"},{"n":"犯罪片","v":"64"},{"n":"冒险片","v":"65"},{"n":"动画片","v":"66"},{"n":"悬疑片","v":"67"},{"n":"武侠片","v":"68"},{"n":"奇幻片","v":"69"},{"n":"记录片","v":"70"},{"n":"其它片","v":"71"}]},
- {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
- ],
- "72":[
- {"key":"cat","name":"类型","value":[{"n":"全部","v":"72"},{"n":"国产剧","v":"73"},{"n":"港台剧","v":"74"},{"n":"欧美剧","v":"75"},{"n":"日韩剧","v":"76"},{"n":"其他剧","v":"77"}]},
- {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
- ],
- "78":[
- {"key":"cat","name":"类型","value":[{"n":"全部","v":"78"},{"n":"动漫","v":"79"}]},
- {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
- ],
- "80":[
- {"key":"cat","name":"类型","value":[{"n":"全部","v":"80"},{"n":"综艺","v":"81"}]},
- {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
- ],
- "82":[
- {"key":"cat","name":"类型","value":[{"n":"全部","v":"82"},{"n":"番剧(B站)","v":"83"},{"n":"国创(B站)","v":"84"},{"n":"电影(B站)","v":"85"},{"n":"电视剧(B站)","v":"86"}]},
- {"key":"letter","name":"字母","value":[{"n":"全部","v":""},{"n":"A","v":"A"},{"n":"C","v":"C"},{"n":"E","v":"E"},{"n":"F","v":"F"},{"n":"G","v":"G"},{"n":"H","v":"H"},{"n":"I","v":"I"},{"n":"J","v":"J"},{"n":"K","v":"K"},{"n":"L","v":"L"},{"n":"M","v":"M"},{"n":"N","v":"N"},{"n":"O","v":"O"},{"n":"P","v":"P"},{"n":"Q","v":"Q"},{"n":"R","v":"R"},{"n":"S","v":"S"},{"n":"T","v":"T"},{"n":"U","v":"U"},{"n":"V","v":"V"},{"n":"W","v":"W"},{"n":"X","v":"X"},{"n":"Y","v":"Y"},{"n":"Z","v":"Z"},{"n":"0-9","v":"0-9"}]}
- ]
- }
- }
- header = {
- "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
- "Host": "www.xiguazx.com",
- "Referer": "http://www.xiguazx.com/"
- }
- #-----------------------------------------------自定义函数-----------------------------------------------
- #访问网页
- def webReadFile(self,urlStr,header):
- html=''
- req=urllib.request.Request(url=urlStr,headers=header)#,headers=header
- with urllib.request.urlopen(req) as response:
- html = response.read().decode('utf-8')
- return html
- #正则取文本
- def custom_RegexGetText(self,Text,RegexText,Index):
- returnTxt=""
- Regex=re.search(RegexText, Text, re.M|re.S)
- if Regex is None:
- returnTxt=""
- else:
- returnTxt=Regex.group(Index)
- return returnTxt
- #取集数
- def get_EpisodesList(self,html,RegexText):
- ListRe=re.finditer(RegexText, html, re.M|re.S)
- videos = []
- head="http://www.xiguazx.com"
- for vod in ListRe:
- url = vod.group('url')
- title =vod.group('title')
- if len(url) == 0:
- continue
- videos.append(self.removeHtml(txt=title)+"$"+head+url)
- #print(title)
- return videos
- #取剧集区
- def custom_lineList(self,Txt,mark,after):
- circuit=[]
- origin=Txt.find(mark)
- while origin>8:
- end=Txt.find(after,origin)
- circuit.append(Txt[origin:end])
- origin=Txt.find(mark,end)
- return circuit
- #正则取文本,返回数组
- def get_RegexGetTextLine(self,Text,RegexText,Index):
- returnTxt=[]
- pattern = re.compile(RegexText, re.M|re.S)
- ListRe=pattern.findall(Text)
- if len(ListRe)<1:
- return returnTxt
- for value in ListRe:
- returnTxt.append(value)
- return returnTxt
- #分类取结果
- def get_list(self,html,patternTxt):
- videos = []
- head="http://www.xiguazx.com"
- temporaryList=self.custom_lineList(Txt=html,mark='<a target="_top"',after='</a>')
- # print(temporaryList[0])
- # return videos
- for vod in temporaryList:
- img=self.custom_RegexGetText(Text=vod,RegexText=r"url\('(.+?)'\)",Index=1)
- title=self.custom_RegexGetText(Text=vod,RegexText=r'title=".+?">(.+?)</div>',Index=1)
- url=self.custom_RegexGetText(Text=vod,RegexText=r"href='(.+?)'",Index=1)
- renew=self.custom_RegexGetText(Text=vod,RegexText=r'<a href="/vodsearch/-------------(\d{4}).html" target="_blank">\1</a>',Index=1)
- if title=='' or url=='':
- continue
-
- if len(img)<5:
- img='https://agit.ai/lanhaidixingren/Tvbox/raw/branch/master/CoverError.png'
- videos.append({
- "vod_id":"{0}###{1}###{2}".format(title,head+url,img),
- "vod_name":title,
- "vod_pic":img,
- "vod_remarks":renew
- })
- res = [i for n, i in enumerate(videos) if i not in videos[:n]]
- videos = res
- return videos
- #删除html标签
- def removeHtml(self,txt):
- soup = re.compile(r'<[^>]+>',re.S)
- txt =soup.sub('', txt)
- return txt.replace(" "," ")
- #是否是vip解析
- def ifJx(self,url):
- Isjiexi=0
- RegexTxt=r'(youku.com|v.qq|bilibili|iqiyi.com|tv.cctv|c(c|n)tv|v.pptv|mgtv.com)'
- if self.custom_RegexGetText(Text=url,RegexText=RegexTxt,Index=1)!='':
- Isjiexi=1
- return Isjiexi
- def custom_list_search(self,html):
- videos = []
- head="http://www.xiguazx.com"
- temporaryList=self.custom_lineList(Txt=html,mark='<div class="vod-search-list">',after=' <div class="movie-rating cor4">')
- for vod in temporaryList:
- img=self.custom_RegexGetText(Text=vod,RegexText=r'data-original="(.+?)"',Index=1)
- title=self.custom_RegexGetText(Text=vod,RegexText=r'<div class="movie-title txtHide txtHide" title="(.+?)">',Index=1)
- url=self.custom_RegexGetText(Text=vod,RegexText=r'href="(.+?)"',Index=1)
- if title=='' or url=='':
- continue
- if len(img)<5:
- img='https://agit.ai/lanhaidixingren/Tvbox/raw/branch/master/CoverError.png'
- videos.append({
- "vod_id":"{0}###{1}###{2}".format(title,head+url,img),
- "vod_name":title,
- "vod_pic":img,
- "vod_remarks":''
- })
- return videos
- # T=Spider()
- # l=T.homeVideoContent()
- # l=T.searchContent(key='柯南',quick='')
- # l=T.categoryContent(tid='37',pg='1',filter=False,extend='')
- # for x in l['list']:
- # print(x['vod_id'])
- # mubiao= l['list'][2]['vod_id']
- # playTabulation=T.detailContent(array=[mubiao,])
- # # print(playTabulation)
- # vod_play_from=playTabulation['list'][0]['vod_play_from']
- # vod_play_url=playTabulation['list'][0]['vod_play_url']
- # url=vod_play_url.split('$$$')
- # vod_play_from=vod_play_from.split('$$$')[0]
- # url=url[0].split('$')
- # url=url[1].split('#')[0]
- # # print(vod_play_from)
- # m3u8=T.playerContent(flag=vod_play_from,id=url,vipFlags=True)
- # print(m3u8)
|