  1. """
  2. 作者 老王叔叔 內容均從互聯網收集而來 僅供交流學習使用 版權歸原創者所有 如侵犯了您的權益 請通知作者 將及時刪除侵權內容
  3. ====================kaiyuebinguan====================
  4. """
# Note: several of these imports (unpad, ARC4, urllib, binascii, base64, json,
# time, os) appear unused in this file; they look carried over from the shared
# spider template.
from Crypto.Util.Padding import unpad
from urllib.parse import unquote
from Crypto.Cipher import ARC4
from base.spider import Spider
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import binascii
import requests
import base64
import json
import time
import sys
import re
import os

sys.path.append('..')

xurl = "https://www.subaibai.com"

headerx = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'authority': 'www.subaibai.com',
    'Referer': 'https://www.subaibai.com/',
    'Origin': 'https://www.subaibai.com/'
}

pm = ''

class Spider(Spider):
    global xurl
    global headerx

    def getName(self):
        return "首页"

    def init(self, extend):
        pass

    def isVideoFormat(self, url):
        pass

    def manualVideoCheck(self):
        pass

    def extract_middle_text(self, text, start_str, end_str, pl, start_index1: str = '', end_index2: str = ''):
        if pl == 3:
            # Mode 3: repeatedly slice every start_str...end_str block, then run
            # the regex in start_index1 over each block to build play-list strings.
            plx = []
            while True:
                start_index = text.find(start_str)
                if start_index == -1:
                    break
                end_index = text.find(end_str, start_index + len(start_str))
                if end_index == -1:
                    break
                middle_text = text[start_index + len(start_str):end_index]
                plx.append(middle_text)
                text = text.replace(start_str + middle_text + end_str, '')
            if len(plx) > 0:
                purl = ''
                for i in range(len(plx)):
                    matches = re.findall(start_index1, plx[i])
                    output = ""
                    for match in matches:
                        # Pull the first standalone number out of the link text
                        # to use as the episode number.
                        match3 = re.search(r'(?:^|[^0-9])(\d+)(?:[^0-9]|$)', match[1])
                        if match3:
                            number = match3.group(1)
                        else:
                            number = 0
                        if 'http' not in match[0]:
                            output += f"#{match[1]}${number}{xurl}{match[0]}"
                        else:
                            output += f"#{match[1]}${number}{match[0]}"
                    output = output[1:]
                    purl = purl + output + "$$$"
                purl = purl[:-3]
                return purl
            else:
                return ""
        else:
            start_index = text.find(start_str)
            if start_index == -1:
                return ""
            end_index = text.find(end_str, start_index + len(start_str))
            if end_index == -1:
                return ""
            if pl == 0:
                # Mode 0: return the single slice between the two markers.
                middle_text = text[start_index + len(start_str):end_index]
                return middle_text.replace("\\", "")
            if pl == 1:
                # Mode 1: regex over the slice, captures joined with spaces.
                middle_text = text[start_index + len(start_str):end_index]
                matches = re.findall(start_index1, middle_text)
                if matches:
                    jg = ' '.join(matches)
                    return jg
            if pl == 2:
                # Mode 2: regex over the slice, captures joined with '$$$'.
                middle_text = text[start_index + len(start_str):end_index]
                matches = re.findall(start_index1, middle_text)
                if matches:
                    new_list = [item for item in matches]  # bug fix: was `f`, an undefined name
                    jg = '$$$'.join(new_list)
                    return jg
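
    # A minimal sketch of the four modes (made-up marker strings, illustrative only):
    #   extract_middle_text('x<b>7.9</b>y', '<b>', '</b>', 0)  ->  '7.9'
    #   mode 1 joins regex captures from the slice with spaces,
    #   mode 2 joins them with '$$$',
    #   mode 3 loops over every marker pair and builds '#<title>$<num><url>'
    #   play strings, one '$$$'-separated group per pair.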

    def homeContent(self, filter):
        result = {"class": [{"type_id": "new-movie", "type_name": "电影"},
                            {"type_id": "tv-drama", "type_name": "剧集"},
                            {"type_id": "hot-month", "type_name": "热门电影"},
                            {"type_id": "high-movie", "type_name": "高分电影"},
                            {"type_id": "cartoon-movie", "type_name": "动漫电影"},
                            {"type_id": "hongkong-movie", "type_name": "香港经典"},
                            {"type_id": "domestic-drama", "type_name": "国产剧"},
                            {"type_id": "american-drama", "type_name": "欧美剧"},
                            {"type_id": "korean-drama", "type_name": "韩剧"},
                            {"type_id": "anime-drama", "type_name": "动漫剧"}]
                  }
        return result

    def homeVideoContent(self):
        videos = []
        try:
            detail = requests.get(url=xurl, headers=headerx)
            detail.encoding = "utf-8"
            res = detail.text
            doc = BeautifulSoup(res, "lxml")
            soups = doc.find_all('div', class_="bt_img")
            for soup in soups:
                vods = soup.find_all('li')
                for vod in vods:
                    name = vod.find('img')['alt']
                    ids = vod.find('h3', class_="dytit")
                    id = ids.find('a')['href']
                    id = id.replace('www.subaibaiys.com', 'www.subaibai.com')
                    pic = vod.find('img')['data-original']
                    if 'http' not in pic:
                        pic = xurl + pic
                    remark = self.extract_middle_text(str(vod), 'class="rating">', '</div>', 0)
                    video = {
                        "vod_id": id,
                        "vod_name": name,
                        "vod_pic": pic,
                        "vod_remarks": remark
                    }
                    videos.append(video)
            result = {'list': videos}
            return result
        except:
            pass
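
    # homeVideoContent, categoryContent and searchContentPage all walk the same
    # list markup. Judging from the selectors, the site's pages look roughly
    # like the sketch below (reconstructed from the code, not verified against
    # the live site):
    #
    #   <div class="bt_img">
    #     <ul>
    #       <li>
    #         <h3 class="dytit"><a href="/movie/56949.html">...</a></h3>
    #         <img alt="片名" data-original="/path/poster.jpg">
    #         <div class="rating">7.9</div>
    #       </li>
    #     </ul>
    #   </div>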

    def categoryContent(self, cid, pg, filter, ext):
        result = {}
        videos = []
        if pg:
            page = int(pg)
        else:
            page = 1
        if page == 1:  # bug fix: was `page == '1'`; an int never equals the string '1'
            url = f'{xurl}/{cid}'
        else:
            url = f'{xurl}/{cid}/page/{str(page)}'
        try:
            detail = requests.get(url=url, headers=headerx)
            detail.encoding = "utf-8"
            res = detail.text
            doc = BeautifulSoup(res, "lxml")
            soups = doc.find_all('div', class_="bt_img")
            for soup in soups:
                vods = soup.find_all('li')
                for vod in vods:
                    name = vod.find('img')['alt']
                    ids = vod.find('h3', class_="dytit")
                    id = ids.find('a')['href']
                    id = id.replace('www.subaibaiys.com', 'www.subaibai.com')
                    pic = vod.find('img')['data-original']
                    if 'http' not in pic:
                        pic = xurl + pic
                    remark = self.extract_middle_text(str(vod), 'class="rating">', '</div>', 0)
                    video = {
                        "vod_id": id,
                        "vod_name": name,
                        "vod_pic": pic,
                        "vod_remarks": remark
                    }
                    videos.append(video)
        except:
            pass
        result = {'list': videos}
        result['page'] = pg
        result['pagecount'] = 9999
        result['limit'] = 90
        result['total'] = 999999
        return result
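
    # Illustrative call (mirrors the test at the bottom of this file):
    #   categoryContent('new-movie', 2, 'filter', {})
    # fetches https://www.subaibai.com/new-movie/page/2 and returns the parsed
    # list plus the fixed paging fields above.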

    def detailContent(self, ids):
        global pm
        did = ids[0]
        result = {}
        videos = []
        if 'http' not in did:
            did = xurl + did
        res1 = requests.get(url=did, headers=headerx)
        res1.encoding = "utf-8"
        res = res1.text
        # The homepage publishes two switch strings, s1 and s2: s1 seems to act
        # as a marker that must appear in the synopsis, otherwise the s2 jump
        # link is served instead of the real play list.
        url = 'https://www.subaibai.com'
        response = requests.get(url)
        response.encoding = 'utf-8'
        code = response.text
        name = self.extract_middle_text(code, "s1='", "'", 0)
        Jumps = self.extract_middle_text(code, "s2='", "'", 0)
        content = self.extract_middle_text(res, '<div class="yp_context">', '</p>', 0)
        content = content.replace('\t', '').replace('<p>', '').replace(' ', '').replace('\n', '')
        if name not in content:
            bofang = Jumps
        else:
            bofang = self.extract_middle_text(res, '<div class="paly_list_btn">', '</div>', 3, 'href="(.*?)">(.*?)</a>')
            bofang = bofang.replace('www.subaibaiys.com', 'www.subaibai.com').replace('立即播放&nbsp;&nbsp;', '')
        videos.append({
            "vod_id": did,
            "vod_actor": '',
            "vod_director": '',
            "vod_content": content,
            "vod_play_from": '专线',
            "vod_play_url": bofang
        })
        result['list'] = videos
        return result
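
    # vod_play_url above follows the shape built by extract_middle_text mode 3:
    # episodes joined with '#', lines joined with '$$$', each entry being
    # '<title>$<episode-number><page-url>', e.g. (made-up values):
    #   '第1集$1https://www.subaibai.com/play/1-1.html#第2集$2https://...'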

    def playerContent(self, flag, id, vipFlags):
        # `id` arrives as '<episode-number><page-url>' (see detailContent), so
        # splitting on "http" separates the number from the link.
        parts = id.split("http")
        xiutan = 1
        if xiutan == 0:
            # Dead branch while xiutan == 1; kept as the sniffer-free variant.
            if len(parts) > 1:
                before_https, after_https = parts[0], 'http' + parts[1]
                if '' in after_https:  # note: always True as written, so the page URL is passed through directly
                    url = after_https
                else:
                    res = requests.get(url=after_https, headers=headerx)
                    res = res.text
                    url = self.extract_middle_text(res, '},"url":"', '"', 0).replace('\\', '')
                    # =======================================
                    # url = base64.b64decode(url).decode('utf-8')
                    # url = unquote(url)
                    # import base64
                    # base64_decoded_bytes = base64.b64decode(url)
                    # base64_decoded_string = base64_decoded_bytes.decode('utf-8')
                    # url = unquote(base64_decoded_string)
                    # url = "https://" + self.extract_middle_text(url, 'https://', '.m3u8', 0) + '.m3u8'
                    # =======================================
                result = {}
                result["parse"] = xiutan
                result["playUrl"] = ''
                result["url"] = url
                result["header"] = headerx
                return result
        # =======================================
        if xiutan == 1:
            if len(parts) > 1:
                before_https, after_https = parts[0], 'http' + parts[1]
                result = {}
                result["parse"] = xiutan
                result["playUrl"] = ''
                result["url"] = after_https
                result["header"] = headerx
                return result
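
    # "parse" here appears to follow the common TVBox convention: 1 asks the
    # player to sniff the real stream from the returned page URL, while 0 would
    # mean `url` is already a direct stream (which is what the commented decode
    # experiments above were working toward). This reading is inferred from the
    # code, not confirmed by the source.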

    def searchContentPage(self, key, quick, page):
        result = {}
        videos = []
        if not page:
            page = '1'
        if page == '1':
            url = f'{xurl}/?s={key}'
        else:
            url = f'{xurl}/page/{str(page)}?s={key}'  # was '{xurl}//page/...' with a doubled slash
        detail = requests.get(url=url, headers=headerx)
        detail.encoding = "utf-8"
        res = detail.text
        doc = BeautifulSoup(res, "lxml")
        soups = doc.find_all('div', class_="bt_img")
        for soup in soups:
            vods = soup.find_all('li')
            for vod in vods:
                name = vod.find('img')['alt']
                ids = vod.find('h3', class_="dytit")
                id = ids.find('a')['href']
                id = id.replace('www.subaibaiys.com', 'www.subaibai.com')
                pic = vod.find('img')['data-original']
                if 'http' not in pic:
                    pic = xurl + pic
                remark = self.extract_middle_text(str(vod), 'class="rating">', '</div>', 0)
                video = {
                    "vod_id": id,
                    "vod_name": name,
                    "vod_pic": pic,
                    "vod_remarks": remark
                }
                videos.append(video)
        result['list'] = videos
        result['page'] = page
        result['pagecount'] = 9999
        result['limit'] = 90
        result['total'] = 999999
        return result

    def searchContent(self, key, quick):
        return self.searchContentPage(key, quick, '1')

    def localProxy(self, params):
        # proxyM3u8 / proxyMedia / proxyTs are expected to come from the base
        # Spider template; they are not defined in this file.
        if params['type'] == "m3u8":
            return self.proxyM3u8(params)
        elif params['type'] == "media":
            return self.proxyMedia(params)
        elif params['type'] == "ts":
            return self.proxyTs(params)
        return None
  302. """
  303. =======================================
  304. 换行 \n 零个或者多个空格 \s+ 数字型 int 文本型 str 分页{} '年代':'2021'
  305. 性能要求高"lxml" 处理不规范的HTML"html5lib" 简单应用"html.parser" 解析XML"xml"
  306. =======================================
  307. /rss/index.xml?wd=爱情&page=1 搜索有验证
  308. /index.php/ajax/suggest?mid=1&wd=爱情&page=1&limit=30 搜索有验证
  309. /index.php/ajax/data?mid=1&tid={cateId}&class={class}&area={area}&page={catePg}&limit=30 分类有验证
  310. /index.php/vod/type/class/{cid}/id/41/page/{str(page)}/year/{NdType}.html 隐藏分类
  311. /{cateId}-{area}-{by}-{class}-{lang}-{letter}---{catePg}---{year}.html
  312. 短剧 穿越 古装 仙侠 女频 恋爱 反转 现代 都市 剧情 玄幻 脑洞 悬疑
  313. =======================================
  314. aaa = self.extract_middle_text(res, 'bbb', 'ccc', 0)
  315. aaa = aaa.replace('aaa', '').replace('bbb', '') 替换多余
  316. 取头 取尾 (不循环) 截取项 (不循环) 长用于直链 二次截取 0号子程序
  317. aaa =self.extract_middle_text(res, 'bbb', 'ccc',1,'html">(.*?)<')
  318. aaa = aaa.replace('aaa', '').replace('bbb', '') 替换多余
  319. 取头 取尾 (不循环) 截取项 (循环) 长用于详情 和2号区别没有$$$ 1号子程序
  320. aaa = self.extract_middle_text(res, 'bbb','ccc', 2,'<span class=".*?" id=".*?">(.*?)</span>')
  321. aaa = aaa.replace('aaa', '').replace('bbb', '') 替换多余
  322. 取头 取尾 (不循环) 截取项 (循环) 只能用于线路数组 里面包含$$$ 2号子程序
  323. aaa = self.extract_middle_text(res, 'bbb', 'ccc', 3,'href="(.*?)" class=".*?">(.*?)</a>')
  324. aaa = aaa.replace('aaa', '').replace('bbb', '') 替换多余
  325. 取头 取尾 (循环) 截取项 (循环) 长用于播放数组 3号子程序
  326. =======================================
  327. """

if __name__ == '__main__':
    spider_instance = Spider()
    # res = spider_instance.homeContent('filter')                                          # categories 🚨
    # res = spider_instance.homeVideoContent()                                             # home page 🚨
    # res = spider_instance.categoryContent('new-movie', 2, 'filter', {})                  # category paging 🚨
    res = spider_instance.detailContent(['https://www.subaibai.com/movie/56949.html'])     # detail page 🚨
    # res = spider_instance.playerContent('1', '0https://www.mjzj.me/74354-1-1.html', 'vipFlags')  # play page 🚨
    # res = spider_instance.searchContentPage('爱情', 'quick', '2')                         # search page 🚨
    print(res)