HtmlParser.java 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. package com.github.tvbox.osc.util.js;
  2. import android.text.TextUtils;
  3. import com.quickjs.android.JSUtils;
  4. import org.jsoup.Jsoup;
  5. import org.jsoup.nodes.Document;
  6. import org.jsoup.nodes.Element;
  7. import org.jsoup.select.Elements;
  8. import java.net.MalformedURLException;
  9. import java.net.URL;
  10. import java.util.ArrayList;
  11. import java.util.Arrays;
  12. import java.util.List;
  13. import java.util.regex.Matcher;
  14. import java.util.regex.Pattern;
  15. public class HtmlParser {
  16. private static String pdfh_html = "";
  17. private static String pdfa_html = "";
  18. private static final Pattern p = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
  19. private static final Pattern NOADD_INDEX = Pattern.compile(":eq|:lt|:gt|:first|:last|^body$|^#"); // 不自动加eq下标索引
  20. private static final Pattern URLJOIN_ATTR = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 需要自动urljoin的属性
  21. private static final Pattern SPECIAL_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 过滤特殊链接,不走urlJoin
  22. private static Document pdfh_doc = null;
  23. private static Document pdfa_doc = null;
  24. public static String joinUrl(String parent, String child) {
  25. if (JSUtils.isEmpty(parent)) {
  26. return child;
  27. }
  28. URL url;
  29. String q = parent;
  30. try {
  31. url = new URL(new URL(parent), child);
  32. q = url.toExternalForm();
  33. } catch (MalformedURLException e) {
  34. e.printStackTrace();
  35. }
  36. // if (q.contains("#")) {
  37. // q = q.replaceAll("^(.+?)#.*?$", "$1");
  38. // }
  39. return q;
  40. }
  41. public static class Painfo {
  42. public String nparse_rule;
  43. public int nparse_index;
  44. public List<String> excludes;
  45. }
  46. private static Painfo getParseInfo(String nparse) {
  47. /*
  48. 根据传入的单规则获取 parse规则,索引位置,排除列表 -- 可以用于剔除元素,支持多个,按标签剔除,按id剔除等操作
  49. :param nparse:
  50. :return:*/
  51. Painfo painfo = new Painfo();
  52. //List<String> excludes = new ArrayList<>(); //定义排除列表默认值为空
  53. //int nparse_index; //定义位置索引默认值为0
  54. painfo.nparse_rule = nparse; //定义规则默认值为本身
  55. if (nparse.contains(":eq")) {
  56. painfo.nparse_rule = nparse.split(":")[0];
  57. String nparse_pos = nparse.split(":")[1];
  58. if (painfo.nparse_rule.contains("--")) {
  59. String[] rules = painfo.nparse_rule.split("--");
  60. painfo.excludes = new ArrayList<>(Arrays.asList(rules));
  61. painfo.excludes.remove(0);
  62. painfo.nparse_rule = rules[0];
  63. } else if (nparse_pos.contains("--")) {
  64. String[] rules = nparse_pos.split("--");
  65. painfo.excludes = new ArrayList<>(Arrays.asList(rules));
  66. painfo.excludes.remove(0);
  67. nparse_pos = rules[0];
  68. }
  69. try {
  70. painfo.nparse_index = Integer.parseInt(nparse_pos.replace("eq(", "").replace(")", ""));
  71. } catch (Exception e1) {
  72. painfo.nparse_index = 0;
  73. }
  74. } else {
  75. if (nparse.contains("--")) {
  76. String[] rules = painfo.nparse_rule.split("--");
  77. painfo.excludes = new ArrayList<>(Arrays.asList(rules));
  78. painfo.excludes.remove(0);
  79. painfo.nparse_rule = rules[0];
  80. }
  81. }
  82. return painfo;
  83. }
  84. public static boolean isIndex(String str) {
  85. if (JSUtils.isEmpty(str)) {
  86. return false;
  87. }
  88. for (String str2 : new String[]{":eq", ":lt", ":gt", ":first", ":last", "body", "#"}) {
  89. if (str.contains(str2)) {
  90. if (str2.equals("body") || str2.equals("#")) {
  91. return str.startsWith(str2);
  92. }
  93. return true;
  94. }
  95. }
  96. return false;
  97. }
  98. public static boolean isUrl(String str) {
  99. if (JSUtils.isEmpty(str)) {
  100. return false;
  101. }
  102. for (String str2 : new String[]{"url", "src", "href", "-original", "-play"}) {
  103. if (str.contains(str2)) {
  104. return true;
  105. }
  106. }
  107. return false;
  108. }
  109. private static String parseHikerToJq(String parse, boolean first) {
  110. /*
  111. 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
  112. :param parse:
  113. :param first:
  114. :return:
  115. */
  116. // 不自动加eq下标索引
  117. if (parse.contains("&&")) {
  118. String[] parses = parse.split("&&"); //带&&的重新拼接
  119. List<String> new_parses = new ArrayList<>(); //构造新的解析表达式列表
  120. for (int i = 0; i < parses.length; i++) {
  121. String[] pss = parses[i].split(" ");
  122. String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素
  123. Matcher m = NOADD_INDEX.matcher(ps);
  124. //if (!isIndex(ps)) {
  125. if (!m.find()) {
  126. if (!first && i >= parses.length - 1) { //不传first且遇到最后一个,不用补eq(0)
  127. new_parses.add(parses[i]);
  128. } else {
  129. new_parses.add(parses[i] + ":eq(0)");
  130. }
  131. } else {
  132. new_parses.add(parses[i]);
  133. }
  134. }
  135. parse = TextUtils.join(" ", new_parses);
  136. } else {
  137. String[] pss = parse.split(" ");
  138. String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素
  139. Matcher m = NOADD_INDEX.matcher(ps);
  140. //if (!isIndex(ps) && first) {
  141. if (!m.find() && first) {
  142. parse = parse + ":eq(0)";
  143. }
  144. }
  145. return parse;
  146. }
  147. public static String parseDomForUrl(String html, String rule, String add_url) {
  148. if (!pdfh_html.equals(html)) {
  149. pdfh_html = html;
  150. pdfh_doc = Jsoup.parse(html);
  151. }
  152. Document doc = pdfh_doc;
  153. if (rule.equals("body&&Text") || rule.equals("Text")) {
  154. return doc.text();
  155. } else if (rule.equals("body&&Html") || rule.equals("Html")) {
  156. return doc.html();
  157. }
  158. String option = "";
  159. if (rule.contains("&&")) {
  160. String[] rs = rule.split("&&");
  161. option = rs[rs.length - 1];
  162. List<String> excludes = new ArrayList<>(Arrays.asList(rs));
  163. excludes.remove(rs.length - 1);
  164. rule = TextUtils.join("&&", excludes);
  165. }
  166. rule = parseHikerToJq(rule, true);
  167. String[] parses = rule.split(" ");
  168. Elements ret = new Elements();
  169. for (String nparse : parses) {
  170. ret = parseOneRule(doc, nparse, ret);
  171. if (ret.isEmpty()) {
  172. return "";
  173. }
  174. }
  175. String result;
  176. if (JSUtils.isNotEmpty(option)) {
  177. if (option.equals("Text")) {
  178. result = ret.text();
  179. } else if (option.equals("Html")) {
  180. result = ret.html();
  181. } else {
  182. result = ret.attr(option);
  183. if (option.toLowerCase().contains("style") && result.contains("url(")) {
  184. Matcher m = p.matcher(result);
  185. if (m.find()) {
  186. result = m.group(1);
  187. }
  188. // 2023/07/28新增 style取内部链接自动去除首尾单双引号
  189. result = result.replaceAll("^['|\"](.*)['|\"]$", "$1");
  190. }
  191. if (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(add_url)) {
  192. // 需要自动urljoin的属性
  193. Matcher m = URLJOIN_ATTR.matcher(option);
  194. Matcher n = SPECIAL_URL.matcher(result);
  195. //if (isUrl(option)) {
  196. if (m.find() && !n.find()){
  197. if (result.contains("http")) {
  198. result = result.substring(result.indexOf("http"));
  199. } else {
  200. result = joinUrl(add_url, result);
  201. }
  202. }
  203. }
  204. }
  205. } else {
  206. result = ret.outerHtml();
  207. }
  208. return result;
  209. }
  210. public static List<String> parseDomForArray(String html, String rule) {
  211. if (!pdfa_html.equals(html)) {
  212. pdfa_html = html;
  213. pdfa_doc = Jsoup.parse(html);
  214. }
  215. Document doc = pdfa_doc;
  216. rule = parseHikerToJq(rule, false);
  217. String[] parses = rule.split(" ");
  218. Elements ret = new Elements();
  219. for (String pars : parses) {
  220. ret = parseOneRule(doc, pars, ret);
  221. if (ret.isEmpty()) {
  222. return new ArrayList<>();
  223. }
  224. }
  225. List<String> eleHtml = new ArrayList<>();
  226. for (int i = 0; i < ret.size(); i++) {
  227. Element element1 = ret.get(i);
  228. eleHtml.add(element1.outerHtml());
  229. }
  230. return eleHtml;
  231. }
  232. private static Elements parseOneRule(Document doc, String nparse, Elements ret) {
  233. Painfo painfo = getParseInfo(nparse);
  234. if (ret.isEmpty()) {
  235. ret = doc.select(painfo.nparse_rule);
  236. } else {
  237. ret = ret.select(painfo.nparse_rule);
  238. }
  239. if (nparse.contains(":eq")) {
  240. if(painfo.nparse_index < 0){
  241. ret = ret.eq(ret.size() + painfo.nparse_index);
  242. } else {
  243. ret = ret.eq(painfo.nparse_index);
  244. }
  245. }
  246. if (painfo.excludes != null && !ret.isEmpty()) {
  247. ret = ret.clone(); //克隆一个, 免得直接remove会影响doc的缓存
  248. for (int i = 0; i < painfo.excludes.size(); i++) {
  249. ret.select(painfo.excludes.get(i)).remove();
  250. }
  251. }
  252. return ret;
  253. }
  254. public static List<String> parseDomForList(String html, String p1, String list_text, String list_url, String add_url) {
  255. if (!pdfa_html.equals(html)) {
  256. pdfa_html = html;
  257. pdfa_doc = Jsoup.parse(html);
  258. }
  259. Document doc = pdfa_doc;
  260. p1 = parseHikerToJq(p1, false);
  261. String[] parses = p1.split(" ");
  262. Elements ret = new Elements();
  263. for (String pars : parses) {
  264. ret = parseOneRule(doc, pars, ret);
  265. if (ret.isEmpty()) {
  266. return new ArrayList<>();
  267. }
  268. }
  269. List<String> new_vod_list = new ArrayList<>();
  270. for(int i = 0; i < ret.size(); i++){
  271. String it = ret.get(i).outerHtml();
  272. new_vod_list.add(parseDomForUrl(it, list_text, "").trim() + '$' + parseDomForUrl(it, list_url, add_url));
  273. }
  274. return new_vod_list;
  275. }
  276. }