123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299 |
- package com.github.tvbox.osc.util.js;
- import android.text.TextUtils;
- import com.quickjs.android.JSUtils;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import java.net.MalformedURLException;
- import java.net.URL;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- public class HtmlParser {
- private static String pdfh_html = "";
- private static String pdfa_html = "";
- private static final Pattern p = Pattern.compile("url\\((.*?)\\)", Pattern.MULTILINE | Pattern.DOTALL);
- private static final Pattern NOADD_INDEX = Pattern.compile(":eq|:lt|:gt|:first|:last|^body$|^#"); // 不自动加eq下标索引
- private static final Pattern URLJOIN_ATTR = Pattern.compile("(url|src|href|-original|-src|-play|-url|style)$", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 需要自动urljoin的属性
- private static final Pattern SPECIAL_URL = Pattern.compile("^(ftp|magnet|thunder|ws):", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE); // 过滤特殊链接,不走urlJoin
- private static Document pdfh_doc = null;
- private static Document pdfa_doc = null;
- public static String joinUrl(String parent, String child) {
- if (JSUtils.isEmpty(parent)) {
- return child;
- }
- URL url;
- String q = parent;
- try {
- url = new URL(new URL(parent), child);
- q = url.toExternalForm();
- } catch (MalformedURLException e) {
- e.printStackTrace();
- }
- // if (q.contains("#")) {
- // q = q.replaceAll("^(.+?)#.*?$", "$1");
- // }
- return q;
- }
- public static class Painfo {
- public String nparse_rule;
- public int nparse_index;
- public List<String> excludes;
- }
- private static Painfo getParseInfo(String nparse) {
- /*
- 根据传入的单规则获取 parse规则,索引位置,排除列表 -- 可以用于剔除元素,支持多个,按标签剔除,按id剔除等操作
- :param nparse:
- :return:*/
- Painfo painfo = new Painfo();
- //List<String> excludes = new ArrayList<>(); //定义排除列表默认值为空
- //int nparse_index; //定义位置索引默认值为0
- painfo.nparse_rule = nparse; //定义规则默认值为本身
- if (nparse.contains(":eq")) {
- painfo.nparse_rule = nparse.split(":")[0];
- String nparse_pos = nparse.split(":")[1];
- if (painfo.nparse_rule.contains("--")) {
- String[] rules = painfo.nparse_rule.split("--");
- painfo.excludes = new ArrayList<>(Arrays.asList(rules));
- painfo.excludes.remove(0);
- painfo.nparse_rule = rules[0];
- } else if (nparse_pos.contains("--")) {
- String[] rules = nparse_pos.split("--");
- painfo.excludes = new ArrayList<>(Arrays.asList(rules));
- painfo.excludes.remove(0);
- nparse_pos = rules[0];
- }
- try {
- painfo.nparse_index = Integer.parseInt(nparse_pos.replace("eq(", "").replace(")", ""));
- } catch (Exception e1) {
- painfo.nparse_index = 0;
- }
- } else {
- if (nparse.contains("--")) {
- String[] rules = painfo.nparse_rule.split("--");
- painfo.excludes = new ArrayList<>(Arrays.asList(rules));
- painfo.excludes.remove(0);
- painfo.nparse_rule = rules[0];
- }
- }
- return painfo;
- }
- public static boolean isIndex(String str) {
- if (JSUtils.isEmpty(str)) {
- return false;
- }
- for (String str2 : new String[]{":eq", ":lt", ":gt", ":first", ":last", "body", "#"}) {
- if (str.contains(str2)) {
- if (str2.equals("body") || str2.equals("#")) {
- return str.startsWith(str2);
- }
- return true;
- }
- }
- return false;
- }
- public static boolean isUrl(String str) {
- if (JSUtils.isEmpty(str)) {
- return false;
- }
- for (String str2 : new String[]{"url", "src", "href", "-original", "-play"}) {
- if (str.contains(str2)) {
- return true;
- }
- }
- return false;
- }
- private static String parseHikerToJq(String parse, boolean first) {
- /*
- 海阔解析表达式转原生表达式,自动补eq,如果传了first就最后一个也取eq(0)
- :param parse:
- :param first:
- :return:
- */
- // 不自动加eq下标索引
- if (parse.contains("&&")) {
- String[] parses = parse.split("&&"); //带&&的重新拼接
- List<String> new_parses = new ArrayList<>(); //构造新的解析表达式列表
- for (int i = 0; i < parses.length; i++) {
- String[] pss = parses[i].split(" ");
- String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素
- Matcher m = NOADD_INDEX.matcher(ps);
- //if (!isIndex(ps)) {
- if (!m.find()) {
- if (!first && i >= parses.length - 1) { //不传first且遇到最后一个,不用补eq(0)
- new_parses.add(parses[i]);
- } else {
- new_parses.add(parses[i] + ":eq(0)");
- }
- } else {
- new_parses.add(parses[i]);
- }
- }
- parse = TextUtils.join(" ", new_parses);
- } else {
- String[] pss = parse.split(" ");
- String ps = pss[pss.length - 1]; //如果分割&&后带空格就取最后一个元素
- Matcher m = NOADD_INDEX.matcher(ps);
- //if (!isIndex(ps) && first) {
- if (!m.find() && first) {
- parse = parse + ":eq(0)";
- }
- }
- return parse;
- }
- public static String parseDomForUrl(String html, String rule, String add_url) {
- if (!pdfh_html.equals(html)) {
- pdfh_html = html;
- pdfh_doc = Jsoup.parse(html);
- }
- Document doc = pdfh_doc;
- if (rule.equals("body&&Text") || rule.equals("Text")) {
- return doc.text();
- } else if (rule.equals("body&&Html") || rule.equals("Html")) {
- return doc.html();
- }
- String option = "";
- if (rule.contains("&&")) {
- String[] rs = rule.split("&&");
- option = rs[rs.length - 1];
- List<String> excludes = new ArrayList<>(Arrays.asList(rs));
- excludes.remove(rs.length - 1);
- rule = TextUtils.join("&&", excludes);
- }
- rule = parseHikerToJq(rule, true);
- String[] parses = rule.split(" ");
- Elements ret = new Elements();
- for (String nparse : parses) {
- ret = parseOneRule(doc, nparse, ret);
- if (ret.isEmpty()) {
- return "";
- }
- }
- String result;
- if (JSUtils.isNotEmpty(option)) {
- if (option.equals("Text")) {
- result = ret.text();
- } else if (option.equals("Html")) {
- result = ret.html();
- } else {
- result = ret.attr(option);
- if (option.toLowerCase().contains("style") && result.contains("url(")) {
- Matcher m = p.matcher(result);
- if (m.find()) {
- result = m.group(1);
- }
- // 2023/07/28新增 style取内部链接自动去除首尾单双引号
- result = result.replaceAll("^['|\"](.*)['|\"]$", "$1");
- }
- if (JSUtils.isNotEmpty(result) && JSUtils.isNotEmpty(add_url)) {
- // 需要自动urljoin的属性
- Matcher m = URLJOIN_ATTR.matcher(option);
- Matcher n = SPECIAL_URL.matcher(result);
- //if (isUrl(option)) {
- if (m.find() && !n.find()){
- if (result.contains("http")) {
- result = result.substring(result.indexOf("http"));
- } else {
- result = joinUrl(add_url, result);
- }
- }
- }
- }
- } else {
- result = ret.outerHtml();
- }
- return result;
- }
- public static List<String> parseDomForArray(String html, String rule) {
- if (!pdfa_html.equals(html)) {
- pdfa_html = html;
- pdfa_doc = Jsoup.parse(html);
- }
- Document doc = pdfa_doc;
- rule = parseHikerToJq(rule, false);
- String[] parses = rule.split(" ");
- Elements ret = new Elements();
- for (String pars : parses) {
- ret = parseOneRule(doc, pars, ret);
- if (ret.isEmpty()) {
- return new ArrayList<>();
- }
- }
- List<String> eleHtml = new ArrayList<>();
- for (int i = 0; i < ret.size(); i++) {
- Element element1 = ret.get(i);
- eleHtml.add(element1.outerHtml());
- }
- return eleHtml;
- }
- private static Elements parseOneRule(Document doc, String nparse, Elements ret) {
- Painfo painfo = getParseInfo(nparse);
- if (ret.isEmpty()) {
- ret = doc.select(painfo.nparse_rule);
- } else {
- ret = ret.select(painfo.nparse_rule);
- }
- if (nparse.contains(":eq")) {
- if(painfo.nparse_index < 0){
- ret = ret.eq(ret.size() + painfo.nparse_index);
- } else {
- ret = ret.eq(painfo.nparse_index);
- }
- }
- if (painfo.excludes != null && !ret.isEmpty()) {
- ret = ret.clone(); //克隆一个, 免得直接remove会影响doc的缓存
- for (int i = 0; i < painfo.excludes.size(); i++) {
- ret.select(painfo.excludes.get(i)).remove();
- }
- }
- return ret;
- }
-
- public static List<String> parseDomForList(String html, String p1, String list_text, String list_url, String add_url) {
- if (!pdfa_html.equals(html)) {
- pdfa_html = html;
- pdfa_doc = Jsoup.parse(html);
- }
- Document doc = pdfa_doc;
- p1 = parseHikerToJq(p1, false);
- String[] parses = p1.split(" ");
- Elements ret = new Elements();
- for (String pars : parses) {
- ret = parseOneRule(doc, pars, ret);
- if (ret.isEmpty()) {
- return new ArrayList<>();
- }
- }
- List<String> new_vod_list = new ArrayList<>();
- for(int i = 0; i < ret.size(); i++){
- String it = ret.get(i).outerHtml();
- new_vod_list.add(parseDomForUrl(it, list_text, "").trim() + '$' + parseDomForUrl(it, list_url, add_url));
- }
- return new_vod_list;
- }
- }
|