URLParser.java 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. import java.io.BufferedWriter;
  2. import java.io.FileWriter;
  3. import java.io.IOException;
  4. import java.io.InputStream;
  5. import java.net.HttpURLConnection;
  6. import java.net.MalformedURLException;
  7. import java.net.URL;
  8. import java.util.*;
  9. import java.util.concurrent.TimeUnit;
  /**
   * Probes a link over HTTP and records the ones that respond.
   *
   * <p>A link is recorded by printing a timing line to standard output and writing
   * {@code name,href} followed by a newline to the supplied writer.
   */
  public class URLParser {
      private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36";

      /** Connect and read timeout, in milliseconds (5 seconds). */
      private static final int TIMEOUT = 5000;

      /**
       * Checks a single link and writes it to {@code f} when it is usable.
       *
       * <ul>
       *   <li>If {@code href} cannot be parsed as a URL, nothing is written and the returned
       *       list contains the marker {@code "malformed_url"}.</li>
       *   <li>If the URL's host is already in {@code valids}, no request is made; the link is
       *       written and its timing line is suffixed with {@code " +"}.</li>
       *   <li>Otherwise a GET request is issued. The link is written only when the response
       *       code is 200 and at least one byte of the body can be read. A non-200 response
       *       or an empty body is neither written nor reported as invalid.</li>
       *   <li>If the request fails with an {@link IOException}, or the URL does not open an
       *       HTTP connection, the host is returned as invalid.</li>
       * </ul>
       *
       * @param name   label printed to standard output and written before the link
       * @param href   the link to check
       * @param valids hosts (as returned by {@link URL#getHost()}) that are accepted without a request
       * @param f      writer that receives {@code name,href} lines; it is not flushed or closed here
       * @return a list holding the host of a failed link, {@code "malformed_url"}, or nothing
       * @throws IOException if writing to {@code f} fails
       */
      public static List<String> getParseHrefResult(String name, String href, Set<String> valids, BufferedWriter f) throws IOException {
          List<String> invalid = new ArrayList<>();
          long startTime = System.nanoTime();

          URL url;
          try {
              url = new URL(href);
          } catch (MalformedURLException e) {
              invalid.add("malformed_url");
              return invalid;
          }
          String netloc = url.getHost();

          if (valids.contains(netloc)) {
              writeEntry(name, href, " +", startTime, f);
              return invalid;
          }

          // Only network failures are caught here. Writing happens outside this try so that
          // a broken output writer surfaces to the caller instead of flagging the host.
          boolean responded;
          try {
              responded = respondsWithContent(url);
          } catch (IOException e) {
              invalid.add(netloc);
              System.out.println("[无效] " + name);
              return invalid;
          }

          if (responded) {
              writeEntry(name, href, "", startTime, f);
          }
          return invalid;
      }

      /**
       * Issues a GET request and reports whether it answered 200 with a non-empty body.
       *
       * @throws IOException if the request fails or the URL does not open an HTTP connection
       */
      private static boolean respondsWithContent(URL url) throws IOException {
          java.net.URLConnection opened = url.openConnection();
          if (!(opened instanceof HttpURLConnection)) {
              // e.g. ftp: or file: URLs; an unchecked cast here would throw ClassCastException.
              throw new IOException("Not an HTTP(S) URL: " + url);
          }
          HttpURLConnection connection = (HttpURLConnection) opened;
          try {
              connection.setRequestMethod("GET");
              connection.setRequestProperty("User-Agent", USER_AGENT);
              connection.setConnectTimeout(TIMEOUT);
              connection.setReadTimeout(TIMEOUT);
              if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                  return false;
              }
              try (InputStream inputStream = connection.getInputStream()) {
                  // Reading a small chunk is enough to confirm the server actually sends data.
                  byte[] buffer = new byte[1024];
                  return inputStream.read(buffer) != -1;
              }
          } finally {
              connection.disconnect();
          }
      }

      /** Prints the elapsed time since {@code startTime} with the name, then writes {@code name,href}. */
      private static void writeEntry(String name, String href, String suffix, long startTime, BufferedWriter f) throws IOException {
          long elapsedTime = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
          System.out.println(String.format("%.2f\t%s", elapsedTime / 1000.0, name) + suffix);
          f.write(name + "," + href + "\n");
      }

      /** Example usage: checks one link and writes the result to {@code output.txt}. */
      public static void main(String[] args) {
          // Entries must be bare hosts (no scheme) because they are matched against URL.getHost().
          Set<String> valids = new HashSet<>(Arrays.asList("肥猫.com"));
          try (BufferedWriter f = new BufferedWriter(new FileWriter("output.txt"))) {
              List<String> invalid = getParseHrefResult("testName", "http://肥猫.com", valids, f);
              System.out.println("Invalid URLs: " + invalid);
          } catch (IOException e) {
              e.printStackTrace();
          }
      }
  }