123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Probes a link over HTTP and appends usable links to a writer as
 * {@code name,href} lines.
 */
public class URLParser {
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36";
    private static final int TIMEOUT = 5000; // milliseconds; used for both connect and read

    /** Placeholder reported when {@code href} cannot be parsed as a URL. */
    private static final String MALFORMED_URL = "malformed_url";

    /**
     * Checks a single link and records it in {@code f} when it is usable.
     *
     * <ul>
     *   <li>If the host of {@code href} is already in {@code valids}, no request is
     *       made: the entry is written and a timing line ending in {@code " +"} is
     *       printed.</li>
     *   <li>Otherwise a GET request is issued. The entry is written only when the
     *       response code is 200 and at least one byte of body can be read.</li>
     *   <li>A response other than 200, or a 200 with an empty body, is neither
     *       written nor reported as invalid.</li>
     *   <li>If the request fails with an {@link IOException}, or the URL's protocol
     *       is not HTTP(S), the host is returned as invalid.</li>
     * </ul>
     *
     * @param name   label written in front of the link and printed to stdout
     * @param href   link to check
     * @param valids hosts (as returned by {@link URL#getHost()}) that are accepted
     *               without a request; this set is only read, never modified
     * @param f      destination for {@code name,href} lines; not flushed or closed here
     * @return a list with at most one element: the host of {@code href} when the
     *         request failed, or {@code "malformed_url"} when {@code href} could not
     *         be parsed; empty otherwise
     * @throws IOException if writing to {@code f} fails
     */
    public static List<String> getParseHrefResult(String name, String href, Set<String> valids, BufferedWriter f) throws IOException {
        List<String> invalid = new ArrayList<>();
        long startTime = System.nanoTime();

        // Parse once up front so the host is available on every later path.
        final URL url;
        try {
            url = new URL(href);
        } catch (MalformedURLException e) {
            invalid.add(MALFORMED_URL);
            return invalid;
        }
        String netloc = url.getHost();

        if (valids.contains(netloc)) {
            // Known-good host: skip the network round trip entirely.
            printElapsed(startTime, name + " +");
            f.write(entryLine(name, href));
            return invalid;
        }

        boolean hasContent;
        try {
            hasContent = respondsWithContent(url);
        } catch (IOException e) {
            // Only network failures land here; write errors below must not be
            // mistaken for an unreachable host.
            invalid.add(netloc);
            System.out.println("[无效] " + name);
            return invalid;
        }

        if (hasContent) {
            printElapsed(startTime, name);
            f.write(entryLine(name, href));
        }
        return invalid;
    }

    /** Returns true when a GET on {@code url} answers 200 and yields at least one body byte. */
    private static boolean respondsWithContent(URL url) throws IOException {
        URLConnection raw = url.openConnection();
        if (!(raw instanceof HttpURLConnection)) {
            // e.g. ftp: or file: URLs; report as a failed probe instead of a ClassCastException.
            throw new IOException("Unsupported protocol: " + url.getProtocol());
        }
        HttpURLConnection connection = (HttpURLConnection) raw;
        try {
            connection.setRequestMethod("GET");
            connection.setRequestProperty("User-Agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT);
            connection.setReadTimeout(TIMEOUT);
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return false;
            }
            try (InputStream inputStream = connection.getInputStream()) {
                // Reading a small chunk is enough to confirm the body is not empty.
                byte[] buffer = new byte[1024];
                return inputStream.read(buffer) != -1;
            }
        } finally {
            connection.disconnect();
        }
    }

    /** Prints the seconds elapsed since {@code startTime}, a tab, then {@code label}. */
    private static void printElapsed(long startTime, String label) {
        long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTime);
        System.out.println(String.format("%.2f\t%s", elapsedMillis / 1000.0, label));
    }

    /** Builds the {@code name,href} line appended to the output. */
    private static String entryLine(String name, String href) {
        return name + "," + href + "\n";
    }

    public static void main(String[] args) {
        // Example usage. valids holds hosts, not full URLs, because it is
        // compared against URL.getHost().
        Set<String> valids = new HashSet<>(Arrays.asList("肥猫.com"));
        try (BufferedWriter f = new BufferedWriter(new FileWriter("output.txt"))) {
            List<String> invalid = getParseHrefResult("testName", "http://肥猫.com", valids, f);
            System.out.println("Invalid URLs: " + invalid);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
|