Java 如何写爬虫
Java 爬虫实现方法
使用 Jsoup 库
Jsoup 是一个轻量级的 HTML 解析库,适合简单的爬虫需求。
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class JsoupExample {
    /**
     * Fetches a page with Jsoup and prints the absolute URL of every hyperlink.
     */
    public static void main(String[] args) throws Exception {
        Document document = Jsoup.connect("https://example.com").get();
        // "abs:href" resolves relative links against the page's base URL.
        for (Element anchor : document.select("a[href]")) {
            System.out.println(anchor.attr("abs:href"));
        }
    }
}
使用 HttpClient
Apache HttpClient 更适合处理复杂的 HTTP 请求。
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
public class HttpClientExample {
    /**
     * Downloads a page with Apache HttpClient and prints the raw HTML.
     *
     * <p>Fix: the original never closed the {@code CloseableHttpClient}, leaking
     * its connection pool. Both client and response are now managed by
     * try-with-resources.
     */
    public static void main(String[] args) throws Exception {
        try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
            HttpGet request = new HttpGet("https://example.com");
            try (CloseableHttpResponse response = httpClient.execute(request)) {
                String html = EntityUtils.toString(response.getEntity());
                System.out.println(html);
            }
        }
    }
}
使用 Selenium
对于需要处理 JavaScript 渲染的页面,Selenium 是更好的选择。
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
public class SeleniumExample {
    /**
     * Renders a JavaScript-heavy page with Selenium/ChromeDriver and prints its
     * final page source.
     *
     * <p>Fix: the original only called {@code driver.quit()} on the happy path;
     * if {@code get()} or {@code getPageSource()} threw, the Chrome process was
     * leaked. The quit now runs in a {@code finally} block.
     */
    public static void main(String[] args) {
        System.setProperty("webdriver.chrome.driver", "path/to/chromedriver");
        WebDriver driver = new ChromeDriver();
        try {
            driver.get("https://example.com");
            System.out.println(driver.getPageSource());
        } finally {
            // Always shut down the browser, even on failure.
            driver.quit();
        }
    }
}
数据存储
爬取的数据可以存储到文件或数据库中。
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
public class FileStorageExample {
    /**
     * Writes {@code content} to {@code filename}, overwriting any existing file.
     *
     * <p>Fix: the original used a bare {@code FileWriter}, which encodes with the
     * platform default charset and silently corrupts non-ASCII crawl data (very
     * common when scraping Chinese pages). The output is now always UTF-8.
     *
     * @param content  text to persist
     * @param filename destination path
     * @throws IOException if the file cannot be created or written
     */
    public static void saveToFile(String content, String filename) throws IOException {
        try (Writer writer =
                new OutputStreamWriter(new FileOutputStream(filename), StandardCharsets.UTF_8)) {
            writer.write(content);
        }
    }
}
处理反爬机制
应对常见的反爬措施需要额外处理。
// Set a browser-like User-Agent header so the request is not rejected as a bot.
HttpGet request = new HttpGet("https://example.com");
request.setHeader("User-Agent", "Mozilla/5.0");
// Route the request through an HTTP proxy to avoid per-IP rate limits / bans.
HttpHost proxy = new HttpHost("proxy.example.com", 8080);
RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
request.setConfig(config);
多线程爬取
提高爬取效率可以使用多线程。
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class MultiThreadCrawler {
    /**
     * Crawls ten pages concurrently using a fixed pool of five worker threads.
     */
    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(5);
        int page = 0;
        while (page < 10) {
            pool.execute(new CrawlTask("https://example.com/page" + page));
            page++;
        }
        // Stop accepting new tasks; already-queued tasks still run to completion.
        pool.shutdown();
    }
}
/** A single crawl job: fetches one URL when run on a worker thread. */
class CrawlTask implements Runnable {
    // Target URL this task is responsible for; immutable once constructed.
    private final String url;

    public CrawlTask(String url) {
        this.url = url;
    }

    @Override
    public void run() {
        // TODO: fetch and process this.url.
    }
}
以上方法涵盖了从简单到复杂的 Java 爬虫实现方案,可根据具体需求选择合适的技术组合。注意遵守目标网站的 robots.txt 协议和相关法律法规。







