1、添加依赖
<dependency>
<!-- jsoup HTML parser library @ https://jsoup.org/ -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
<!-- HtmlUnit headless browser @ https://htmlunit.sourceforge.io/ -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.58.0</version>
</dependency>
2、测试功能
package com.bw.pxx.user;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import java.io.IOException;
/**
 * Demo tests that scrape a vip.com search-result page: once with plain jsoup, and once through
 * HtmlUnit so that the markup handed to jsoup is the DOM after page scripts have run.
 *
 * @author 军哥
 * @version 1.0
 * @date 2023/9/12 9:30
 */
public class TestJsoupApp {

    /** Search-result page used by both tests; the keyword is the URL-encoded form of "手机". */
    private static final String SEARCH_URL =
            "https://category.vip.com/suggest.php?keyword=%E6%89%8B%E6%9C%BA";

    /** Visual separator printed between sections of console output. */
    private static final String SEPARATOR = "===============================================";

    /** HTTP connect/read timeout handed to HtmlUnit, in milliseconds (0 would wait forever). */
    private static final int CONNECT_TIMEOUT_MS = 5000;

    /** Upper bound on how long to wait for background JavaScript after the page has loaded. */
    private static final long JS_WAIT_MS = 10 * 1000L;

    /**
     * Fetches the page with plain jsoup (no JavaScript is executed) and prints the parsed
     * document.
     *
     * @throws IOException if jsoup cannot fetch the page
     */
    @Test
    public void testJsoup() throws IOException {
        Document doc = Jsoup.connect(SEARCH_URL).get();
        System.out.println(doc);
    }

    /**
     * Renders the page through {@link #getHtml(String)}, parses the result with jsoup and prints
     * the text of every {@code c-goods-item__name} element found inside a {@code c-goods-item}
     * element.
     *
     * <p>If the page cannot be fetched, an error message is printed and the test returns without
     * parsing anything.
     *
     * @throws IOException declared for signature compatibility; fetch errors are handled inside
     *     {@link #getHtml(String)}
     */
    @Test
    public void testUrl() throws IOException {
        System.out.println(SEPARATOR);
        String html = getHtml(SEARCH_URL);
        if (html == null) {
            System.out.println("页面抓取错误。。。。。");
            System.out.println(SEPARATOR);
            return;
        }

        Document doc = Jsoup.parse(html);

        // Step 1: locate the list of product entries.
        Elements goodsItem = doc.getElementsByClass("c-goods-item");
        goodsItem.forEach(item -> {
            System.out.println(SEPARATOR);
            String productName = item.getElementsByClass("c-goods-item__name").text();
            System.out.println(productName);
            System.out.println(SEPARATOR);
        });
    }

    /**
     * Loads {@code url} in a headless HtmlUnit browser emulating Chrome, waits for background
     * JavaScript to settle, and returns the resulting DOM serialized as XML.
     *
     * @param url address of the page to load
     * @return the page DOM as an XML string, or {@code null} if the page could not be loaded
     */
    public String getHtml(String url) {
        // WebClient is AutoCloseable: closing it shuts its windows and stops the background
        // JavaScript threads, which would otherwise leak on every call.
        try (WebClient wc = new WebClient(BrowserVersion.CHROME)) {
            // Enable the JavaScript engine (this is also the default).
            wc.getOptions().setJavaScriptEnabled(true);
            // CSS is not needed for scraping, so skip it.
            wc.getOptions().setCssEnabled(false);
            // Do not abort on script errors thrown by the page.
            wc.getOptions().setThrowExceptionOnScriptError(false);
            // Do not abort on non-2xx HTTP status codes.
            wc.getOptions().setThrowExceptionOnFailingStatusCode(false);
            // Timeout in milliseconds; 0 would mean wait indefinitely.
            wc.getOptions().setTimeout(CONNECT_TIMEOUT_MS);
            // Disable native ActiveX support.
            wc.getOptions().setActiveXNative(false);
            // Re-synchronize AJAX calls so their results are part of the loaded page.
            wc.setAjaxController(new NicelyResynchronizingAjaxController());
            // Do not send the "Do Not Track" request header.
            wc.getOptions().setDoNotTrackEnabled(false);

            HtmlPage page = wc.getPage(url);

            // Must run AFTER the page is loaded: before that there are no JavaScript jobs to
            // wait for, so the call would return immediately and have no effect.
            wc.waitForBackgroundJavaScript(JS_WAIT_MS);

            return page.asXml();
        } catch (Exception e) {
            // Boundary catch: any failure (I/O, malformed URL, non-HTML response) is reported
            // and mapped to null, which callers treat as "page could not be fetched".
            System.err.println("Failed to fetch page: " + url);
            e.printStackTrace();
            return null;
        }
    }
}