博主
258
258
258
258
专辑

学习笔记20230912

亮子 2023-09-12 06:53:22 9588 0 0 0

jsoup 与 HtmlUnit 库的使用

1、添加依赖

        <dependency>
            <!-- jsoup HTML parser library @ https://jsoup.org/ -->
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.14.3</version>
        </dependency>

        <!-- HtmlUnit headless browser @ https://htmlunit.sourceforge.io/ -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.58.0</version>
        </dependency>

2、测试功能

package com.bw.pxx.user;


import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;

import java.io.IOException;

/**
 * Sample tests showing how to scrape a web page with jsoup, using HtmlUnit to
 * render pages whose content is produced by JavaScript before jsoup parses it.
 *
 * @author 军哥
 * @version 1.0
 */
public class TestJsoupApp {

    /** Search page used by the samples; the keyword is the percent-encoded UTF-8 form of "手机". */
    private static final String SEARCH_URL =
            "https://category.vip.com/suggest.php?keyword=%E6%89%8B%E6%9C%BA";

    /** Visual separator printed around each section of console output. */
    private static final String SEPARATOR = "===============================================";

    /** Timeout handed to the HtmlUnit web connection, in milliseconds (0 would wait forever). */
    private static final int CONNECTION_TIMEOUT_MILLIS = 5000;

    /** Upper bound on how long to wait for background JavaScript after a page load, in milliseconds. */
    private static final long JAVASCRIPT_WAIT_MILLIS = 10 * 1000;

    /**
     * Fetches the page with jsoup alone (no JavaScript execution) and prints the
     * parsed document.
     *
     * @throws IOException if jsoup fails to fetch the page
     */
    @Test
    public void testJsoup() throws IOException {
        Document doc = Jsoup.connect(SEARCH_URL).get();

        System.out.println(doc);
    }

    /**
     * Renders the page through {@link #getHtml(String)}, parses the result with
     * jsoup and prints the name of every element carrying the
     * {@code c-goods-item} class.
     *
     * @throws IOException declared for signature compatibility; fetch failures
     *                     are reported by {@link #getHtml(String)} returning {@code null}
     */
    @Test
    public void testUrl() throws IOException {
        System.out.println(SEPARATOR);

        String html = getHtml(SEARCH_URL);
        if (html == null) {
            System.out.println("页面抓取错误。。。。。");
            System.out.println(SEPARATOR);
            return;
        }

        Document doc = Jsoup.parse(html);

        // Step 1: locate the product list entries.
        Elements goodsItems = doc.getElementsByClass("c-goods-item");
        goodsItems.forEach(item -> {
            System.out.println(SEPARATOR);
            String productName = item.getElementsByClass("c-goods-item__name").text();
            System.out.println(productName);
            System.out.println(SEPARATOR);
        });
    }

    /**
     * Loads {@code url} in a headless HtmlUnit client emulating Chrome, lets the
     * page's background JavaScript run, and returns the resulting DOM as XML.
     *
     * <p>The client is closed before this method returns, so repeated calls do
     * not accumulate open connections or JavaScript worker threads.
     *
     * @param url address of the page to load
     * @return the rendered page serialized as XML, or {@code null} if the page
     *         could not be loaded or rendered (the stack trace is printed)
     */
    public String getHtml(String url) {
        // WebClient is AutoCloseable; try-with-resources releases it on every path.
        try (WebClient wc = new WebClient(BrowserVersion.CHROME)) {
            // Run the page's JavaScript so dynamically built content is present.
            wc.getOptions().setJavaScriptEnabled(true);

            // CSS is not needed for text extraction.
            wc.getOptions().setCssEnabled(false);

            // Do not fail the whole fetch on script errors or non-2xx status codes.
            wc.getOptions().setThrowExceptionOnScriptError(false);
            wc.getOptions().setThrowExceptionOnFailingStatusCode(false);

            wc.getOptions().setTimeout(CONNECTION_TIMEOUT_MILLIS);

            // ActiveX is not used.
            wc.getOptions().setActiveXNative(false);

            // Use the resynchronizing controller for the page's AJAX calls.
            wc.setAjaxController(new NicelyResynchronizingAjaxController());

            // Leave "Do Not Track" support switched off.
            wc.getOptions().setDoNotTrackEnabled(false);

            HtmlPage page = wc.getPage(url);

            // Must happen AFTER the page is loaded: before that there are no
            // JavaScript jobs to wait for and the call returns immediately.
            wc.waitForBackgroundJavaScript(JAVASCRIPT_WAIT_MILLIS);

            return page.asXml();
        } catch (Exception e) {
            // Best-effort fetch: callers treat null as "page could not be scraped".
            e.printStackTrace();
            return null;
        }
    }

}