常用的抓取数据的姿势
- 姿势1:HTMLParser
- 姿势2:HttpClient
- 姿势3:Jsoup
Jsoup
1、添加依赖
<!-- Maven coordinates for the jsoup HTML parsing library, pinned to version 1.13.1 -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
2、代码示例:抓取知网(CNKI)文献详情页的标题、摘要和关键字
/**
 * Downloads a CNKI article detail page with jsoup and prints its title,
 * abstract and keywords to standard output.
 *
 * <p>The lookups assume the page contains an element with class {@code wx-tit}
 * (whose first child element holds the title), an element with id
 * {@code ChDivSummary} (the abstract) and an element with class
 * {@code keywords} (whose child elements hold the individual keywords).
 * If the page layout differs, the corresponding lookup fails with an
 * {@link IndexOutOfBoundsException} or {@link NullPointerException}.
 *
 * @throws Exception if the URL is malformed or the page cannot be fetched
 *                   and parsed within the 30-second timeout
 */
@Test
public void test5() throws Exception {
    String url = "https://kns.cnki.net/kcms/detail/detail.aspx?dbcode=CJFD&dbname=CJFDAUTODAY&filename=SAHG202109022&uniplatform=NZKPT";

    // Fetch and parse the page with a 30-second timeout. The resulting
    // Document can be queried in a DOM-like (front-end style) manner.
    Document doc = Jsoup.parse(new URL(url), 30000);

    // Title: text of the first child element of the first ".wx-tit" element.
    Element titleContainer = doc.getElementsByClass("wx-tit").get(0);
    String title = titleContainer.child(0).text();
    System.out.println("标题:" + title);

    // Abstract: full text of the element with id "ChDivSummary".
    Element chDivSummary = doc.getElementById("ChDivSummary");
    String digest = chDivSummary.text();
    System.out.println("摘要:" + digest);

    // Keywords: concatenate the text of every child ELEMENT of the first
    // ".keywords" element. Iterate children() rather than counting with
    // childNodeSize(): the latter also counts text nodes (e.g. whitespace
    // between tags), so using it as the bound for child(i) can run past the
    // last element and throw IndexOutOfBoundsException.
    Element keywords = doc.getElementsByClass("keywords").get(0);
    StringBuilder keyword = new StringBuilder();
    for (Element keywordElement : keywords.children()) {
        keyword.append(keywordElement.text());
    }
    System.out.println("关键字:" + keyword);
}