使用Java爬虫得到CSDN博客信息并保存(一)

需求：

使用 Java 爬虫获取 blog.csdn.net 首页上出现的所有博客地址，逐个访问这些地址，并把博客信息保存到本地。

思路：

（1）.通过URL类的方法得到首页的HTML源码，使用正则把博客的url都放到一个String数组中

（2）.再逐个访问个人博客的首页得到HTML源码，使用正则提取到需要的信息

（3）.使用IO把得到的信息保存到本地

具体实现：

这里我把程序分成了两个部分：得到博客中需要的信息、把信息保存到文件中。这篇博客先讲怎样得到博客信息。

代码实现：

package cn.test12.WebRobot04;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes a CSDN blog page and extracts the blogger name, avatar URL, blog
 * title, visit statistics, article titles and linked blog home pages.
 *
 * <p>Usage: construct with the blog URL. The constructor downloads the page
 * and stores it, with all line breaks removed, in {@link #webLine}; every
 * getter then runs a regular expression over that single line.
 *
 * <p>Note on converting a set to an array: use
 * {@code set.toArray(new String[0])}. Casting the result of the no-argument
 * {@code set.toArray()} to {@code String[]} fails with a
 * {@code ClassCastException}.
 */
public class GetCsdn {
    /** Browser-like User-Agent sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0";
    /** Upper bound for establishing the connection, so a dead host cannot hang the crawl. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;
    /** Upper bound for each blocking read, so a stalled response cannot hang the crawl. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Link to a blog home page; group 1 is the full URL, group 2 the user name. */
    private static final Pattern BLOG_HOME_LINK =
            Pattern.compile("href=\"(http://blog\\.csdn\\.net/(\\w+))\"");
    /** The title container, up to the first closing div tag. */
    private static final Pattern BLOG_TITLE_DIV =
            Pattern.compile("<div id=\"blog_title\">.+?</div>");
    /** A list item whose text starts with at least one CJK ideograph. */
    private static final Pattern VISIT_ITEM =
            Pattern.compile("<li>[\u4E00-\u9FA5]+.+?</li>");
    /**
     * The pagination link to the last page; group 1 is the page number. The
     * two escaped characters (U+5C3E U+9875) spell the Chinese label for
     * "last page"; they are escaped so the source stays pure ASCII.
     */
    private static final Pattern LAST_PAGE_LINK =
            Pattern.compile("([0-9]+)\">\u5C3E\u9875");
    /** One article title entry of an article-list page. */
    private static final Pattern ARTICLE_TITLE =
            Pattern.compile("<span class=\"link_title\">.+?</a></span>");
    /**
     * The avatar image tag. The single-character path segment after the host
     * is required because an earlier, looser pattern extracted the wrong
     * image when the blogger also had columns.
     */
    private static final Pattern AVATAR_IMG =
            Pattern.compile("<img src=\"http://avatar\\.csdn\\.net/\\w/.+?/>");
    /** Any HTML tag. */
    private static final Pattern HTML_TAG = Pattern.compile("<.+?>");
    /** Carriage return or line feed. */
    private static final Pattern LINE_BREAK = Pattern.compile("[\\r\\n]");

    /** The CSDN URL this instance was created for. */
    public String urlString = null;
    /** The whole page collapsed into a single line so the regular expressions can span it. */
    public String webLine = null;

    /**
     * Downloads the page at {@code urlString} and stores it in {@link #webLine}.
     *
     * @param urlString the URL of the page to download
     * @throws Exception if the URL is malformed, the connection fails or
     *         times out, or the response cannot be read
     */
    public GetCsdn(String urlString) throws Exception {
        this.urlString = urlString;
        URLConnection connection = new URL(this.urlString).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "utf-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() drops the line terminator, so appending the
                // lines back to back yields one single line.
                page.append(line);
            }
        }
        this.webLine = page.toString();
    }

    /**
     * Creates an instance from HTML that has already been obtained; performs
     * no network access. Line breaks are removed from {@code html} so that
     * {@link #webLine} has the same single-line shape the downloading
     * constructor produces.
     *
     * @param urlString the URL the HTML belongs to
     * @param html the page source
     * @throws IllegalArgumentException if {@code html} is {@code null}
     */
    public GetCsdn(String urlString, String html) {
        if (html == null) {
            throw new IllegalArgumentException("html must not be null");
        }
        this.urlString = urlString;
        this.webLine = LINE_BREAK.matcher(html).replaceAll("");
    }

    /**
     * Returns the blogger's user name, taken from the first blog home-page
     * link found in the page. The name used to be derived from the URL passed
     * in, but that stopped working once paginated URLs were needed.
     *
     * @return the user-name segment of the first blog home-page link
     * @throws IllegalStateException if the page contains no such link
     */
    public String getBloger() {
        return firstMatch(BLOG_HOME_LINK, "blog home-page link").group(2);
    }

    /**
     * Returns the blog's display name.
     *
     * @return the text of the blog title container with all tags removed and
     *         surrounding whitespace trimmed
     * @throws IllegalStateException if the page has no blog title container
     */
    public String getWebName() {
        String titleDiv = firstMatch(BLOG_TITLE_DIV, "blog title container").group();
        return stripTags(titleDiv).trim();
    }

    /**
     * Returns the statistics list items of the page (list items whose text
     * starts with CJK characters), with tags removed.
     *
     * @return the distinct items in the order they first appear in the page;
     *         empty if there are none
     */
    public String[] getVisits() {
        // LinkedHashSet removes duplicates while keeping document order.
        Set<String> visits = new LinkedHashSet<>();
        Matcher matcher = VISIT_ITEM.matcher(this.webLine);
        while (matcher.find()) {
            visits.add(stripTags(matcher.group()));
        }
        return visits.toArray(new String[0]);
    }

    /**
     * Downloads every article-list page of the blog and returns all article
     * titles.
     *
     * @return the titles in the order they are encountered, first page first;
     *         duplicates are kept
     * @throws Exception if any article-list page cannot be downloaded
     */
    public String[] getTitles() throws Exception {
        int pageCount = pageCount();
        // A plain list keeps encounter order and duplicates, which is what
        // the caller gets to see.
        List<String> titles = new ArrayList<>();
        for (int page = 1; page <= pageCount; page++) {
            String onePageLine =
                    new GetCsdn(this.urlString + "/article/list/" + page).webLine;
            Matcher matcher = ARTICLE_TITLE.matcher(onePageLine);
            while (matcher.find()) {
                // Drop the tags first, then the whitespace around the title.
                titles.add(stripTags(matcher.group()).trim());
            }
        }
        return titles.toArray(new String[0]);
    }

    /**
     * Returns the URL of the blogger's avatar image.
     *
     * @return the value of the {@code src} attribute of the avatar image tag
     * @throws IllegalStateException if the page has no avatar image tag
     */
    public String getImgUrl() {
        String imgTag = firstMatch(AVATAR_IMG, "avatar image tag").group();
        // The match starts with <img src=" so the text between the first and
        // the second double quote is the URL.
        return imgTag.split("\"")[1];
    }

    /**
     * Returns the blog home-page URLs linked from the page.
     *
     * @return the distinct URLs in the order they first appear in the page;
     *         empty if there are none
     */
    public String[] getBlogHomePage() {
        // LinkedHashSet removes duplicates while keeping document order.
        Set<String> links = new LinkedHashSet<>();
        Matcher matcher = BLOG_HOME_LINK.matcher(this.webLine);
        while (matcher.find()) {
            links.add(matcher.group(1));
        }
        return links.toArray(new String[0]);
    }

    /**
     * Reads the number of article-list pages from the "last page" pagination
     * link. When the page has no such link the blog is treated as having a
     * single page instead of failing.
     */
    private int pageCount() {
        Matcher matcher = LAST_PAGE_LINK.matcher(this.webLine);
        return matcher.find() ? Integer.parseInt(matcher.group(1)) : 1;
    }

    /** Returns a matcher positioned on the first match, or fails with a descriptive message. */
    private Matcher firstMatch(Pattern pattern, String what) {
        Matcher matcher = pattern.matcher(this.webLine);
        if (!matcher.find()) {
            throw new IllegalStateException(
                    "No " + what + " found in page " + this.urlString);
        }
        return matcher;
    }

    /** Removes every HTML tag from {@code html}. */
    private static String stripTags(String html) {
        return HTML_TAG.matcher(html).replaceAll("");
    }

    /**
     * Demo: downloads one blog and prints everything that can be extracted
     * from it.
     *
     * @param args ignored
     * @throws Exception if a page cannot be downloaded
     */
    public static void main(String[] args) throws Exception {
        GetCsdn gc = new GetCsdn("http://blog.csdn.net/wangquannetwork");
        System.out.println(gc.getBloger());
        System.out.println("----------------------------");
        System.out.println(gc.getImgUrl());
        System.out.println("----------------------------");
        System.out.println(gc.getWebName());
        System.out.println("----------------------------");
        System.out.println(Arrays.toString(gc.getTitles()));
        System.out.println("----------------------------");
        System.out.println(Arrays.toString(gc.getVisits()));
        System.out.println("----------------------------");
    }
}

显示结果：

使用Java爬虫得到CSDN博客信息并保存(一)

秒客网

使用Java爬虫得到CSDN博客信息并保存(一)

相关文章