Java 抓取网页中的内容【持续更新】

时间:2021-01-07 03:18:04

背景:前几天复习 Java 时看到了 URL 类,当时就想写个小程序试试,但迫于考试一直没有动手。今天把它写了出来,感觉还不错。

内容1. 抓取网页中的URL

知识点:Java 的 URL 类 + 正则表达式

 import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fetches a single web page and prints every http/https URL found in its text,
 * one URL per line, between a start banner and an end banner.
 */
public class URLReader {

    /** Page that is scanned for URLs. */
    private static final String PAGE_URL = "http://www.cnblogs.com/A--Q/";

    /**
     * Absolute http(s) URL: scheme, dotted host name, optional port, optional
     * path/query/fragment. Whitespace is deliberately not part of the path class,
     * so a URL embedded in running text ends at the first space instead of
     * absorbing the words that follow it. Compiled once because it never changes.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("https?://([\\w-]+\\.)+[\\w-]+(:\\d+)?(/[\\w\\-./?%&=#]*)?");

    /**
     * Downloads {@link #PAGE_URL} line by line and prints the URLs it contains.
     *
     * @param args unused
     * @throws Exception if the page cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        System.out.println("开始!");
        URL page = new URL(PAGE_URL);
        // try-with-resources closes the reader (and the stream it wraps) even when
        // reading fails part-way through.
        try (BufferedReader br =
                new BufferedReader(new InputStreamReader(page.openStream(), "utf-8"))) {
            String inputLine;
            while ((inputLine = br.readLine()) != null) {
                printUrls(inputLine);
            }
        }
        System.out.println("程序执行结束!");
    }

    /**
     * Prints each URL occurring in {@code line} on its own line of standard output,
     * in order of appearance. Prints nothing when the line contains no URL.
     *
     * @param line one line of page text
     */
    private static void printUrls(String line) {
        Matcher matcher = URL_PATTERN.matcher(line);
        while (matcher.find()) {
            System.out.println(matcher.group());
        }
    }
}

效果:

Java 抓取网页中的内容【持续更新】

内容2. 抓取网页中的图片

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by {@code <img>} tags of a single web page.
 *
 * <p>The page is fetched, its {@code <img>} tags are located, the absolute
 * http/https {@code src} values are extracted, and each image is saved into the
 * current working directory under the last path segment of its URL.
 */
public class CatchImage {

    /** Page whose images are downloaded. */
    private static final String PAGE_URL = "http://www.cnblogs.com/A--Q/p/5170713.html";

    /** Charset used to decode the page bytes. */
    private static final String ENCODING = "UTF-8";

    /** Size of the copy buffer used for page and image transfers. */
    private static final int BUFFER_SIZE = 8192;

    /** File name used when a URL yields no usable last path segment. */
    private static final String FALLBACK_FILE_NAME = "image";

    /**
     * One {@code <img ...>} tag. {@code [^>]*} cannot run past the tag's closing
     * bracket, so several tags on the same line are matched separately.
     */
    private static final Pattern IMG_TAG =
            Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

    /**
     * The {@code src} attribute of a tag; the value is captured in group 1
     * (double-quoted), group 2 (single-quoted) or group 3 (unquoted). The leading
     * whitespace keeps attributes such as {@code data-src} from matching.
     */
    private static final Pattern SRC_ATTR = Pattern.compile(
            "\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /**
     * Fetches {@link #PAGE_URL} and downloads every image it references.
     *
     * @param args unused
     * @throws Exception if the page itself cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        System.out.println("start");
        CatchImage cm = new CatchImage();
        String html = cm.getHTML(PAGE_URL);
        List<String> imgTags = cm.getImageUrl(html);
        List<String> imgSrc = cm.getImageSrc(imgTags);
        cm.download(imgSrc);
        System.out.println("END");
    }

    /**
     * Fetches a page and returns its body decoded with {@link #ENCODING}.
     *
     * <p>All bytes are collected first and decoded once, so only the bytes that
     * were actually read are used and a multi-byte character split across two
     * reads is decoded correctly.
     *
     * @param url address of the page
     * @return the decoded page content
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    private String getHTML(String url) throws IOException {
        URLConnection connection = new URL(url).openConnection();
        ByteArrayOutputStream page = new ByteArrayOutputStream();
        try (InputStream in = connection.getInputStream()) {
            copy(in, page);
        }
        return page.toString(ENCODING);
    }

    /**
     * Finds every {@code <img>} tag in the given markup.
     *
     * @param html page markup
     * @return the full text of each tag, in document order; empty if there is none
     */
    private List<String> getImageUrl(String html) {
        List<String> tags = new ArrayList<String>();
        Matcher matcher = IMG_TAG.matcher(html);
        while (matcher.find()) {
            tags.add(matcher.group());
        }
        return tags;
    }

    /**
     * Extracts the {@code src} value of each tag, keeping only absolute http/https
     * URLs. Tags without a {@code src}, and relative or non-http values, are skipped.
     *
     * @param listImageUrl {@code <img>} tags as returned by {@link #getImageUrl}
     * @return the image URLs, in the order of the tags they came from
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> sources = new ArrayList<String>();
        for (String tag : listImageUrl) {
            Matcher matcher = SRC_ATTR.matcher(tag);
            if (!matcher.find()) {
                continue;
            }
            String value = matcher.group(1);
            if (value == null) {
                value = matcher.group(2);
            }
            if (value == null) {
                value = matcher.group(3);
            }
            if (isHttpUrl(value)) {
                sources.add(value);
            }
        }
        return sources;
    }

    /**
     * Downloads each URL into the current working directory.
     *
     * <p>A failure is reported for the URL that caused it and the remaining URLs
     * are still attempted.
     *
     * @param listImgSrc absolute image URLs
     */
    private void download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            String imageName = fileNameOf(url);
            System.out.println("开始下载:" + url);
            try {
                save(url, imageName);
                System.out.println(imageName + "下载完成");
            } catch (IOException e) {
                System.out.println("下载失败:" + url + " (" + e + ")");
            }
        }
    }

    /** Streams the content at {@code url} into the file {@code fileName}, closing both ends. */
    private static void save(String url, String fileName) throws IOException {
        try (InputStream in = new URL(url).openStream();
                FileOutputStream out = new FileOutputStream(new File(fileName))) {
            copy(in, out);
        }
    }

    /** Copies {@code in} to {@code out} until end of stream; closes neither. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buf = new byte[BUFFER_SIZE];
        int length;
        while ((length = in.read(buf)) != -1) {
            out.write(buf, 0, length);
        }
    }

    /** Returns whether {@code value} starts with {@code http://} or {@code https://}, ignoring case. */
    private static boolean isHttpUrl(String value) {
        return value.regionMatches(true, 0, "http://", 0, 7)
                || value.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Derives a local file name from a URL: the last path segment with any query
     * string or fragment removed. Characters that are path separators or are not
     * allowed in Windows file names are replaced with {@code _}, because the URL
     * comes from downloaded page content. Falls back to
     * {@link #FALLBACK_FILE_NAME} when nothing usable remains.
     */
    private static String fileNameOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        String name = path.substring(path.lastIndexOf('/') + 1)
                .replaceAll("[\\\\/:*?\"<>|]", "_");
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            return FALLBACK_FILE_NAME;
        }
        return name;
    }
}