--------------2018/12/11 update-------------------
爬取的网站已经关闭,此爬虫已经失效,代码也不会再维护,不建议学习此代码的设计风格。写得实在挺烂(摔!)
感兴趣的同学浏览下就好。
-----2017-5-22再更新-----------------
现在爬虫好像出问题了……好像很多同学都对这个感兴趣,那我就放到GitHub上维护吧:https://github.com/qq1367212627/youmziSpider ,感兴趣的可以去这个地址看。
---------2016-----------------
花了一天左右的时间,用Java写了一个图片爬虫,理论上是可以将所有的图片爬下来的,但是整个站点图片太多了,所以只测试性地爬了9000张左右的图片。好啦,先看效果图。
接下来是代码。这个简单的小爬虫是基于httpclient的,所以大家使用代码时,还要记得先下载依赖库才可以运行(依赖库下载地址:Apache HttpComponents)。网页解析使用的是正则表达式,还是比较简单的小爬虫。
以下,代码(代码风格很混乱,请谅解):
主程序入口:Main
import ;
import ;
import ;
/**
 * Entry point of the crawler. Seeds a work list of listing-page links and
 * processes it front to back until the list is empty.
 *
 * NOTE(review): in this listing several statements start directly with a
 * parenthesised argument list (for example the first statement of main), so
 * the receiver and method name appear to have been lost when the code was
 * pasted. The seed links also look truncated to "/". Restore both from the
 * original project before trying to compile.
 */
public class Main {
// Shared set of strings; assigned a HashSet at the start of main. Presumably
// used by the other classes to remember links already handled -- TODO confirm.
public static Set<String> set =null;
public static void main(String[] args) {
("D:\\youmzi"); // image save path
set = new HashSet<>();
// Work list of listing pages still to be visited.
ArrayList<String> Page_Link = new ArrayList<>();
// Picture entries collected from the page currently being processed.
ArrayList<PictMsg> Pict_Link =new ArrayList<>();
Page_Link.add("/xg/");
Page_Link.add("/");
Page_Link.add("/");// GIF images
Page_Link.add("/");
Page_Link.add("/");
Page_Link.add("/");
Page_Link.add("/");
Page_Link.add("/");
Page_Link.add("/");
// Process the head of the list, then drop it. Add_Page_Link may append
// further links to Page_Link while the loop is running.
while(Page_Link.size()>0){
String url=Page_Link.get(0);
Find_Link.Add_Page_Link(url,Page_Link);
Find_Link.Add_Pict_Link(url,Pict_Link);
// NOTE(review): call target missing; presumably the list-based download
// method of DownLoad -- confirm against the original project.
(Pict_Link);
Page_Link.remove(0);
}
}
}
import ;
import ;
import ;
/**
 * Link extraction helpers that use regular expressions to find the
 * "next page" anchor and the picture anchors of a page.
 *
 * NOTE(review): the Pattern/Matcher calls and several condition expressions
 * in this listing have lost their receiver and method name (only the
 * argument lists remain). The comments below describe what is visible and
 * mark everything else as an assumption.
 *
 * Created by lewis on 2016/10/20.
 */
public class Find_Link {
/**
 * Looks for a relative "下一页" (next page) anchor and, when one is accepted,
 * appends "/" + link to Page_link.
 *
 * @param Context   input handed to the matcher; the double parentheses
 *                  suggest it is passed through another call first
 *                  (presumably a page download) -- TODO confirm
 * @param Page_link list that receives the newly found link
 * @return the result of the final boolean expression; presumably true when
 *         a usable next-page link was found -- TODO confirm
 */
public static boolean Add_Page_Link(String Context, ArrayList<String> Page_link) {
String link=null;
// Anchor whose href is optionally quoted, does not start with "http", and
// is immediately followed by the text "下一页".
String fa="<a href=(['\"]?)(?!http)((?!js|css)[^\"' \\r\\n])+\\1>下一页";
Pattern r= (fa);
Matcher m = ((Context));
if ((0)) {
link = ();
// Second pass: pull the href value out of a single-quoted anchor.
String pa = "<a href='(.+?)'>下一页";
r = (pa);
m = (link);
if ((0)) {
link = (1);
// NOTE(review): "link != null" is tested after an expression that
// presumably already dereferences link -- confirm the intended order.
if (!("#") && link != null&&!(link)) {
(link);
Page_link.add("/" + link); // take capture group 1; there are 2 groups in total, the whole match counts as one group
}
}
}
return (0)&&(!("#"))&&link!=null;
}
/**
 * Scans for anchors of the form
 * &lt;a href="..." title="..." target="_blank"&gt;...&lt;/a&gt; and appends a
 * PictMsg(url, head) to Pict_link for each match that passes the check.
 *
 * @param Context   input handed to the matcher (see the note on
 *                  Add_Page_Link; presumably a page address) -- TODO confirm
 * @param Pict_link list that receives the picture entries
 */
public static void Add_Pict_Link(String Context,ArrayList<PictMsg> Pict_link) {
String pa;
Pattern r;
Matcher m ;
pa="<a href=\"(.+?)\" title=\"(.+?)\" target=\"_blank\">(.+?)<\\/a>";
r= (pa);
m = ((Context));
while(()) {
// Presumably capture group 1 (href) and group 2 (title).
String url=(1);
String head=(2);
// NOTE(review): condition and the call after add() lost their receiver;
// presumably a "not seen yet" check and a "mark as seen" -- confirm.
if(!(url)){
Pict_link.add(new PictMsg(url,head));
(url);
}
}
}
}
/**
 * Simple value holder pairing a picture (or picture page) URL with its
 * headline.
 *
 * Created by lewis on 2016/10/21.
 */
public class PictMsg {
    private String url;
    private String headline;

    /**
     * Creates an entry.
     *
     * @param url      address associated with the picture
     * @param headline title text associated with the picture
     */
    public PictMsg(String url, String headline) {
        // The assignment targets were missing in the listing; without
        // "this." the fields would never be set.
        this.url = url;
        this.headline = headline;
    }

    /** @return the stored URL */
    public String getUrl() {
        return url;
    }

    /** @param url new URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored headline */
    public String getHeadline() {
        return headline;
    }

    /** @param headline new headline to store */
    public void setHeadline(String headline) {
        this.headline = headline;
    }

    /** @return the URL and headline, each prefixed with its label */
    @Override
    public String toString() {
        return "网址:"+url+"标题:"+headline;
    }
}
import ;
import ;
import ;
import ;
import ;
import .*;
import ;
import ;
import ;
/**
 * Download helpers: fetch a page as text, save a picture to disk, and walk
 * the pages of one picture set.
 *
 * NOTE(review): many calls in this listing have lost their receiver and
 * method name (only the argument lists remain), and the path expressions
 * contain "++", which suggests an operand between the two plus signs was
 * dropped. The comments below describe what is visible and mark the rest as
 * assumptions to confirm against the original project.
 *
 * Created by lewis on 2016/10/20.
 */
public class DownLoad {
// Single HTTP client instance held in a static field.
public static CloseableHttpClient httpClient = ().build();
/**
 * Fetches the given address and returns the body as text.
 *
 * The stream is decoded with the "gb2312" charset and read line by line;
 * each line is followed by '\n' when appended.
 *
 * @param url address to request
 * @return the accumulated text, or null when an IOException occurs
 */
public static String downloadHtml(String url) {
CloseableHttpResponse response = null;
BufferedReader br=null;
HttpGet httpGet = new HttpGet(url);
try {
response = (httpGet);
HttpEntity entity = ();
InputStreamReader isr = new InputStreamReader((),"gb2312");
StringBuilder stringBuilder =new StringBuilder();
br =new BufferedReader(isr);
String line =null;
while((line=())!=null){
(line+'\n');
}
return ();
} catch (IOException e) {
();
}finally {
// Only the reader is handled here; no statement touching "response" is
// visible in this block -- NOTE(review): confirm whether it is closed.
if(br!=null){
try {
();
} catch (IOException e) {
();
}
}
}
return null;
}
/**
 * Downloads one picture and writes it to a file under "D:\\youmzi".
 *
 * @param pictMsg entry describing the picture; its URL is presumably read
 *                into the local "url" -- TODO confirm
 * @param count   number used as part of the output file name
 */
public static void downloadPict(PictMsg pictMsg,int count) {
String url=();
CloseableHttpResponse response;
OutputStream out = null;
InputStream in=null;
// Declared and checked in the finally block, but never assigned in this method.
BufferedReader br=null;
byte buffer[] = new byte[1024];
if(url!=null){
try {
HttpGet httpGet = new HttpGet(url);
response = (httpGet);
HttpEntity entity = ();
in = ();
CreateDir("D:\\youmzi"++());
String suffix;
// Presumably tests whether the last character of the URL is 'g'; anything
// else is saved as ".gif" -- NOTE(review): confirm.
if((()-1)=='g') {
suffix=".jpg";
}
else{
suffix=".gif";
}
("正在下载:"+"D:\\youmzi"++()++count+suffix+":");
out = new FileOutputStream(new File("D:\\youmzi"++()++count+suffix));
// Copy loop: "index" holds the value compared against -1 and is then used
// as the length argument of the following call.
int index=0;
while((index=(buffer))!=-1){
(buffer,0,index);
}
();
} catch (IOException e) {
();
}finally {
try {
if (br!=null){
();
}
if(out!=null){
();
}
if(in!=null){
();
}
} catch (IOException e) {
();
}
}
}
}
/**
 * Processes every non-null entry of the list with DownLoad_All_PictSoruce
 * and then empties the list.
 *
 * @param Pict_link entries to process; cleared before returning
 */
public static void downloadPict(ArrayList<PictMsg> Pict_link){
for(int i = 0;i< Pict_link.size();i++){
// (Pict_link.get(i));
if(Pict_link.get(i)!=null)
DownLoad_All_PictSoruce(Pict_link.get(i));
}
Pict_link.clear();
}
/**
 * Wraps the given path in a File and, when the (garbled) check is false,
 * performs a follow-up call -- presumably "create the directory if it does
 * not exist"; TODO confirm.
 *
 * @param dir directory path
 */
public static void CreateDir(String dir){
File file = new File(dir);
if(!()){
();
}
}
/**
 * Collects the pages belonging to one picture entry and downloads the
 * picture found on each of them.
 *
 * @param pictMsg entry whose address is the first page of the set
 */
public static void DownLoad_All_PictSoruce(PictMsg pictMsg){
ArrayList<String> All_Pict_Soruce = new ArrayList<>();
String url =();
All_Pict_Soruce.add(url);
while(Find_Link.Add_Page_Link(url,All_Pict_Soruce)){ // keep looping until the last page has been found
url=All_Pict_Soruce.get(All_Pict_Soruce.size()-1);
}
for(int i =0;i<All_Pict_Soruce.size();i++){
//(Pict_down_Soruce(All_Pict_Soruce.get(i)));
if(All_Pict_Soruce.get(i)!=null){
// Pict_down_Soruce returns null when no image tag matches.
String link=Pict_down_Soruce(All_Pict_Soruce.get(i));
// NOTE(review): condition lost its receiver; presumably a "not seen yet"
// check on link -- confirm, including how a null link is handled.
if(!(link)) {
downloadPict(new PictMsg(link, ()), i);
("一共有:"+All_Pict_Soruce.size()+","+"还剩下:"+(All_Pict_Soruce.size()-i));
(link);
}
}
}
All_Pict_Soruce.clear();
}
/**
 * Extracts the first image address from a page using the pattern
 * &lt;img src='...' alt=.
 *
 * @param url page address; presumably downloaded into "context" -- TODO confirm
 * @return the value produced for capture group 1, or null when nothing matches
 */
public static String Pict_down_Soruce(String url){
String context = (url);
String pa;
Pattern r;
Matcher m ;
pa="<img src='(.+?)' alt=";
r= (pa);
m = (context);
if((0)){
return (1);
}
return null;
}
}