爬虫:实现网站的全部图片抓取

时间:2024-12-17 07:14:28

--------------2018/12/11 update-------------------

爬取的网站已经关闭,此爬虫已经失效,代码也不会再维护,不建议学习此代码的设计风格,写得实在挺烂(摔!)。

感兴趣的同学浏览下就好。

 

-----2017-5-22再更新-----------------

现在爬虫好像出问题了……好像很多同学都对这个感兴趣,那我就放到GitHub上维护吧:https://github.com/qq1367212627/youmziSpider ,感兴趣的可以去这个地址看。

 

 

---------2016-----------------

 

花了一天左右的时间,用Java写了一个图片爬虫,理论上是可以将所有的图片爬下来的,但是整个站点图片太多了,所以只测试性地爬了9000张左右的图片。好啦,先看效果图。
 




接下来是代码。这个简单的小爬虫是基于httpclient的,所以大家使用代码时,还要记得下载依赖库才可以运行(依赖库下载地址:Apache HttpComponents)。网页解析使用的是正则表达式,整体还是一个比较简单的小爬虫。

以下,代码(代码风格很混乱,请谅解):
主程序入口:Main

 

import ;
import ;
import ;


public class Main {
    public static Set<String> set =null;
    public static void main(String[] args) {
        ("D:\\youmzi");                   //图片保存路径
        set = new HashSet<>();
        ArrayList<String> Page_Link = new ArrayList<>();
        ArrayList<PictMsg> Pict_Link =new ArrayList<>();

        Page_Link.add("/xg/");
        Page_Link.add("/");
         Page_Link.add("/");//                 gif图
        Page_Link.add("/");
        Page_Link.add("/");
        Page_Link.add("/");
        Page_Link.add("/");
        Page_Link.add("/");
        Page_Link.add("/");

        while(Page_Link.size()>0){
            String url=Page_Link.get(0);
            Find_Link.Add_Page_Link(url,Page_Link);
            Find_Link.Add_Pict_Link(url,Pict_Link);
            (Pict_Link);
            Page_Link.remove(0);
        }
    }
}

 

 

        import ;
        import ;
        import  ;

/**
 * Created by lewis on 2016/10/20.
 */
public class Find_Link {



    public static boolean Add_Page_Link(String Context, ArrayList<String> Page_link) {
        String link=null;
        String fa="<a href=(['\"]?)(?!http)((?!js|css)[^\"' \\r\\n])+\\1>下一页";
        Pattern r= (fa);
        Matcher m = ((Context));
        if ((0)) {
            link = ();
            String pa = "<a href='(.+?)'>下一页";
            r = (pa);
            m = (link);
            if ((0)) {
                link = (1);
                if (!("#") && link != null&&!(link)) {
                    (link);
                    Page_link.add("/" + link);                     //获得捕获组1,一共2个组,被匹配的字符算一个组
                }
            }
        }
        return (0)&&(!("#"))&&link!=null;
    }

    public static void Add_Pict_Link(String Context,ArrayList<PictMsg> Pict_link) {
        String pa;
        Pattern r;
        Matcher m ;
        pa="<a href=\"(.+?)\" title=\"(.+?)\" target=\"_blank\">(.+?)<\\/a>";
        r= (pa);
        m = ((Context));
        while(()) {
            String url=(1);
            String head=(2);
            if(!(url)){
                Pict_link.add(new PictMsg(url,head));
                (url);
            }
        }
    }

}

 

/**
 * Created by lewis on 2016/10/21.
 *
 * <p>Simple value holder pairing a URL with the headline (title) it was found under.
 */
public class PictMsg {
    private String url;
    private String headline;

    /**
     * @param url      the link this entry points to
     * @param headline the title associated with the link
     */
    public PictMsg(String url, String headline) {
        this.url = url;
        this.headline = headline;
    }

    /** @return the link this entry points to */
    public String getUrl() {
        return url;
    }

    /** @param url the new link */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the title associated with the link */
    public String getHeadline() {
        return headline;
    }

    /** @param headline the new title */
    public void setHeadline(String headline) {
        this.headline = headline;
    }

    /** @return a human-readable form containing the URL followed by the headline */
    @Override
    public String toString() {
        return "网址:"+url+"标题:"+headline;
    }
}

 

        import ;
        import ;
        import ;
        import ;
        import ;

        import .*;
        import ;
        import ;
        import ;

/**
 * Created by lewis on 2016/10/20.
 */
public class DownLoad {

    public static CloseableHttpClient httpClient = ().build();

    public static String downloadHtml(String url) {

        CloseableHttpResponse response = null;
        BufferedReader br=null;
        HttpGet httpGet = new HttpGet(url);

        try {
            response = (httpGet);
            HttpEntity entity = ();
            InputStreamReader isr = new InputStreamReader((),"gb2312");

            StringBuilder stringBuilder =new StringBuilder();
            br =new BufferedReader(isr);
            String line =null;
            while((line=())!=null){
                (line+'\n');
            }
            return ();
        } catch (IOException e) {
            ();
        }finally {
            if(br!=null){
                try {
                    ();
                } catch (IOException e) {
                    ();
                }
            }
        }
        return null;
    }

    public static void downloadPict(PictMsg pictMsg,int count) {
        String url=();
        CloseableHttpResponse response;
        OutputStream out = null;
        InputStream in=null;
        BufferedReader br=null;
        byte buffer[] = new byte[1024];
        if(url!=null){
            try {
                HttpGet httpGet = new HttpGet(url);
                response = (httpGet);
                HttpEntity entity = ();
                in = ();
                CreateDir("D:\\youmzi"++());
                String suffix;
                if((()-1)=='g') {
                    suffix=".jpg";
                }
                else{
                    suffix=".gif";
                }
                ("正在下载:"+"D:\\youmzi"++()++count+suffix+":");
                out = new FileOutputStream(new File("D:\\youmzi"++()++count+suffix));
                int index=0;
                while((index=(buffer))!=-1){
                    (buffer,0,index);
                }
                ();
            } catch (IOException e) {
                ();
            }finally {
                try {
                    if (br!=null){
                        ();
                    }
                    if(out!=null){
                        ();
                    }
                    if(in!=null){
                        ();
                    }
                } catch (IOException e) {
                    ();
                }
            }
        }

    }

    public static void downloadPict(ArrayList<PictMsg> Pict_link){

        for(int i = 0;i< Pict_link.size();i++){
            // (Pict_link.get(i));
            if(Pict_link.get(i)!=null)
                DownLoad_All_PictSoruce(Pict_link.get(i));
        }
        Pict_link.clear();
    }

    public static void CreateDir(String dir){
        File file = new File(dir);
        if(!()){
            ();
        }
    }

    public static void DownLoad_All_PictSoruce(PictMsg pictMsg){
        ArrayList<String> All_Pict_Soruce = new ArrayList<>();
        String  url =();
        All_Pict_Soruce.add(url);

        while(Find_Link.Add_Page_Link(url,All_Pict_Soruce)){     //通过循环一直找到最后一个页面
            url=All_Pict_Soruce.get(All_Pict_Soruce.size()-1);
        }

        for(int i =0;i<All_Pict_Soruce.size();i++){
            //(Pict_down_Soruce(All_Pict_Soruce.get(i)));
            if(All_Pict_Soruce.get(i)!=null){
                String link=Pict_down_Soruce(All_Pict_Soruce.get(i));
                if(!(link)) {
                    downloadPict(new PictMsg(link, ()), i);
                    ("一共有:"+All_Pict_Soruce.size()+","+"还剩下:"+(All_Pict_Soruce.size()-i));
                    (link);
                }
            }
        }
        All_Pict_Soruce.clear();
    }

    public static String Pict_down_Soruce(String url){
        String context = (url);
        String pa;
        Pattern r;
        Matcher m ;
        pa="<img src='(.+?)' alt=";
        r= (pa);
        m = (context);
        if((0)){
            return (1);
        }
        return null;
    }
}