Sample Java code for a crawler that scrapes images from a website

Posted: 2022-09-22 20:57:11

Step 1: implement LinkQueue, which stores URLs and filters out ones we have already seen.

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class LinkQueue {
  // Set of URLs that have already been visited
  private static Set<String> visitedUrl = Collections.synchronizedSet(new HashSet<String>());
  // List of URLs not yet visited
  private static List<String> unVisitedUrl = Collections.synchronizedList(new ArrayList<String>());

  // Dequeue an unvisited URL and mark it as visited
  public static String unVisitedUrlDeQueue() {
    if (unVisitedUrl.size() > 0) {
      String url = unVisitedUrl.remove(0);
      visitedUrl.add(url);
      return url;
    }
    return null;
  }

  // Validate a new URL before adding it, so each URL is enqueued only once
  public static void addUnvisitedUrl(String url) {
    if (url != null && !url.trim().equals("") && !visitedUrl.contains(url)
        && !unVisitedUrl.contains(url))
      unVisitedUrl.add(url);
  }

  // Check whether the unvisited-URL queue is empty
  public static boolean unVisitedUrlsEmpty() {
    return unVisitedUrl.isEmpty();
  }
}
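A note on the design: the synchronized wrappers make each individual call thread-safe, but unVisitedUrlDeQueue() is a check-then-act sequence, so two threads could race between the size check and the remove. For a single-threaded crawler like this one that is fine. If you want to crawl from several threads, here is a minimal alternative sketch (my own, not from the original article, assuming Java 8+):

import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;

public class ConcurrentLinkQueue {
  // Concurrent set and queue, so dequeuing is atomic with no explicit locking
  private static final Set<String> visited = ConcurrentHashMap.newKeySet();
  private static final Queue<String> unvisited = new ConcurrentLinkedQueue<>();

  public static String dequeue() {
    String url = unvisited.poll(); // atomic removal; returns null when empty
    if (url != null)
      visited.add(url);
    return url;
  }

  public static void add(String url) {
    if (url != null && !url.trim().isEmpty()
        && !visited.contains(url) && !unvisited.contains(url)) {
      unvisited.add(url);
    }
  }
}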

Step 2: collect the links on each page and filter them to produce new URLs to crawl.

import java.util.HashSet;
import java.util.Set;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Filters HTTP URLs, keeping only those that match the given rule.
 *
 * @author Administrator
 */
public class ParserHttpUrl {
  // Collect the links on one page; filter decides which links to keep
  public static Set<String> extractLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
      Parser parser = new Parser(url);
      // Filter for <frame> tags, used to pull out the link in the src attribute
      NodeFilter frameFilter = new NodeFilter() {
        public boolean accept(Node node) {
          if (node.getText().startsWith("frame src=")) {
            return true;
          } else {
            return false;
          }
        }
      };
      // OrFilter that matches both <a> tags and <frame> tags
      OrFilter linkFilter = new OrFilter(new NodeClassFilter(
          LinkTag.class), frameFilter);
      // Get all tags that pass the filter
      NodeList list = parser.extractAllNodesThatMatch(linkFilter);
      for (int i = 0; i < list.size(); i++) {
        Node tag = list.elementAt(i);
        if (tag instanceof LinkTag)// <a> tag
        {
          LinkTag link = (LinkTag) tag;
          String linkUrl = link.getLink();// URL
          if (filter.accept(linkUrl))
            links.add(linkUrl);
        } else// <frame> tag
        {
          // Extract the link in the frame's src attribute, e.g. <frame src="test.html"/>
          String frame = tag.getText();
          int start = frame.indexOf("src=");
          frame = frame.substring(start);
          int end = frame.indexOf(" ");
          if (end == -1)
            end = frame.indexOf(">");
          if (end == -1) // getText() strips the angle brackets, so fall back to the end
            end = frame.length();
          String frameUrl = frame.substring(5, end - 1);
          if (filter.accept(frameUrl))
            links.add(frameUrl);
        }
      }
    } catch (ParserException e) {
      e.printStackTrace();
    }
    return links;
  }
}
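This class depends on the third-party HTMLParser library (the org.htmlparser packages in the imports), so its jar must be on the classpath. As a quick usage sketch: the seed URL below is just an example, and LinkFilter is the interface defined further down in this article, written here as a lambda (which assumes Java 8+):

import java.util.Set;

public class ExtractDemo {
  public static void main(String[] args) {
    // Keep only links that stay on douban.com
    Set<String> links = ParserHttpUrl.extractLinks(
        "http://www.douban.com/group/haixiuzu/discussion?start=0",
        url -> url.contains("douban.com"));
    for (String link : links) {
      System.out.println(link);
    }
  }
}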

Step 3: implement the image-download functionality.

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/***
 * Fetches images from the web in Java.
 *
 * @author swinglife
 */
public class DownloadPic {
  // Character encoding used to decode pages
  private static final String ENCODING = "UTF-8";
  // Regex that matches <img> tags
  private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
  // Regex that extracts the src path from an <img> tag
  private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";

  public static void downloadPic(String url) {
    // Fetch the HTML content
    String html = null;
    try {
      html = DownloadPic.getHtml(url);
    } catch (Exception e) {
      e.printStackTrace();
    }
    if (null != html && !"".equals(html)) {
      // Extract the <img> tags
      List<String> imgUrl = DownloadPic.getImageUrl(html);
      // Extract the src addresses from the tags
      List<String> imgSrc = DownloadPic.getImageSrc(imgUrl);
      // Download the images
      DownloadPic.download(imgSrc);
    }
  }

  /***
   * Fetch the HTML content of a page.
   */
  private static String getHtml(String url) throws Exception {
    URL uri = new URL(url);
    URLConnection connection = uri.openConnection();
    InputStream in = connection.getInputStream();
    byte[] buf = new byte[1024];
    int length = 0;
    // Buffer all bytes first and decode once at the end, so multi-byte
    // characters are not split across read() boundaries
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    while ((length = in.read(buf, 0, buf.length)) > 0) {
      out.write(buf, 0, length);
    }
    in.close();
    return out.toString(ENCODING);
  }

  /***
   * Collect all <img> tags in the HTML.
   */
  private static List<String> getImageUrl(String html) {
    Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html);
    List<String> listImgUrl = new ArrayList<String>();
    while (matcher.find()) {
      listImgUrl.add(matcher.group());
    }
    return listImgUrl;
  }

  /***
   * Extract the src addresses from the <img> tags.
   */
  private static List<String> getImageSrc(List<String> listImageUrl) {
    List<String> listImgSrc = new ArrayList<String>();
    for (String image : listImageUrl) {
      Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
      while (matcher.find()) {
        // Drop the trailing delimiter (quote, '>' or whitespace)
        listImgSrc.add(matcher.group().substring(0,
            matcher.group().length() - 1));
      }
    }
    return listImgSrc;
  }

  /***
   * Download each image to the current working directory.
   */
  private static void download(List<String> listImgSrc) {
    for (String url : listImgSrc) {
      try {
        // Use the last path segment as the file name
        String imageName = url.substring(url.lastIndexOf("/") + 1,
            url.length());
        URL uri = new URL(url);
        InputStream in = uri.openStream();
        FileOutputStream fo = new FileOutputStream(new File(imageName));
        byte[] buf = new byte[1024];
        int length = 0;
        while ((length = in.read(buf, 0, buf.length)) != -1) {
          fo.write(buf, 0, length);
        }
        in.close();
        fo.close();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
  }
}
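To sanity-check the two regular expressions, here is a small standalone sketch; the sample HTML line is made up for the test:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {
  public static void main(String[] args) {
    String html = "<p><img width=\"300\" src=\"http://img3.douban.com/pics/a.jpg\"></p>";
    // First pass: pull out the whole <img> tag
    Matcher tag = Pattern.compile("<img.*src=(.*?)[^>]*?>").matcher(html);
    while (tag.find()) {
      // Second pass: pull the http src value out of the tag
      Matcher src = Pattern.compile("http:\"?(.*?)(\"|>|\\s+)").matcher(tag.group());
      while (src.find()) {
        // Drop the trailing delimiter, as getImageSrc() does
        System.out.println(src.group().substring(0, src.group().length() - 1));
        // prints: http://img3.douban.com/pics/a.jpg
      }
    }
  }
}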

Then define the LinkFilter interface that ParserHttpUrl above relies on; it specifies the filtering contract:

public interface LinkFilter {
  public boolean accept(String url);
}

Step 4: implement the filtering rules in the crawler itself:

import java.util.Set;

public class Crawler {
  /**
   * The crawl loop.
   *
   * @param url the seed URL
   */
  public void crawling(String url) {
    // Define the link filter
    LinkFilter filter = new LinkFilter() {
      public boolean accept(String url) {
        // Adapt these rules to the site you want to crawl; a regex-based
        // implementation is recommended. This example crawls douban.com.
        if (url.indexOf("douban.com/group/topic") != -1
            || url.indexOf("douban.com/group/haixiuzu/discussion?start") != -1)
          return true;
        else
          return false;
      }
    };
    // Seed the URL queue
    LinkQueue.addUnvisitedUrl(url);
    // Loop while there are still links waiting to be crawled
    while (!LinkQueue.unVisitedUrlsEmpty()) {
      // Dequeue the URL at the head of the queue
      String visitUrl = LinkQueue.unVisitedUrlDeQueue();
      if (visitUrl == null)
        continue;
      DownloadPic.downloadPic(visitUrl);
      // Extract the URLs found on the downloaded page
      Set<String> links = ParserHttpUrl.extractLinks(visitUrl, filter);
      // Enqueue the new, unvisited URLs
      for (String link : links) {
        LinkQueue.addUnvisitedUrl(link);
      }
    }
  }

  // main entry point
  public static void main(String[] args) {
    Crawler crawler = new Crawler();
    crawler.crawling("http://www.douban.com/group/haixiuzu/discussion?start=0");
  }
}
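The comment inside accept() suggests expressing the rule as a regular expression. Here is a sketch of an equivalent regex-based LinkFilter (my own variant, matching the same two URL patterns as the indexOf() checks above):

import java.util.regex.Pattern;

public class RegexLinkFilter implements LinkFilter {
  // Mirrors the two indexOf() checks in crawling(); adjust for your target site
  private static final Pattern RULE =
      Pattern.compile("douban\\.com/group/(topic|haixiuzu/discussion\\?start)");

  public boolean accept(String url) {
    return url != null && RULE.matcher(url).find();
  }
}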

Summary

The code above is a complete example of a Java crawler that scrapes a website's images. I hope it is helpful; if you have any questions, leave me a comment and I will reply promptly. Many thanks as well to everyone for supporting 服务器之家!

Original article: https://blog.csdn.net/dingzfeng/article/details/80536987