详解JAVA抓取网页的图片,JAVA利用正则表达式抓取网站图片

时间:2022-09-19 07:37:02

利用Java抓取网页上的所有图片

用两个正则表达式

1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>

2、匹配img标签中的src中http路径的正则:http:\"?(.*?)(\"|>|\\s+)

实现:

package org.swinglife.main;
 
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Downloads the images referenced by {@code <img>} tags of a single web page.
 *
 * <p>The page is fetched as text, {@code <img>} tags are located with a regular
 * expression, the {@code http:} addresses inside those tags are extracted with a
 * second regular expression, and each address is saved to a file named after the
 * last path segment of its URL (relative to the current working directory).
 *
 * <p>Known limitations of the regular expressions (kept as published): only
 * addresses starting with {@code http:} are recognised, and the greedy
 * {@code .*} in the tag pattern can make one match span several tags that sit on
 * the same line.
 *
 * @author swinglife
 */
public class CatchImage {

  // Address of the page whose images are downloaded.
  private static final String PAGE_URL = "http://www.zzvips.com";
  // Charset used to decode the page bytes.
  private static final String ENCODING = "UTF-8";
  // Regex that locates <img ...> tags.
  private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
  // Regex that locates an http: address; group 2 is the terminator
  // (a quote, '>' or a run of whitespace) that follows the address.
  private static final String IMGSRC_REG = "http:\"?(.*?)(\"|>|\\s+)";

  // Patterns are compiled once instead of on every call / loop iteration.
  private static final Pattern IMG_TAG_PATTERN = Pattern.compile(IMGURL_REG);
  private static final Pattern IMG_SRC_PATTERN = Pattern.compile(IMGSRC_REG);

  // Size of the copy buffer used for all stream reads.
  private static final int BUFFER_SIZE = 1024;

  /**
   * Fetches the configured page and downloads every image address found in it.
   *
   * @param args unused
   * @throws Exception if the page itself cannot be fetched
   */
  public static void main(String[] args) throws Exception {
    CatchImage cm = new CatchImage();
    // Fetch the HTML text of the page.
    String html = cm.getHTML(PAGE_URL);
    // Collect the <img> tags.
    List<String> imgUrl = cm.getImageUrl(html);
    // Extract the image addresses from the tags.
    List<String> imgSrc = cm.getImageSrc(imgUrl);
    // Download the images.
    cm.Download(imgSrc);
  }

  /**
   * Reads the resource at {@code url} and returns its content decoded with
   * {@link #ENCODING}.
   *
   * @param url address to read
   * @return the complete content as text
   * @throws IOException if the URL is malformed or the content cannot be read
   */
  private String getHTML(String url) throws IOException {
    URL uri = new URL(url);
    URLConnection connection = uri.openConnection();
    InputStream in = connection.getInputStream();
    try {
      // Collect the raw bytes first and decode once at the end: decoding each
      // chunk separately would corrupt multi-byte characters that straddle a
      // chunk boundary, and only the bytes actually read may be used.
      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
      byte[] buf = new byte[BUFFER_SIZE];
      int length;
      while ((length = in.read(buf, 0, buf.length)) != -1) {
        bytes.write(buf, 0, length);
      }
      return bytes.toString(ENCODING);
    } finally {
      in.close();
    }
  }

  /**
   * Finds the {@code <img>} tags in the given HTML text.
   *
   * @param html HTML text to scan
   * @return every substring matched by {@link #IMGURL_REG}, in document order
   */
  private List<String> getImageUrl(String html) {
    Matcher matcher = IMG_TAG_PATTERN.matcher(html);
    List<String> listImgUrl = new ArrayList<String>();
    while (matcher.find()) {
      listImgUrl.add(matcher.group());
    }
    return listImgUrl;
  }

  /**
   * Extracts the {@code http:} addresses contained in the given tags.
   *
   * @param listImageUrl tag strings as returned by {@link #getImageUrl(String)}
   * @return every address found, without the terminator that followed it
   */
  private List<String> getImageSrc(List<String> listImageUrl) {
    List<String> listImgSrc = new ArrayList<String>();
    for (String image : listImageUrl) {
      Matcher matcher = IMG_SRC_PATTERN.matcher(image);
      while (matcher.find()) {
        // Cut at the start of the terminator group rather than dropping a
        // single trailing character: the whitespace terminator may be longer
        // than one character.
        listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
      }
    }
    return listImgSrc;
  }

  /**
   * Downloads each address into a file named after the last path segment of the
   * address. A failure is reported on standard output and does not stop the
   * remaining downloads.
   *
   * @param listImgSrc addresses to download
   */
  private void Download(List<String> listImgSrc) {
    for (String url : listImgSrc) {
      String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
      if (imageName.isEmpty()) {
        // An address ending in '/' yields no usable file name.
        System.out.println("下载失败:" + url);
        continue;
      }
      try {
        System.out.println("开始下载:" + url);
        saveImage(url, imageName);
        System.out.println(imageName + "下载完成");
      } catch (IOException e) {
        System.out.println("下载失败:" + url + " (" + e + ")");
      }
    }
  }

  /**
   * Copies the content at {@code url} into the file {@code imageName}, closing
   * both streams whether or not the copy succeeds.
   *
   * @param url address to read
   * @param imageName name of the file to write
   * @throws IOException if the URL is malformed or reading/writing fails
   */
  private void saveImage(String url, String imageName) throws IOException {
    InputStream in = new URL(url).openStream();
    try {
      FileOutputStream fo = new FileOutputStream(new File(imageName));
      try {
        byte[] buf = new byte[BUFFER_SIZE];
        int length;
        while ((length = in.read(buf, 0, buf.length)) != -1) {
          fo.write(buf, 0, length);
        }
      } finally {
        fo.close();
      }
    } finally {
      in.close();
    }
  }

}

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

原文链接:http://blog.csdn.net/swingpyzf/article/details/16338903