Maven POM
<dependencies>
    <!-- jsoup: HTML parser with jQuery-style selectors -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.16.1</version>
    </dependency>
    <!-- HTTP client utilities -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.16</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.14</version>
    </dependency>
    <!-- file I/O helpers -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.13.0</version>
    </dependency>
</dependencies>
====================================
Traverse the site and collect its page URLs
package com.xiaocao;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class UrlPool {

    public static void main(String[] args) {
        /* start from the site's home page */
        getUrl("https://www.nipic.com/");
    }

    private static void getUrl(String baseUrl) {
        Map<String, Boolean> oldMap = new LinkedHashMap<>();
        /* scheme + host, used to resolve relative links */
        String oldLinkHost = "";
        Pattern p = Pattern.compile("(https?://)?[^/\\s]+");
        Matcher m = p.matcher(baseUrl);
        if (m.find()) {
            oldLinkHost = m.group();
        }
        oldMap.put(baseUrl, false);
        oldMap = crawlLinks(oldLinkHost, oldMap);
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("Link: " + mapping.getKey());
        }
    }

    /* Visit every unvisited link in oldMap, collect new in-site links into newMap,
       then recurse until no new links are found. */
    private static Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {
        LinkedHashMap<String, Boolean> newMap = new LinkedHashMap<>();
        String oldLink = "";
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            if (!mapping.getValue()) {
                System.out.println("Crawling link: " + mapping.getKey());
                oldLink = mapping.getKey();
                try {
                    URL url = new URL(oldLink);
                    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
                    connection.setRequestMethod("GET");
                    if (connection.getResponseCode() == 200) {
                        BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
                        Pattern p = Pattern.compile("<a\\b[^>]+\\bhref=\"([^\"]*)\"[^>]*>([\\s\\S]*?)</a>");
                        Matcher matcher = null;
                        String line = "";
                        while ((line = reader.readLine()) != null) {
                            matcher = p.matcher(line);
                            /* a single line of HTML may contain several <a> tags */
                            while (matcher.find()) {
                                String newLink = matcher.group(1);
                                if (!newLink.startsWith("http")) {
                                    /* resolve relative paths against the host */
                                    if (newLink.startsWith("/")) {
                                        newLink = oldLinkHost + newLink;
                                    } else {
                                        newLink = oldLinkHost + "/" + newLink;
                                    }
                                }
                                /* drop a trailing slash so the same page is not stored twice */
                                if (newLink.endsWith("/")) {
                                    newLink = newLink.substring(0, newLink.length() - 1);
                                }
                                /* keep only links we have not seen yet and that stay on this site */
                                if (!oldMap.containsKey(newLink) && !newMap.containsKey(newLink) && newLink.startsWith(oldLinkHost)) {
                                    newMap.put(newLink, false);
                                }
                            }
                        }
                    }
                } catch (Exception e) {
                    /* ignore unreachable or malformed links and keep crawling */
                } finally {
                    /* mark the link as visited */
                    oldMap.replace(oldLink, true);
                }
            }
        }
        if (!newMap.isEmpty()) {
            oldMap.putAll(newMap);
            oldMap.putAll(crawlLinks(oldLinkHost, oldMap));
        }
        return oldMap;
    }
}
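
Since jsoup is already declared in the pom, the line-by-line regex above could also be replaced with a DOM query, which is less fragile when attributes appear in a different order or a tag spans several lines. A minimal sketch; JsoupLinkExtractor and extractLinks are illustrative names, not part of the original class:

package com.xiaocao;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.LinkedHashSet;
import java.util.Set;

public class JsoupLinkExtractor {

    /* Sketch: extract in-site links from one page with jsoup instead of a regex. */
    static Set<String> extractLinks(String pageUrl, String host) throws Exception {
        Set<String> links = new LinkedHashSet<>();
        Document doc = Jsoup.connect(pageUrl)
                .userAgent("Mozilla/5.0")   // any reasonable browser string
                .timeout(10_000)
                .get();
        for (Element a : doc.select("a[href]")) {
            String link = a.absUrl("href"); // jsoup resolves relative paths for us
            if (link.startsWith(host)) {    // stay on the same site
                links.add(link);
            }
        }
        return links;
    }
}

Because absUrl("href") resolves relative paths against the page URL, the manual host concatenation from crawlLinks is not needed in this variant.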
==============
Download the site's content (image download)
package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;

public class ImageCraw {

    /* detail-page URL to download from */
    private static String url = "https://xxx";

    public static void main(String[] args) {
        // apacheHttpClient();
        try {
            /* parse the page and locate the main image element */
            Document document = Jsoup.connect(url).get();
            Elements select = document.select(".newdetail-skin #J_worksImg");
            try {
                /* the src attribute is protocol-relative, so prepend "https:" */
                Connection.Response src = Jsoup.connect("https:" + select.attr("src"))
                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0")
                        .ignoreContentType(true)
                        .execute();
                String name = select.attr("alt");
                System.out.println(name);
                /* write the image bytes to disk, using the alt text as the file name */
                ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(src.bodyAsBytes());
                FileUtils.copyInputStreamToFile(byteArrayInputStream, new File("F:\\filetest\\" + name + ".jpg"));
            } catch (Exception e) {
                e.printStackTrace();
            }
            /* if the page contains several images, loop over `select` and repeat the download for each element */
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private static void apacheHttpClient() {
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        /* pretend to be a regular browser */
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");
        try {
            CloseableHttpResponse execute = client.execute(httpGet);
            HttpEntity entity = execute.getEntity();
            String s = EntityUtils.toString(entity);
            System.out.println(s);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
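
For reference, the apacheHttpClient() method above only prints the response body as text; with the same dependencies it can also write an image straight to disk. A minimal sketch, assuming imageUrl points directly at an image file (the URL and target path are placeholders):

package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.File;
import java.io.IOException;

public class HttpClientImageDownload {

    public static void main(String[] args) {
        String imageUrl = "https://xxx";                     // placeholder: direct image URL
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet get = new HttpGet(imageUrl);
            get.setHeader("User-Agent", "Mozilla/5.0");      // pretend to be a browser
            try (CloseableHttpResponse response = client.execute(get)) {
                HttpEntity entity = response.getEntity();
                byte[] bytes = EntityUtils.toByteArray(entity);   // raw image bytes
                FileUtils.writeByteArrayToFile(new File("F:\\filetest\\demo.jpg"), bytes);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}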