Crawler Basics

Time: 2024-10-28 16:05:51

Maven POM dependencies

<dependencies>

    <!-- jsoup: HTML parser with jQuery-like selectors -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.16.1</version>
    </dependency>

    <!-- HTTP client tools -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.16</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.14</version>
    </dependency>

    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.13.0</version>
    </dependency>
</dependencies>

====================================

Traverse a site's pages and collect its URLs

package com.xiaocao;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class UrlPool {

    public static void main(String[] args) {
        /* home page to start crawling from */
        getUrl("https://www.nipic.com/");
    }

    private static void getUrl(String baseUrl) {
        Map<String, Boolean> oldMap = new LinkedHashMap<>();
        /* extract the scheme + host, used later to resolve relative links */
        String oldLinkHost = "";
        Pattern p = Pattern.compile("(https?://)?[^/\\s]*");
        Matcher m = p.matcher(baseUrl);
        if (m.find()) {
            oldLinkHost = m.group();
        }
        oldMap.put(baseUrl, false);
        oldMap = crawlLinks(oldLinkHost, oldMap);
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {
            System.out.println("连接:" + mapping.getKey());
        }
    }

    /* Visit every unvisited link in oldMap, collect newly discovered same-host links into newMap, then recurse until no new links turn up. */
    private static Map<String, Boolean> crawlLinks(String oldLinkHost, Map<String, Boolean> oldMap) {

        LinkedHashMap<String, Boolean> newMap = new LinkedHashMap<>();
        String oldLink = "";
        for (Map.Entry<String, Boolean> mapping : oldMap.entrySet()) {

            if (!mapping.getValue()) {
                System.out.println("Crawling link: " + mapping.getKey());
                oldLink = mapping.getKey();
                try {
                    URL url = new URL(oldLink);

                    HttpURLConnection connection = (HttpURLConnection) url.openConnection();

                    connection.setRequestMethod("GET");
                    if (connection.getResponseCode() == 200) {
                        BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()));
//                        Pattern p = Pattern.compile("<a.*?href=[\"']?(https?://)?/?[^\"']?.*?>(.+)</a>");
                        Pattern p = Pattern.compile("<a\\b[^>]+\\bhref=\"([^\"]*)\"[^>]*>([\\s\\S]*?)</a>");
                        Matcher matcher = null;
                        String line = "";
                        while ((line = reader.readLine()) != null) {
                            matcher = p.matcher(line);
                            while (matcher.find()) { // a single line of HTML may contain several <a> tags
                                String newLink = matcher.group(1);
                                if (!newLink.startsWith("http")) {
                                    /*相对路径*/
                                    if (newLink.startsWith("/")) {
                                        newLink = oldLinkHost + newLink;
                                    } else {
                                        newLink = oldLinkHost + "/" + newLink;
                                    }
                                }
                                if (newLink.endsWith("/")) {
                                    newLink = newLink.substring(0, newLink.length() - 1);
                                }
                                if (!oldMap.containsKey(newLink) && !newMap.containsKey(newLink) && newLink.startsWith(oldLinkHost)) {
                                    newMap.put(newLink, false);
                                }

                            }
                        }
                    }
                } catch (Exception e) {
                    /* skip links that cannot be fetched or parsed */
                } finally {
                    /* mark this link as processed, whether or not the fetch succeeded */
                    oldMap.replace(oldLink, true);
                }
            }
        }

        if (!newMap.isEmpty()) {
            oldMap.putAll(newMap);
            oldMap.putAll(crawlLinks(oldLinkHost, oldMap));
        }
        return oldMap;
    }
}
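
Since jsoup is already on the classpath (see the POM above), link extraction can also be done with a CSS selector instead of running regular expressions over raw HTML, which tends to be fragile. A minimal sketch using the same start page; the class name JsoupLinkDemo is made up, and "abs:href" asks jsoup to resolve relative paths against the page URL:

package com.xiaocao;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupLinkDemo {

    public static void main(String[] args) throws Exception {
        /* fetch the page and parse it into a DOM */
        Document doc = Jsoup.connect("https://www.nipic.com/")
                .userAgent("Mozilla/5.0")
                .get();
        /* "a[href]" matches every anchor that carries an href attribute */
        for (Element a : doc.select("a[href]")) {
            System.out.println(a.attr("abs:href"));
        }
    }
}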


====================================

Download site content (images)

package com.xiaocao;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;

public class ImageCraw {

    /* target detail-page URL (placeholder) */
    private static String url = "https://xxx";

    public static void main(String[] args) {
//        apacheHttpClient();
        try {
            Document document = Jsoup.connect(url).get();

            /* select the main work image on the detail page */
            Elements select = document.select(".newdetail-skin #J_worksImg");

            try {
                Connection.Response src = Jsoup.connect("https:"+select.attr("src"))
                        .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0")
                        .ignoreContentType(true)
                        .execute();

                String name = select.attr("alt");
                System.out.println(name);
                ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(src.bodyAsBytes());
                FileUtils.copyInputStreamToFile(byteArrayInputStream,new File("F:\\filetest\\"+name+".jpg"));
            }catch (Exception e){
                e.printStackTrace();
            }


//            for (int i = 0; i < select.size(); i++) {
//                Elements img = select.get(i).select(".newdetail-skin #J_worksImg");
//
//                try {
//                    Connection.Response src = Jsoup.connect("https:"+img.attr("src"))
//                            .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0")
//                            .ignoreContentType(true)
//                            .execute();
//
//                    String name = img.attr("alt");
//                    System.out.println(name);
//                    ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(src.bodyAsBytes());
//                    FileUtils.copyInputStreamToFile(byteArrayInputStream,new File("F:\\filetest\\"+name+".jpg"));
//                }catch (Exception e){
//                    e.printStackTrace();
//                }
//
//            }
        } catch (IOException e) {
            e.printStackTrace();
        }


    }

    private static void apacheHttpClient() {


        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(url);
        /* spoof a browser User-Agent */
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");
        
        try {
            CloseableHttpResponse execute = client.execute(httpGet);
            HttpEntity entity = execute.getEntity();
            String s = EntityUtils.toString(entity);
            System.out.println(s);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
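
Note that apacheHttpClient() above never closes the client or the response, so pooled connections can leak. With httpclient 4.5.x both CloseableHttpClient and CloseableHttpResponse are AutoCloseable, so a try-with-resources variant is safer. A minimal sketch of the same request (the method name apacheHttpClientClosed is made up):

    private static void apacheHttpClientClosed() {
        HttpGet httpGet = new HttpGet(url);
        /* spoof a browser User-Agent, same as above */
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0");
        /* try-with-resources closes the client and the response even when an exception is thrown */
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(httpGet)) {
            String body = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(body);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }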