Java多线程爬虫和存储

import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.dom4j.Element;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Worker thread that downloads a single book detail page, extracts the
 * author, publisher, ISBN and cover-image URL with the supplied patterns,
 * and appends a {@code <book>} element describing the book to a shared
 * XML root element.
 */
public class GetBookInfoThread extends Thread{
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30";

    private final CloseableHttpClient httpClient;
    private final String webAddress;
    private final Element rootElement;
    private final Pattern bookAuthorRegex;
    private final Pattern bookPublishRegex;
    private final Pattern bookIsbnRegex;
    private final Pattern bookImgRegex;
    private final String bookName;

    /**
     * @param httpClient       client used to fetch the page
     * @param webAddress       address of the book detail page to fetch
     * @param bookName         title of the book, stored as-is in the output
     * @param rootElement      root node of the XML document; a new {@code book}
     *                         child is added to it for this book
     * @param bookAuthorRegex  pattern whose group 1 captures the author
     * @param bookPublishRegex pattern whose group 1 captures the publisher
     * @param bookIsbnRegex    pattern whose group 1 captures the ISBN
     * @param bookImgRegex     pattern whose group 1 captures the cover image URL
     */
    public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) {
        this.httpClient = httpClient;
        this.webAddress = webAddress;
        this.rootElement = rootElement;
        this.bookAuthorRegex = bookAuthorRegex;
        this.bookPublishRegex = bookPublishRegex;
        this.bookIsbnRegex = bookIsbnRegex;
        this.bookName = bookName;
        this.bookImgRegex = bookImgRegex;
    }

    /**
     * Fetches the page, extracts the book fields and records them under the
     * root element. Nothing is recorded when the page cannot be fetched.
     */
    @Override
    public void run() {
        String bookInfoCode = fetchPageSource();
        if (bookInfoCode == null) {
            // Fetch failed and was already reported; matching against null would throw.
            return;
        }

        String bookAuthor = firstGroupOrEmpty(bookAuthorRegex, bookInfoCode);
        String bookPublish = firstGroupOrEmpty(bookPublishRegex, bookInfoCode);
        String bookIsbn = firstGroupOrEmpty(bookIsbnRegex, bookInfoCode);
        String bookImg = firstGroupOrEmpty(bookImgRegex, bookInfoCode);

        // Sibling threads append to the same element tree and bump the same
        // static counter; "bookId++" is a non-atomic read-modify-write, so the
        // whole update is serialized on one class-wide lock.
        synchronized (GetBookInfoThread.class) {
            Element bookElement = rootElement.addElement("book");
            bookElement.addAttribute("id",String.valueOf(Main.bookId++));
            bookElement.addElement("name").setText(bookName);
            bookElement.addElement("author").setText(bookAuthor);
            bookElement.addElement("publish").setText(bookPublish);
            bookElement.addElement("isbn").setText(bookIsbn);
            // The count is not scraped: it is a random value in the range 3..12.
            bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3));
            bookElement.addElement("link").setText(webAddress);
            bookElement.addElement("img").setText(bookImg);
        }

        System.out.println("抓取了:" + webAddress + " " + bookName);
    }

    /**
     * Downloads the page at {@code webAddress}.
     *
     * @return the page source, or {@code null} when the request failed or the
     *         server answered with a status other than 200
     */
    private String fetchPageSource() {
        HttpGet getBookInfo = new HttpGet(webAddress);
        getBookInfo.addHeader(HttpHeaders.USER_AGENT, USER_AGENT);
        CloseableHttpResponse bookInfoResponse = null;
        try {
            bookInfoResponse = httpClient.execute(getBookInfo);
            if (bookInfoResponse.getStatusLine().getStatusCode() != 200) {
                System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine());
                return null;
            }
            return EntityUtils.toString(bookInfoResponse.getEntity());
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always release the connection, including on the non-200 and error paths.
            if (bookInfoResponse != null) {
                try {
                    bookInfoResponse.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Returns group 1 of the first match of {@code regex} in {@code text}, or "" when there is no match. */
    private static String firstGroupOrEmpty(Pattern regex, String text) {
        Matcher matcher = regex.matcher(text);
        return matcher.find() ? matcher.group(1) : "";
    }
}

import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Entry point of the crawler: for each tag it walks the tag's book listing
 * pages, starts one {@link GetBookInfoThread} per book found on a page, and
 * hands the collected XML tree to a {@link WriteBookInfoToFile} thread once
 * the tag is finished.
 */
public class Main {

    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30";

    CloseableHttpClient httpClient;
    /** Next id to assign to a book element; incremented by the worker threads. */
    static int bookId = 496;
    /** Proxy ip -> port. */
    Map<String,Integer> proxyMap;
    /** Proxy ips in the order they are tried; the port is looked up in {@link #proxyMap}. */
    List<String> ipList;
    /** Index into {@link #ipList} of the next proxy to switch to. */
    int i = 0;

    public static void main(String[] args) {
        Main m = new Main();

        // Categories to crawl in this run; getTagList() can be used instead
        // to fetch every tag from the site.
        List<String> tagList = new LinkedList<String>();
        tagList.add("摄影");
        tagList.add("理财");
        tagList.add("经济学");
        m.pullAndWrite(tagList,10);
    }

    /** Creates a crawler that starts without a proxy and registers the known proxies. */
    public Main() {
        httpClient = HttpClients.createDefault();
        setProxyMap();
    }

    /** (Re)initializes the list of proxies that {@code changeProxy} cycles through. */
    public void setProxyMap() {
        proxyMap = new HashMap<String, Integer>();
        ipList = new LinkedList<String>();
        proxyMap.put("211.68.122.171",80);ipList.add("211.68.122.171");
    }

    /**
     * Fetches the tag overview page and extracts every tag name from it.
     *
     * @return the tag names in page order; empty when the page could not be fetched
     */
    public List<String> getTagList() {
        List<String> resultTagList = new LinkedList<String>();

        HttpGet getTag = new HttpGet("http://book.douban.com/tag/");
        getTag.addHeader(HttpHeaders.USER_AGENT, USER_AGENT);
        CloseableHttpResponse tagPageResponse = null;
        String tagPageCode;
        try {
            tagPageResponse = httpClient.execute(getTag);
            tagPageCode = EntityUtils.toString(tagPageResponse.getEntity());
        } catch (IOException e) {
            e.printStackTrace();
            return resultTagList;
        } finally {
            // The response is null when execute() itself failed.
            closeResponse(tagPageResponse);
        }

        Pattern p = Pattern.compile("class=\"tag\">(.*?)</a>");
        Matcher m = p.matcher(tagPageCode);
        while (m.find()) {
            resultTagList.add(m.group(1));
        }

        return resultTagList;
    }

    /**
     * Crawls every tag and writes one XML file per tag.
     *
     * @param tagList    book categories to crawl
     * @param maxPageNum maximum number of listing pages to crawl per category
     */
    public void pullAndWrite(List<String> tagList,int maxPageNum) {
        // Group 1 is the book page address, group 2 the book title.
        Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)</a>");
        Pattern bookAuthorRegex = Pattern.compile("(?s)<span class=\"pl\"> 作者</span>:.*?>(.*?)</a>");
        Pattern bookPublishRegex = Pattern.compile("<span class=\"pl\">出版社:</span> (.*?)<br/>");
        Pattern bookIsbnRegex = Pattern.compile("<span class=\"pl\">ISBN:</span> (.*?)<br/>");
        Pattern bookImgRegex = Pattern.compile("<img src=\"(.*?)\" title=\"点击看大图\"");

        for (String tag:tagList) {
            int nowPageNum = 0; // index of the listing page currently being crawled
            Document newDocument = DocumentHelper.createDocument();
            Element rootElement = newDocument.addElement("root");

            while (nowPageNum < maxPageNum) {
                // Each listing page advances the start offset by 15.
                String nowPageAddress = "http://www.douban.com/tag/" + tag + "/book?start=" + nowPageNum * 15;
                String pageSource = fetchListingPage(nowPageAddress);
                if (pageSource == null) {
                    // Bad status or I/O failure: switch proxy and retry the same page.
                    // changeProxy() ends the process once the proxies run out, so
                    // this retry cannot loop forever.
                    changeProxy();
                    continue;
                }

                Matcher m = bookAddressRegex.matcher(pageSource);
                List<Thread> threadList = new LinkedList<Thread>();
                while (m.find()) {
                    threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex));
                }
                // No book on the page means the category is exhausted.
                if (threadList.isEmpty()) {
                    break;
                }

                for (Thread thread:threadList) {
                    thread.start();
                }
                // Wait for the whole page before moving on, so no worker is
                // still using the client when the next page is requested.
                for (Thread thread:threadList) {
                    try {
                        thread.join();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
                nowPageNum++;
            }
            // Write the category on a separate thread once it is fully crawled.
            new WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start();

        }
    }

    /**
     * Downloads one listing page.
     *
     * @param pageAddress address of the listing page
     * @return the page source, or {@code null} when the request failed or the
     *         server answered with a status other than 200
     */
    private String fetchListingPage(String pageAddress) {
        HttpGet getBooksPage = new HttpGet(pageAddress);
        getBooksPage.addHeader(HttpHeaders.USER_AGENT, USER_AGENT);
        CloseableHttpResponse booksPageResponse = null;
        try {
            booksPageResponse = httpClient.execute(getBooksPage);
            if (booksPageResponse.getStatusLine().getStatusCode() != 200) {
                System.out.println("抓 " + pageAddress + " 时出错:");
                System.out.println("错误信息:" + booksPageResponse.getStatusLine());
                return null;
            }
            return EntityUtils.toString(booksPageResponse.getEntity());
        } catch (IOException e) {
            System.out.println("抓 " + pageAddress + " 时出错:");
            e.printStackTrace();
            return null;
        } finally {
            closeResponse(booksPageResponse);
        }
    }

    /** Closes {@code response} if it is non-null, reporting but not propagating a close failure. */
    private static void closeResponse(CloseableHttpResponse response) {
        if (response == null) {
            return;
        }
        try {
            response.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Replaces the current client with one that goes through the next proxy in
     * {@link #ipList}. Terminates the process when every proxy has been used.
     */
    private void changeProxy() {
        if (i >= ipList.size()) {
            System.out.println("代理用完了,退出");
            System.exit(0);
        }
        String ip = ipList.get(i++);
        CloseableHttpClient previousClient = httpClient;
        httpClient = HttpClients.custom().setProxy(new HttpHost(ip,proxyMap.get(ip))).build();
        // Release the old client's connections; the workers that used it have
        // already been joined by the time a page fetch fails.
        if (previousClient != null) {
            try {
                previousClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        System.out.println("换代理啦,使用代理:" + ip + "，端口:" + proxyMap.get(ip));
    }

}

import org.dom4j.Element;
import org.dom4j.io.XMLWriter;

import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;

/**
 * Thread that serializes an XML element tree to a file.
 */
public class WriteBookInfoToFile extends Thread {
    private final Element root;
    private final String fileAddress;

    /**
     * @param root        element to serialize, together with its descendants
     * @param fileAddress path of the file to create or overwrite
     */
    public WriteBookInfoToFile(Element root,String fileAddress) {
        this.root = root;
        this.fileAddress = fileAddress;
    }

    /** Writes the element to the file; an I/O failure is reported on stderr. */
    @Override
    public void run() {
        Writer fileWriter = null;
        try {
            // Only the element is written, so the file has no XML declaration
            // and readers will treat it as UTF-8. Encode explicitly instead of
            // relying on the platform default charset. Fully-qualified names
            // are used so this class needs no additional imports.
            fileWriter = new java.io.OutputStreamWriter(new java.io.FileOutputStream(fileAddress), "UTF-8");
            XMLWriter xmlWriter = new XMLWriter(fileWriter);
            xmlWriter.write(root);
            xmlWriter.flush();
            System.out.println("[" + fileAddress + "]写入成功");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failed write does not leak the file handle.
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;

/**
 * Imports every book XML file found in the crawler's output directory into
 * the {@code Book} table. The file name up to the first dot is stored as the
 * book's tag.
 */
public class WriteInfoToDB {
    /**
     * Parameterized insert: scraped text may contain quotes or other SQL
     * metacharacters, so values are bound rather than concatenated.
     */
    private static final String INSERT_SQL = "INSERT INTO Book(bookPublish,bookName,bookAuthor,bookTag,bookIsbn,bookCount,bookRestCount,bookLink,bookImg) VALUES (?,?,?,?,?,?,?,?,?)";

    public static void main(String[] args) {
        File folder = new File("/home/geekgao/book");
        File[] xmlFiles = folder.listFiles();
        if (xmlFiles == null) {
            // listFiles() returns null when the path is not a readable directory.
            System.err.println("目录不存在或无法读取:" + folder);
            return;
        }

        Connection connection = null;
        PreparedStatement insert = null;
        try {
            Class.forName("com.mysql.jdbc.Driver"); // load the MySQL driver
            connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/BookManage?user=root&password=root");
            insert = connection.prepareStatement(INSERT_SQL);

            SAXReader reader = new SAXReader();
            for (File f:xmlFiles) {
                if (f.isDirectory()) {
                    continue;
                }
                importFile(reader, f, insert);
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } finally {
            if (insert != null) {
                try {
                    insert.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            if (connection != null) {
                try {
                    connection.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Inserts one row per child element of the file's root element. A file
     * that cannot be parsed is reported and skipped; a row that cannot be
     * inserted is reported and the remaining rows are still processed.
     * A field whose child element is missing is bound as SQL NULL.
     */
    private static void importFile(SAXReader reader, File f, PreparedStatement insert) {
        Document document;
        try {
            document = reader.read(f);
        } catch (DocumentException e) {
            e.printStackTrace();
            return;
        }

        String tag = f.getName().split("\\.")[0];
        Element root = document.getRootElement();
        List<Element> books = root.elements();
        for (Element book:books) {
            String name = null;
            String author = null;
            String publish = null;
            String isbn = null;
            String count = null;
            String link = null;
            String img = null;
            List<Element> fields = book.elements();
            for (Element info:fields) {
                String fieldName = info.getName();
                if (fieldName.equals("name")) {
                    name = info.getText();
                } else if (fieldName.equals("author")) {
                    author = info.getText();
                } else if (fieldName.equals("publish")) {
                    publish = info.getText();
                } else if (fieldName.equals("isbn")) {
                    isbn = info.getText();
                } else if (fieldName.equals("count")) {
                    count = info.getText();
                } else if (fieldName.equals("link")) {
                    link = info.getText();
                } else if (fieldName.equals("img")) {
                    img = info.getText();
                }
            }

            try {
                insert.setString(1, publish);
                insert.setString(2, name);
                insert.setString(3, author);
                insert.setString(4, tag);
                insert.setString(5, isbn);
                // The remaining count starts out equal to the total count.
                insert.setString(6, count);
                insert.setString(7, count);
                insert.setString(8, link);
                insert.setString(9, img);
                insert.execute();
            } catch (SQLException e) {
                System.err.println("sql语句处错误:" + e.getMessage());
                System.err.println("sql语句:" + INSERT_SQL + " [name=" + name + ", link=" + link + "]");
            }
        }
    }
}

秒客网

Java多线程爬虫和存储

相关文章