import org.apache.http.HttpHeaders
import org.apache.http.client.methods.CloseableHttpResponse
import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.util.EntityUtils
import org.dom4j.Element
import java.io.IOException
import java.util.regex.Matcher
import java.util.regex.Pattern
public class GetBookInfoThread extends Thread{
private CloseableHttpClient httpClient
private String webAddress
private Element rootElement
private Pattern bookAuthorRegex
private Pattern bookPublishRegex
private Pattern bookIsbnRegex
private Pattern bookImgRegex
private String bookName
public GetBookInfoThread(CloseableHttpClient httpClient,String webAddress,String bookName,Element rootElement,Pattern bookAuthorRegex,Pattern bookPublishRegex,Pattern bookIsbnRegex,Pattern bookImgRegex) {
this.httpClient = httpClient
this.webAddress = webAddress
this.rootElement = rootElement
this.bookAuthorRegex = bookAuthorRegex
this.bookPublishRegex = bookPublishRegex
this.bookIsbnRegex = bookIsbnRegex
this.bookName = bookName
this.bookImgRegex = bookImgRegex
}
@Override
public void run() {
HttpGet getBookInfo = new HttpGet(webAddress)
getBookInfo.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30")
CloseableHttpResponse bookInfoResponse
String bookInfoCode = null
try {
bookInfoResponse = httpClient.execute(getBookInfo)
if (bookInfoResponse.getStatusLine().getStatusCode() != 200) {
System.out.println("获取书本具体信息时出错,页面地址:" + webAddress + "错误信息" + bookInfoResponse.getStatusLine())
return
}
bookInfoCode = EntityUtils.toString(bookInfoResponse.getEntity())
} catch (IOException e) {
e.printStackTrace()
}
Matcher bookAuthorMatcher = bookAuthorRegex.matcher(bookInfoCode)
Matcher bookPublishMatcher = bookPublishRegex.matcher(bookInfoCode)
Matcher bookIsbnMatcher = bookIsbnRegex.matcher(bookInfoCode)
Matcher bookImgMatcher = bookImgRegex.matcher(bookInfoCode)
String bookName = this.bookName
String bookAuthor = ""
String bookPublish = ""
String bookIsbn = ""
String bookLink = webAddress
String bookImg = ""
if (bookAuthorMatcher.find()) {
bookAuthor = bookAuthorMatcher.group(1)
}
if (bookPublishMatcher.find()) {
bookPublish = bookPublishMatcher.group(1)
}
if (bookIsbnMatcher.find()) {
bookIsbn = bookIsbnMatcher.group(1)
}
if (bookImgMatcher.find()) {
bookImg = bookImgMatcher.group(1)
}
// System.out.println(bookName + "-" + bookAuthor + "-" + bookPublish + "-" + bookIsbn)
Element bookElement = rootElement.addElement("book")
bookElement.addAttribute("id",String.valueOf(Main.bookId++))
bookElement.addElement("name").setText(bookName)
bookElement.addElement("author").setText(bookAuthor)
bookElement.addElement("publish").setText(bookPublish)
bookElement.addElement("isbn").setText(bookIsbn)
bookElement.addElement("count").setText(String.valueOf((int)(Math.random() * 10) + 3))
bookElement.addElement("link").setText(bookLink)
bookElement.addElement("img").setText(bookImg)
System.out.println("抓取了:" + webAddress + " " + bookName)
}
}
import org.apache.http.HttpHeaders
import org.apache.http.HttpHost
import org.apache.http.client.methods.CloseableHttpResponse
import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.dom4j.Document
import org.dom4j.DocumentHelper
import org.dom4j.Element
import java.io.IOException
import java.util.*
import java.util.regex.Matcher
import java.util.regex.Pattern
public class Main {
CloseableHttpClient httpClient
static int bookId = 496
Map<String,Integer> proxyMap
List<String> ipList
int i = 0
public static void main(String[] args) {
Main m = new Main()
// List<String> tagList = m.getTagList()
List<String> tagList = new LinkedList<String>()
// tagList.add("经典")
// tagList.add("日本文学")
// tagList.add("散文")
// tagList.add("中国文学")
// tagList.add("算法")
// tagList.add("童话")
// tagList.add("外国文学")
// tagList.add("文学")
// tagList.add("小说")
// tagList.add("漫画")
// tagList.add("诗词")
// tagList.add("心理学")
tagList.add("摄影")
tagList.add("理财")
tagList.add("经济学")
m.pullAndWrite(tagList,10)
}
public Main() {
// HttpHost proxy = new HttpHost("122.225.106.35",80)
// httpClient = HttpClients.custom().setProxy(proxy).build()
httpClient = HttpClients.createDefault()
setProxyMap()
}
public void setProxyMap() {
proxyMap = new HashMap<String, Integer>()
ipList = new LinkedList<String>()
proxyMap.put("211.68.122.171",80)
}
public List<String> getTagList() {
HttpGet getTag = new HttpGet("http://book.douban.com/tag/")
getTag.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30")
CloseableHttpResponse tagPageResponse = null
String tagPageCode = null
try {
tagPageResponse = httpClient.execute(getTag)
tagPageCode = EntityUtils.toString(tagPageResponse.getEntity())
tagPageResponse.close()
} catch (IOException e) {
e.printStackTrace()
} finally {
try {
tagPageResponse.close()
} catch (IOException e) {
e.printStackTrace()
}
}
Pattern p = Pattern.compile("class=\"tag\">(.*?)</a>")
Matcher m = p.matcher(tagPageCode)
List<String> resultTagList = new LinkedList<String>()
while (m.find()) {
resultTagList.add(m.group(1))
}
return resultTagList
}
public void pullAndWrite(List<String> tagList,int maxPageNum) {
Pattern bookAddressRegex = Pattern.compile("href=\"(.*?)\" class=\"title\" target=\"_blank\">(.*?)</a>")
Pattern bookAuthorRegex = Pattern.compile("(?s)<span class=\"pl\"> 作者</span>:.*?>(.*?)</a>")
Pattern bookPublishRegex = Pattern.compile("<span class=\"pl\">出版社:</span> (.*?)<br/>")
Pattern bookIsbnRegex = Pattern.compile("<span class=\"pl\">ISBN:</span> (.*?)<br/>")
Pattern bookImgRegex = Pattern.compile("<img src=\"(.*?)\" title=\"点击看大图\"")
//分别抓取每一种类别的书籍
for (String tag:tagList) {
int nowPageNum = 0
Document newDocument = DocumentHelper.createDocument()
Element rootElement = newDocument.addElement("root")
while (nowPageNum < maxPageNum) {
System.out.println(1)
String nowPageAddress = "http://www.douban.com/tag/" + tag + "/book?start=" + nowPageNum * 15
HttpGet getBooksPage = new HttpGet(nowPageAddress)
getBooksPage.addHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30")
CloseableHttpResponse booksPageResponse
Matcher m = null
try {
System.out.println(2)
booksPageResponse = httpClient.execute(getBooksPage)
System.out.println(3)
m = bookAddressRegex.matcher(EntityUtils.toString(booksPageResponse.getEntity()))
booksPageResponse.close()
if (booksPageResponse.getStatusLine().getStatusCode() != 200) {
System.out.println("抓 " + nowPageAddress + " 时出错:")
System.out.println("错误信息:" + booksPageResponse.getStatusLine())
changeProxy()
continue
}
} catch (IOException e) {
e.printStackTrace()
}
//具体每一本书,具体抓取
int findCount = 0
List<Thread> threadList = new LinkedList<Thread>()
while (m.find()) {
threadList.add(new GetBookInfoThread(httpClient, m.group(1), m.group(2), rootElement, bookAuthorRegex, bookPublishRegex, bookIsbnRegex,bookImgRegex))
findCount++
}
//没有知道到代表这种类别的书都找完了,那么直接退出此类书籍的查找
if (findCount == 0) {
break
}
for (Thread thread:threadList) {
thread.start()
}
for (Thread thread:threadList) {
try {
thread.join()
} catch (InterruptedException e) {
e.printStackTrace()
}
}
nowPageNum++
}
//一个类别爬完了再写入
new WriteBookInfoToFile(rootElement,"/home/geekgao/book/" + tag + ".xml").start()
}
}
private void changeProxy() {
if (i >= ipList.size()) {
System.out.println("代理用完了,退出")
System.exit(0)
}
String ip = ipList.get(i++)
httpClient = HttpClients.custom().setProxy(new HttpHost(ip,proxyMap.get(ip))).build()
System.out.println("换代理啦,使用代理:" + ip + ",端口:" + proxyMap.get(ip))
}
}
import org.dom4j.Element;
import org.dom4j.io.XMLWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;
/**
 * Thread that serializes a dom4j element tree to a file.
 */
public class WriteBookInfoToFile extends Thread {
    private final Element root;
    private final String fileAddress;

    /**
     * @param root        element (with its subtree) to write
     * @param fileAddress path of the file to create or overwrite
     */
    public WriteBookInfoToFile(Element root, String fileAddress) {
        this.root = root;
        this.fileAddress = fileAddress;
    }

    /**
     * Writes the element tree to {@code fileAddress} as UTF-8 and reports success on
     * standard output; I/O failures are printed and otherwise ignored.
     */
    @Override
    public void run() {
        Writer fileWriter = null;
        try {
            // Writing a bare element emits no XML declaration, so parsers will assume
            // UTF-8; encode explicitly instead of using the platform default charset.
            // Fully qualified names are used because this file does not import them.
            fileWriter = new java.io.OutputStreamWriter(new java.io.FileOutputStream(fileAddress), "UTF-8");
            XMLWriter xmlWriter = new XMLWriter(fileWriter);
            xmlWriter.write(root);
            xmlWriter.close();
            // Closed successfully; nothing left for the finally block to release.
            fileWriter = null;
            System.out.println("[" + fileAddress + "]写入成功");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the file handle when writing or closing failed part-way.
            if (fileWriter != null) {
                try {
                    fileWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
import java.io.File;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
public class WriteInfoToDB {
public static void main(String[] args) {
File folder = new File("/home/geekgao/book")
File[] XMLS = folder.listFiles()
SAXReader reader = new SAXReader()
Statement statement = null
try {
Class.forName("com.mysql.jdbc.Driver")
statement = DriverManager.getConnection("jdbc:mysql://localhost:3306/BookManage?user=root&password=root").createStatement()
} catch (SQLException e) {
e.printStackTrace()
} catch (ClassNotFoundException e) {
e.printStackTrace()
}
for (File f:XMLS) {
if (f.isDirectory()) {
continue
}
Document document = null
try {
document = reader.read(f)
} catch (DocumentException e) {
e.printStackTrace()
}
Element root = document.getRootElement()
List<Element> books = root.elements()
for (Element book:books) {
String name = null
String author = null
String publish = null
String isbn = null
String count = null
String link = null
String img = null
List<Element> b = book.elements()
for (Element info:b) {
if (info.getName().equals("name")) {
name = info.getText()
} else if (info.getName().equals("author")) {
author = info.getText()
} else if (info.getName().equals("publish")) {
publish = info.getText()
} else if (info.getName().equals("isbn")) {
isbn = info.getText()
} else if (info.getName().equals("count")) {
count = info.getText()
} else if (info.getName().equals("link")) {
link = info.getText()
} else if (info.getName().equals("img")) {
img = info.getText()
}
// System.out.println(info.getName() + ": " + info.getText())
}
String sql = "INSERT INTO Book(bookPublish,bookName,bookAuthor,bookTag,bookIsbn,bookCount,bookRestCount,bookLink,bookImg) VALUES ('" + publish + "','" + name + "','" + author + "','" + f.getName().split("\\.")[0] + "','" + isbn + "','" + count + "','" + count + "','" + link + "','" + img + "');"
try {
statement.execute(sql)
} catch (SQLException e) {
System.err.println("sql语句处错误:" + e.getMessage())
System.err.println("sql语句:" + sql)
}
}
}
}
}