public class Demo4 {
    //private static final String url = "http://tieba.baidu.com/tb/picture/index.html";
    private static final String url = "http://bizhi.sogou.com/cate/index/4?f=nav";
    private static final String picPath = "d:/picTest";

    /**
     * Fetches the page at {@code url}, extracts every {@code <img>} element
     * and downloads each image into {@code picPath}.
     */
    public static void main(String[] args) {
        try {
            Document doc = Jsoup.connect(url).get();
            Elements img = doc.select("img");
            for (Element ele : img) {
                // absUrl resolves a relative src against the page URL;
                // it returns "" when the attribute is absent or unresolvable.
                String src = ele.absUrl("src");
                if (!src.isEmpty()) {
                    getImage(src);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println("图片下载完成!");
    }

    /**
     * Downloads a single image into the picture directory, using the last
     * path segment of the URL as the file name.
     *
     * @param src absolute image URL
     */
    private static void getImage(String src) {
        // file name = last URL segment (text after the final '/')
        String name = src.substring(src.lastIndexOf("/") + 1);
        File dir = new File(picPath);
        // create the target directory on first use
        if (!dir.exists() && !dir.mkdirs()) {
            System.err.println("无法创建目录: " + picPath);
            return;
        }
        // try-with-resources guarantees both streams are closed even when
        // one of the constructors or the copy loop throws (the original
        // finally block NPE'd on a null 'out' and then leaked 'in').
        try (InputStream in = new BufferedInputStream(new URL(src).openStream());
             OutputStream out = new BufferedOutputStream(
                     new FileOutputStream(new File(dir, name)))) {
            // chunked copy instead of one byte per read() call
            byte[] buf = new byte[8192];
            for (int len; (len = in.read(buf)) != -1; ) {
                out.write(buf, 0, len);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
以上代码是利用jsoup可以抓取到某一个页面的图片,可是我想抓取整个网站的图片该怎么办啊?望众大神给个解决思路!小弟不胜感激!谢谢了!
6 个解决方案
#1
用正则,所谓的网页爬虫,抓到链接,再下载
#2
顶一楼,抓取链接下载
#3
主要有两个线程:图片url抓取线程、图片下载保存线程。
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
#4
url抓取线程:
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
public class GifSpider implements Runnable
{
volatile boolean isRunning = true;
private ThreadPoolExecutor threadPool;
BlockingQueue queue;
public GifSpider(BlockingQueue queue)
{
this.queue = queue;
this.init();
}
/**
* 线程池初始化
*/
private void init()
{
Properties pro = PropertyUtil.getProperties();
int corePoolSize = Integer.parseInt(pro.getProperty(threadpool.corePoolSize));
int maxPoolSize = Integer.parseInt(pro.getProperty(threadpool.maxPoolSize));
int keepAliveSeconds = Integer.parseInt(pro.getProperty(threadpool.keepAliveSeconds));
int queueCap = Integer.parseInt(pro.getProperty(threadpool.queueCapacity));
BlockingQueue queue = new LinkedBlockingQueue(queueCap);
this.threadPool = new ThreadPoolExecutor(
corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
queue);
}
public boolean isRunning()
{
return isRunning;
}
public void setRunning(boolean isRunning)
{
this.isRunning = isRunning;
}
@Override
public void run()
{
while (this.isRunning)
{
try
{
String url = this.queue.take();
System.out.println(请求url: + url);
Document doc = Jsoup.connect(url).get();
//获取所有
Elements s = doc.select(div.pic_list2).first().select(a[href]);
for (Element e : s)
{
//有img 和 文字 两种href,指向相同德图片,只过滤图片href就行了
Elements s1 = e.select(img);
if (s1.size() != 0)
{
String imgUrl = e.absUrl(href);
String text = s1.attr(alt);
Document doc1 = Jsoup.connect(imgUrl).get();
Elements e1 = doc1.getElementById(endtext).select(img);
//网页源码中是相对路径,要获取绝对路径
String realUrl = e1.attr(abs:src);
System.out.println(获取图片url: + realUrl);
//获取到图片url,扔给线程池处理
GifProcessor pro = new GifProcessor(text,realUrl);
this.threadPool.execute(pro);
}
}
Thread.sleep(1000);
} catch (InterruptedException e)
{
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
}
}
}
}
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
public class GifSpider implements Runnable
{
volatile boolean isRunning = true;
private ThreadPoolExecutor threadPool;
BlockingQueue queue;
public GifSpider(BlockingQueue queue)
{
this.queue = queue;
this.init();
}
/**
* 线程池初始化
*/
private void init()
{
Properties pro = PropertyUtil.getProperties();
int corePoolSize = Integer.parseInt(pro.getProperty(threadpool.corePoolSize));
int maxPoolSize = Integer.parseInt(pro.getProperty(threadpool.maxPoolSize));
int keepAliveSeconds = Integer.parseInt(pro.getProperty(threadpool.keepAliveSeconds));
int queueCap = Integer.parseInt(pro.getProperty(threadpool.queueCapacity));
BlockingQueue queue = new LinkedBlockingQueue(queueCap);
this.threadPool = new ThreadPoolExecutor(
corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
queue);
}
public boolean isRunning()
{
return isRunning;
}
public void setRunning(boolean isRunning)
{
this.isRunning = isRunning;
}
@Override
public void run()
{
while (this.isRunning)
{
try
{
String url = this.queue.take();
System.out.println(请求url: + url);
Document doc = Jsoup.connect(url).get();
//获取所有
Elements s = doc.select(div.pic_list2).first().select(a[href]);
for (Element e : s)
{
//有img 和 文字 两种href,指向相同德图片,只过滤图片href就行了
Elements s1 = e.select(img);
if (s1.size() != 0)
{
String imgUrl = e.absUrl(href);
String text = s1.attr(alt);
Document doc1 = Jsoup.connect(imgUrl).get();
Elements e1 = doc1.getElementById(endtext).select(img);
//网页源码中是相对路径,要获取绝对路径
String realUrl = e1.attr(abs:src);
System.out.println(获取图片url: + realUrl);
//获取到图片url,扔给线程池处理
GifProcessor pro = new GifProcessor(text,realUrl);
this.threadPool.execute(pro);
}
}
Thread.sleep(1000);
} catch (InterruptedException e)
{
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
}
}
}
}
#5
已经搞定,每次都post一个页码数据过去。
#6
学习了。
#1
用正则,所谓的网页爬虫,抓到链接,再下载
#2
顶一楼,抓取链接下载
#3
主要有两个线程:图片url抓取线程、图片下载保存线程。
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
图片下载保存采用线程池处理,主要利用java的ThreadPoolExecutor实现。
#4
url抓取线程:
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
public class GifSpider implements Runnable
{
volatile boolean isRunning = true;
private ThreadPoolExecutor threadPool;
BlockingQueue queue;
public GifSpider(BlockingQueue queue)
{
this.queue = queue;
this.init();
}
/**
* 线程池初始化
*/
private void init()
{
Properties pro = PropertyUtil.getProperties();
int corePoolSize = Integer.parseInt(pro.getProperty(threadpool.corePoolSize));
int maxPoolSize = Integer.parseInt(pro.getProperty(threadpool.maxPoolSize));
int keepAliveSeconds = Integer.parseInt(pro.getProperty(threadpool.keepAliveSeconds));
int queueCap = Integer.parseInt(pro.getProperty(threadpool.queueCapacity));
BlockingQueue queue = new LinkedBlockingQueue(queueCap);
this.threadPool = new ThreadPoolExecutor(
corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
queue);
}
public boolean isRunning()
{
return isRunning;
}
public void setRunning(boolean isRunning)
{
this.isRunning = isRunning;
}
@Override
public void run()
{
while (this.isRunning)
{
try
{
String url = this.queue.take();
System.out.println(请求url: + url);
Document doc = Jsoup.connect(url).get();
//获取所有
Elements s = doc.select(div.pic_list2).first().select(a[href]);
for (Element e : s)
{
//有img 和 文字 两种href,指向相同德图片,只过滤图片href就行了
Elements s1 = e.select(img);
if (s1.size() != 0)
{
String imgUrl = e.absUrl(href);
String text = s1.attr(alt);
Document doc1 = Jsoup.connect(imgUrl).get();
Elements e1 = doc1.getElementById(endtext).select(img);
//网页源码中是相对路径,要获取绝对路径
String realUrl = e1.attr(abs:src);
System.out.println(获取图片url: + realUrl);
//获取到图片url,扔给线程池处理
GifProcessor pro = new GifProcessor(text,realUrl);
this.threadPool.execute(pro);
}
}
Thread.sleep(1000);
} catch (InterruptedException e)
{
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
}
}
}
}
package sys.gifspider;
import java.io.IOException;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sys.gifspider.utils.PropertyUtil;
public class GifSpider implements Runnable
{
volatile boolean isRunning = true;
private ThreadPoolExecutor threadPool;
BlockingQueue queue;
public GifSpider(BlockingQueue queue)
{
this.queue = queue;
this.init();
}
/**
* 线程池初始化
*/
private void init()
{
Properties pro = PropertyUtil.getProperties();
int corePoolSize = Integer.parseInt(pro.getProperty(threadpool.corePoolSize));
int maxPoolSize = Integer.parseInt(pro.getProperty(threadpool.maxPoolSize));
int keepAliveSeconds = Integer.parseInt(pro.getProperty(threadpool.keepAliveSeconds));
int queueCap = Integer.parseInt(pro.getProperty(threadpool.queueCapacity));
BlockingQueue queue = new LinkedBlockingQueue(queueCap);
this.threadPool = new ThreadPoolExecutor(
corePoolSize, maxPoolSize, keepAliveSeconds, TimeUnit.SECONDS,
queue);
}
public boolean isRunning()
{
return isRunning;
}
public void setRunning(boolean isRunning)
{
this.isRunning = isRunning;
}
@Override
public void run()
{
while (this.isRunning)
{
try
{
String url = this.queue.take();
System.out.println(请求url: + url);
Document doc = Jsoup.connect(url).get();
//获取所有
Elements s = doc.select(div.pic_list2).first().select(a[href]);
for (Element e : s)
{
//有img 和 文字 两种href,指向相同德图片,只过滤图片href就行了
Elements s1 = e.select(img);
if (s1.size() != 0)
{
String imgUrl = e.absUrl(href);
String text = s1.attr(alt);
Document doc1 = Jsoup.connect(imgUrl).get();
Elements e1 = doc1.getElementById(endtext).select(img);
//网页源码中是相对路径,要获取绝对路径
String realUrl = e1.attr(abs:src);
System.out.println(获取图片url: + realUrl);
//获取到图片url,扔给线程池处理
GifProcessor pro = new GifProcessor(text,realUrl);
this.threadPool.execute(pro);
}
}
Thread.sleep(1000);
} catch (InterruptedException e)
{
e.printStackTrace();
} catch (IOException e)
{
e.printStackTrace();
}
}
}
}
#5
已经搞定,每次都post一个页码数据过去。
#6
学习了。