/**
*
*/
package 新浪微博爬虫;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Random;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.remote.DesiredCapabilities;
import org.openqa.selenium.remote.RemoteWebDriver;
/*
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches",["ignore-certificate-errors"])
driver = webdriver.Chrome(chrome_options=options)
*/
/**
* @author Administrator
*
*/
public class Spider {
    /**
     * Location of the chromedriver executable (the WebDriver server), not the
     * Chrome browser itself.
     * NOTE(review): the driver binary has to be compatible with the installed
     * Chrome version - confirm the one at this path is current.
     */
    private static final String CHROME_DRIVER_PATH =
            "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe";

    /** Weibo profile page that is scraped. */
    private static final String PROFILE_URL = "http://weibo.com/u/6033327972";

    /**
     * Starts a local ChromeDriver service, opens the Weibo profile page, expands
     * the comment section of every post and prints each post's text together
     * with its comments to standard output.
     *
     * <p>The browser session and the driver service are always shut down, even
     * when scraping fails, so no chromedriver process is left behind.
     *
     * @param args unused
     * @throws IOException if the ChromeDriver service cannot be started
     * @throws InterruptedException if the thread is interrupted while waiting
     *         for the page to load
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        Random random = new Random(System.currentTimeMillis());

        // This property must name the chromedriver executable; the original code
        // pointed it at chrome.exe (the browser), which is not a driver.
        System.setProperty("webdriver.chrome.driver", CHROME_DRIVER_PATH);

        // Create the ChromeDriver service used to talk to Chrome.
        @SuppressWarnings("deprecation")
        ChromeDriverService service = new ChromeDriverService.Builder()
                .usingDriverExecutable(new File(CHROME_DRIVER_PATH))
                .usingAnyFreePort()
                .build();
        service.start();
        try {
            // Create a Chrome browser instance through the running service.
            WebDriver driver = new RemoteWebDriver(service.getUrl(),
                    DesiredCapabilities.chrome());
            try {
                scrape(driver, random);
            } finally {
                // Close the browser even if scraping threw.
                driver.quit();
            }
        } finally {
            // Shut down the ChromeDriver service even if the session failed.
            service.stop();
        }
    }

    /**
     * Loads the profile page, expands all comment sections and prints every
     * non-repost entry followed by its comment text.
     */
    private static void scrape(WebDriver driver, Random random) throws InterruptedException {
        long waitLoadBaseTime = 10000;
        int waitLoadRandomTime = 3000;

        // Open the Weibo profile page.
        driver.get(PROFILE_URL);
        // Wait (with a random jitter) for the dynamically loaded page to settle.
        Thread.sleep(waitLoadBaseTime + random.nextInt(waitLoadRandomTime));

        // Container element of each post.
        List<WebElement> posts =
                driver.findElements(By.cssSelector("div[action-type=feed_list_item]"));
        // Text element of each post (original content or repost reason).
        List<WebElement> contents = driver.findElements(By.cssSelector(
                "div[node-type=feed_list_reason],div[node-type=feed_list_content]"));
        System.out.println(posts.size());

        for (WebElement post : posts) {
            // Expand the comment section of this post.
            post.findElement(By.cssSelector("a[action-type=fl_comment]")).click();
            Thread.sleep(1000);
        }

        // Comment lists, available only after the sections were expanded.
        List<WebElement> commentLists =
                driver.findElements(By.cssSelector("div[node-type=feed_list_commentList]"));
        System.out.println(commentLists.size());

        // commentIndex advances only for entries that are actually printed, so
        // skipped "转发微博" texts do not consume a comment list.
        int commentIndex = 0;
        for (int i = 0; i < contents.size() && commentIndex < commentLists.size(); i++) {
            String content = contents.get(i).getText();
            if (content.contains("转发微博")) {
                continue;
            }
            System.out.println("content:" + content);

            // Read the comment text once; each getText() is a driver round trip.
            String comment = commentLists.get(commentIndex).getText();
            if (comment.isEmpty()) {
                System.out.println("comment:no comment");
            } else {
                System.out.println("comment:" + comment);
            }
            commentIndex++;
        }
    }
}
程序启动后,弹出“chromedriver.exe 已停止工作”的提示
结束进程
弹出错误
Starting ChromeDriver (v2.7.236900) on port 15302
Exception in thread "main" org.openqa.selenium.remote.UnreachableBrowserException: Error communicating with the remote browser. It may have died.
Build info: version: '2.42.2', revision: '6a6995d', time: '2014-06-03 17:42:30'
System info: host: 'MS-20160515FWNU', ip: '192.168.1.190', os.name: 'Windows 7', os.arch: 'amd64', os.version: '6.1', java.version: '1.8.0_92'
Driver info: driver.version: RemoteWebDriver
at org.openqa.selenium.remote.RemoteWebDriver.execute(RemoteWebDriver.java:593)
at org.openqa.selenium.remote.RemoteWebDriver.get(RemoteWebDriver.java:304)
at 新浪微博爬虫.Spider.main(Spider.java:54)
Caused by: java.net.SocketException: Connection reset
at java.net.SocketInputStream.read(SocketInputStream.java:209)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:136)
at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:152)
at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:270)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:140)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:57)
at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:260)
at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:161)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.http.impl.conn.CPoolProxy.invoke(CPoolProxy.java:138)
at com.sun.proxy.$Proxy0.receiveResponseHeader(Unknown Source)
at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:271)
at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:123)
at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:254)
at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:195)
at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:85)
at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:108)
at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:186)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:72)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:57)
at org.openqa.selenium.remote.HttpCommandExecutor.fallBackExecute(HttpCommandExecutor.java:204)
at org.openqa.selenium.remote.HttpCommandExecutor.execute(HttpCommandExecutor.java:173)
at org.openqa.selenium.remote.RemoteWebDriver.execute(RemoteWebDriver.java:572)
... 2 more
经确认,地址没错。
环境:最新版 Chrome、最新版 chromedriver、selenium-2.42.2
求大神解惑!!
6 个解决方案
#1
哇 写爬虫 好厉害..
org.openqa.selenium什么工具包 0-0
org.openqa.selenium什么工具包 0-0
#2
遇到相同问题,求大神赐教!
#3
都这么溜 但我不知道
#4
用selenium来写爬虫,深深的震惊了。
#5
你们都是大神,我是菜鸟
#6
楼主解决了么?我遇到同样的问题
#1
哇 写爬虫 好厉害..
org.openqa.selenium什么工具包 0-0
org.openqa.selenium什么工具包 0-0
#2
遇到相同问题,求大神赐教!
#3
都这么溜 但我不知道
#4
用selenium来写爬虫,深深的震惊了。
#5
你们都是大神,我是菜鸟
#6
楼主解决了么?我遇到同样的问题