关于淘宝网评论数据的抓取
第一步 如何获取商品基本信息
我们打开多张淘宝的商品网页,分析网页的URL组成,寻找其中的规律,下面给出一个例子
我们发现了一个规律就是http://item.taobao.com/item.htm?id=商品id
(这个只适用于大部分商品)
第二步 观察浏览器是如何获取指定商品的评论数据
我们在浏览器中是可以浏览商品的评论数据的,我们可以借助chrome浏览器的开发者工具,监测我们在浏览商品的评论数据的时候访问了哪些链接,分析出我们是从哪一个链接中获得了评论数据,这个是关键点。下面是分析过程
我们从上图可以知道评论数据是怎么来的了,关键是要弄清楚这个评论数据的url是怎么来的,这是我们接下来要解决的问题
第三步 评论数据的url怎么来的
对于http://rate.taobao.com/detailCommon.htm?callback=jsonp_reviews_summary2&userNumId=23131616&auctionNumId=44490418808&siteID=4&ua=
这么一个链接,不可能是凭空产生的,那么它唯一的来源就是商品页面:页面中包含了它,或者它的各个部分。那么我们就去分析商品页面的源码,源码内容很多,不好找,把 rate.taobao.com/detailCommon.htm 这一段作为关键字去搜索就得到了
我们在这个地方就可以得到了 淘宝评论的url,
<div id="reviews"
data-reviewApi="//rate.taobao.com/detail_rate.htm?userNumId=23131616&auctionNumId=44490418808&showContent=1&currentPage=1&ismore=0&siteID=4"
data-reviewCountApi=""
data-listApi="//rate.taobao.com/feedRateList.htm?userNumId=23131616&auctionNumId=44490418808&siteID=4"
data-commonApi="//rate.taobao.com/detailCommon.htm?userNumId=23131616&auctionNumId=44490418808&siteID=4"
data-usefulApi="//rate.taobao.com/vote_useful.htm?userNumId=23131616&auctionNumId=44490418808">
</div>
那么接下来就看程序实现了
首先我们要模拟浏览器去访问一个淘宝的商品页面,那么就用HttpClient,
/**
 * Downloads the product page for the given id and collects its review data.
 *
 * @param id product id; it is appended to {@code baseUrl} to form the page URL
 * @return the reviews produced by {@code analysisComments}, or {@code null} when the
 *         product page cannot be fetched or no review URL can be extracted from it
 */
private static List<CommentInfo> grabData(String id) {
    final String productUrl = baseUrl + id;
    log("开始分析商品 " + id + "网页数据:" + productUrl);

    // Fetch the product page; an I/O failure is printed and leaves pageHtml null.
    String pageHtml;
    try {
        pageHtml = Client.sendGet(productUrl);
    } catch (IOException fetchError) {
        fetchError.printStackTrace();
        pageHtml = null;
    }
    if (pageHtml == null) {
        log("打开商品页面失败," + productUrl);
        return null;
    }

    // Extract the review-list endpoint from the page markup.
    // Example of a complete review request:
    // http://rate.taobao.com/feedRateList.htm?_ksTS=1430057796067_1323&callback=jsonp_reviews_list&userNumId=1753146414&auctionNumId=39232136537&siteID=1&currentPageNum=1&rateType=&orderType=sort_weight&showContent=1&attribute=&ua=
    final String reviewUrl = analysis(pageHtml);
    if (reviewUrl == null) {
        log("提取商品评论数据url失败,终止分析");
        return null;
    }

    // Download and parse the review pages.
    return analysisComments(reviewUrl);
}
我们访问了商品页面,那么就要获得其中的评论数据的url,
关键在于 找到 <div id="reviews"
这个标签,然后再去访问这个标签的
data-listApi="//rate.taobao.com/feedRateList.htm?userNumId=23131616&auctionNumId=44490418808&siteID=4"
这个属性的值,这个值就是我们要的评论数据url了。这里就可以用上JSoup了(注意:JSoup解析HTML时会把属性名统一转换成小写,所以代码里读取的是 data-listapi)
/**
 * Extracts the review-list URL from the HTML of a product page.
 *
 * The URL is read from the {@code data-listApi} attribute of the element whose id is
 * {@code reviews}.
 *
 * @param html HTML of the product page; may be {@code null}
 * @return the review-list URL, or {@code null} when {@code html} is {@code null}, the
 *         {@code reviews} element is missing, or the attribute is absent or blank
 */
public static String analysis(String html)
{
    if (html == null) {
        return null;
    }
    Document doc = Jsoup.parse(html);
    Element reviews = doc.getElementById("reviews");
    if (reviews == null) {
        return null;
    }
    System.out.println(reviews.html());
    // jsoup's HTML parser normalizes attribute names to lower case, hence "data-listapi".
    String url = reviews.attr("data-listapi");
    System.out.println(url);
    // attr() yields an empty string, never null, for a missing attribute; map that to
    // null so callers that test for null do not go on to build a request from "".
    if (url == null || url.trim().isEmpty()) {
        return null;
    }
    return url;
}
到这里,一切都很明白了,以下是全部代码
主类
/** * author:tanqidong * create Time:2015年4月26日,下午8:53:03 * description: * fileName:DataObtainer.java */
package com.computer.test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.computer.app.CommentInfoApp;
import com.computer.entity.CommentInfo;
import com.computer.net.Client;
import com.google.gson.Gson;
/** * @author * */
public class DataObtainer {
/** Base URL of a product page; the product id is appended to it. */
private static final String baseUrl="http://item.taobao.com/item.htm?id=";
/**
 * Query parameters appended to the review-list URL. The string ends with the page-number
 * key so that the page index can be appended directly after it.
 *
 * The last key must be spelled "&currentPageNum=": an HTML entity decoder turns the
 * "&curren" prefix into the currency sign, which silently drops the page parameter.
 */
private static final String urlCondition="&callback=jsonp_reviews_list&rateType=&orderType=sort_weight&showContent=1&attribute=&ua=&currentPageNum=";
/** Tag prefixed to log output when no explicit tag is given. */
private static final String defaultTag = "DataObtainer";
/** Maximum number of reviews collected for one product. */
private static final int defaultCommentCount=100;
/**
 * Entry point: grabs the reviews of one hard-coded product id, prints them and hands
 * each one to {@code CommentInfoApp.addComment}.
 *
 * The process exits with status 1 when no review list is returned or the list is empty.
 *
 * @param args unused
 * @throws IOException declared by the signature
 * @throws ClientProtocolException declared by the signature
 */
public static void main(String[] args) throws ClientProtocolException, IOException {
    // Change the product id here to test against another item.
    final String id = "44413860299";

    // Grab the review data of the product.
    final List<CommentInfo> comments = grabData(id);
    if (comments == null) {
        log("获取评论数据失败");
        System.exit(1);
    }
    if (comments.size() == 0) {
        log("获取评论数据条数为0");
        System.exit(1);
    }

    // Persistence goes through CommentInfoApp; a DBConnection-based insert is disabled.
    final CommentInfoApp store = new CommentInfoApp();
    log("抓取的条数:" + comments.size());

    // Preview every review, tag it with the product id and store it.
    for (CommentInfo comment : comments) {
        String preview = comment.getDate() + ":" + comment.getContent();
        log(preview);
        comment.setTaoBaoId(id);
        store.addComment(comment);
    }
}
/**
 * Downloads the product page for the given id and collects its review data.
 *
 * @param id product id; it is appended to {@code baseUrl} to form the page URL
 * @return the reviews produced by {@code analysisComments}, or {@code null} when the
 *         product page cannot be fetched or no review URL can be extracted from it
 */
private static List<CommentInfo> grabData(String id) {
    final String productUrl = baseUrl + id;
    log("开始分析商品 " + id + "网页数据:" + productUrl);

    // Fetch the product page; an I/O failure is printed and leaves pageHtml null.
    String pageHtml;
    try {
        pageHtml = Client.sendGet(productUrl);
    } catch (IOException fetchError) {
        fetchError.printStackTrace();
        pageHtml = null;
    }
    if (pageHtml == null) {
        log("打开商品页面失败," + productUrl);
        return null;
    }

    // Extract the review-list endpoint from the page markup.
    // Example of a complete review request:
    // http://rate.taobao.com/feedRateList.htm?_ksTS=1430057796067_1323&callback=jsonp_reviews_list&userNumId=1753146414&auctionNumId=39232136537&siteID=1&currentPageNum=1&rateType=&orderType=sort_weight&showContent=1&attribute=&ua=
    final String reviewUrl = analysis(pageHtml);
    if (reviewUrl == null) {
        log("提取商品评论数据url失败,终止分析");
        return null;
    }

    // Download and parse the review pages.
    return analysisComments(reviewUrl);
}
/**
 * Downloads review pages one after another and collects the parsed reviews.
 *
 * The total number of pages is not known in advance, so pages are requested starting at
 * page 1 until one of the following happens: a page yields no reviews, a response is not
 * a {@code jsonp_reviews_list(...)} payload, or {@code defaultCommentCount} reviews have
 * been collected.
 *
 * @param url review-list URL without the paging parameters
 * @return the collected reviews (at most {@code defaultCommentCount}), or {@code null}
 *         when fetching a page fails
 */
private static List<CommentInfo> analysisComments(String url) {
    final String jsonpPrefix = "jsonp_reviews_list(";
    // Accumulates the reviews of all pages.
    List<CommentInfo> data = new ArrayList<CommentInfo>();
    int page = 1;
    while (true) {
        log("开始分析商品评论数据页面:" + page);
        String html = getComments(url, page);
        log(html);
        if (html == null) {
            log("获取商品评论页面失败,结束获取");
            return null;
        }
        if (!html.contains(jsonpPrefix)) {
            log("获得的商品数据格式无法解析:" + html);
            log("结束获取");
            break;
        }
        List<CommentInfo> list = analysisJson(unwrapJsonp(html, jsonpPrefix));
        // An empty page ends the crawl just like a missing one; otherwise a server that
        // keeps answering with an empty "comments" array would make this loop endless.
        if (list == null || list.isEmpty()) {
            log("当前页面:" + page + " 已经没有更多数据了,结束获取");
            break;
        }
        // Take only as many reviews as still fit under the limit.
        int remaining = Math.max(0, defaultCommentCount - data.size());
        if (list.size() <= remaining) {
            data.addAll(list);
        } else {
            data.addAll(list.subList(0, remaining));
        }
        if (data.size() >= defaultCommentCount) {
            break;
        }
        page++;
    }
    return data;
}

/**
 * Returns the text between the JSONP callback prefix and the last closing parenthesis.
 * When no closing parenthesis follows the prefix, everything after the prefix is returned.
 * The caller must have checked that {@code body} contains {@code prefix}.
 */
private static String unwrapJsonp(String body, String prefix) {
    int start = body.indexOf(prefix) + prefix.length();
    int end = body.lastIndexOf(')');
    if (end < start) {
        return body.substring(start).trim();
    }
    return body.substring(start, end).trim();
}
/**
 * Downloads one page of review data.
 *
 * @param url review-list URL without the paging parameters; a URL that does not start
 *            with {@code http:} or {@code https:} (for example a protocol-relative
 *            {@code //host/path}) is prefixed with {@code http:}
 * @param page page number to request
 * @return the response body decoded as GBK, or {@code null} when the request fails
 */
private static String getComments(String url, int page) {
    String commentUrl = url + urlCondition + page;
    // Only add a scheme when none is present; prefixing an https URL would corrupt it.
    boolean hasScheme = commentUrl.startsWith("http:") || commentUrl.startsWith("https:");
    if (!hasScheme) {
        commentUrl = "http:" + commentUrl;
    }
    try {
        return Client.sendGet(commentUrl, "GBK");
    } catch (IOException e) {
        e.printStackTrace();
        log("打开页面失败! " + commentUrl);
        return null;
    }
}
/**
 * Converts the JSON text of one review page into {@code CommentInfo} objects.
 *
 * @param html JSON text holding a {@code comments} array
 * @return one {@code CommentInfo} per array element, built from its {@code date} and
 *         {@code content} fields; {@code null} when the text cannot be parsed (the text
 *         is logged in that case) or holds no {@code comments} array
 */
public static List<CommentInfo> analysisJson(String html)
{
    JSONArray comments;
    try {
        comments = JSONObject.parseObject(html).getJSONArray("comments");
    } catch (Exception parseFailure) {
        // Log the offending payload and treat it like a page without reviews.
        log(html);
        comments = null;
    }
    if (comments == null) {
        return null;
    }

    List<CommentInfo> parsed = new ArrayList<CommentInfo>();
    int index = 0;
    while (index < comments.size()) {
        JSONObject entry = comments.getJSONObject(index);
        String content = entry.getString("content");
        String date = entry.getString("date");
        parsed.add(new CommentInfo(date, content));
        index++;
    }
    return parsed;
}
/**
 * Extracts the review-list URL from the HTML of a product page.
 *
 * The URL is read from the {@code data-listApi} attribute of the element whose id is
 * {@code reviews}.
 *
 * @param html HTML of the product page; may be {@code null}
 * @return the review-list URL, or {@code null} when {@code html} is {@code null}, the
 *         {@code reviews} element is missing, or the attribute is absent or blank
 */
public static String analysis(String html)
{
    if (html == null) {
        return null;
    }
    Document doc = Jsoup.parse(html);
    Element reviews = doc.getElementById("reviews");
    if (reviews == null) {
        return null;
    }
    System.out.println(reviews.html());
    // jsoup's HTML parser normalizes attribute names to lower case, hence "data-listapi".
    String url = reviews.attr("data-listapi");
    System.out.println(url);
    // attr() yields an empty string, never null, for a missing attribute; map that to
    // null so callers that test for null do not go on to build a request from "".
    if (url == null || url.trim().isEmpty()) {
        return null;
    }
    return url;
}
/**
 * Prints a message under the default tag.
 *
 * @param ss message text
 */
private static void log(String ss)
{
    log(defaultTag, ss);
}

/**
 * Prints {@code tag}, a colon and a space, then the message, as one line on standard output.
 *
 * @param tag prefix of the line
 * @param ss message text
 */
private static void log(String tag, String ss)
{
    StringBuilder line = new StringBuilder();
    line.append(tag).append(": ").append(ss);
    System.out.println(line.toString());
}
}
网络访问类
package com.computer.net;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.client.protocol.ClientContext;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.BufferedHttpEntity;
import org.apache.http.entity.mime.MultipartEntityBuilder;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
public class Client {
// Shared client instance. It is created eagerly here, so the null branch in
// getHttpClient() only runs if this field is set to null from elsewhere.
// sendGet(...) and sendGet_byte(...) overwrite it with a new DefaultHttpClient on each call.
public static HttpClient httpClient=new DefaultHttpClient();
// Last cookie named "IRSSessionID" seen by the sendGet methods. It is only assigned in
// this class; the only statements that would use it are commented out.
private static Cookie mCookie=null;
// Cookie store attached to the context of every request; created lazily on first use.
private static CookieStore mCookieStore=null;
/**
 * Returns the shared HTTP client, creating and configuring it when the static field is
 * {@code null}.
 *
 * A newly created client uses the BEST_MATCH cookie policy and a browser-like
 * User-Agent string.
 *
 * @return the shared {@code HttpClient} instance
 */
public static HttpClient getHttpClient()
{
    if (httpClient == null)
    {
        final String userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; Shuame)";
        httpClient = new DefaultHttpClient();
        httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
        // The parameter key for the User-Agent is "http.useragent" (the value of
        // CoreProtocolPNames.USER_AGENT). "User-Agent" is a header name, not a parameter
        // name, so setting it as a parameter has no effect on outgoing requests.
        httpClient.getParams().setParameter("http.useragent", userAgent);
    }
    return httpClient;
}
/**
 * Sends a GET request and returns the raw response body.
 *
 * The request shares the class-wide cookie store. A dedicated client is used for the
 * call and shut down afterwards so that its connection is not leaked.
 *
 * @param url address to request
 * @return the response body; {@code null} when the request fails or the response has no
 *         entity
 */
public static byte[] sendGet_byte(String url)
{
    HttpGet httpGet = new HttpGet(url);
    HttpContext httpContext = new BasicHttpContext();
    if (mCookieStore == null)
    {
        mCookieStore = new BasicCookieStore();
    }
    httpContext.setAttribute(ClientContext.COOKIE_STORE, mCookieStore);

    HttpClient client = new DefaultHttpClient();
    InputStream is = null;
    try {
        HttpResponse httpResponse = client.execute(httpGet, httpContext);
        HttpEntity httpEntity = httpResponse.getEntity();
        if (httpEntity == null)
        {
            return null;
        }
        is = httpEntity.getContent();
        return readStream(is);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    } finally {
        // Close the body stream and release the connection on every path.
        if (is != null)
        {
            try {
                is.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
        client.getConnectionManager().shutdown();
    }
}
/**
 * Sends a GET request and returns the response body decoded as UTF-8.
 *
 * Delegates to {@link #sendGet(String, String)}, which performs the same steps with a
 * caller-supplied charset.
 *
 * @param url address to request
 * @return the response body, or {@code null} when the response has no entity
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException when the request or reading the response fails
 */
public static String sendGet(String url) throws ClientProtocolException, IOException{
    return sendGet(url, "UTF-8");
}
/**
 * Sends a GET request and returns the response body decoded with the given charset.
 *
 * The request shares the class-wide cookie store; a cookie named "IRSSessionID" found
 * in the store after the request is remembered and its value printed. A dedicated
 * client is used for the call and shut down afterwards so that its connection is not
 * leaked.
 *
 * @param url address to request
 * @param charset name of the charset used to decode the response body
 * @return the response body, or {@code null} when the response has no entity
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException when the request or reading the response fails
 */
public static String sendGet(String url,String charset) throws ClientProtocolException, IOException{
    HttpGet get = new HttpGet(url);
    HttpContext context = new BasicHttpContext();
    if (mCookieStore == null) {
        mCookieStore = new BasicCookieStore();
    }
    context.setAttribute(ClientContext.COOKIE_STORE, mCookieStore);

    HttpClient client = new DefaultHttpClient();
    InputStream in = null;
    try {
        HttpResponse response = client.execute(get, context);

        // Walk the stored cookies from last to first, as before.
        List<Cookie> cookies = mCookieStore.getCookies();
        for (int i = cookies.size() - 1; i >= 0; i--) {
            Cookie cookie = cookies.get(i);
            if (cookie.getName().equalsIgnoreCase("IRSSessionID")) {
                mCookie = cookie;
                System.out.println(cookie.getValue());
            }
        }

        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        in = new BufferedHttpEntity(entity).getContent();
        // Collect the body in a growable buffer instead of re-copying an array per chunk.
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            body.write(chunk, 0, count);
        }
        return new String(body.toByteArray(), charset);
    } finally {
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            get.abort();
            client.getConnectionManager().shutdown();
        }
    }
}
/**
 * Sends a form-encoded POST request and returns the response body decoded as UTF-8.
 *
 * The form fields are encoded as GBK while the response is decoded as UTF-8.
 * NOTE(review): confirm that this charset mismatch is intended; use the overload that
 * takes a charset to apply one charset to both.
 *
 * @param url address to post to
 * @param params form fields; {@code null} is treated as no fields
 * @return the response body, or {@code null} when the response has no entity
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException when the request or reading the response fails
 */
public static String sendPost(String url, Map<String, String> params) throws ClientProtocolException, IOException{
    HttpPost post = new HttpPost(url);

    // Build the form body from the parameter map.
    List<NameValuePair> qparams = new ArrayList<NameValuePair>();
    if (params != null) {
        for (Map.Entry<String, String> field : params.entrySet()) {
            qparams.add(new BasicNameValuePair(field.getKey(), field.getValue()));
        }
    }
    post.setEntity(new UrlEncodedFormEntity(qparams, "GBK"));

    HttpContext httpContext = new BasicHttpContext();
    if (mCookieStore == null) {
        mCookieStore = new BasicCookieStore();
    }
    httpContext.setAttribute(ClientContext.COOKIE_STORE, mCookieStore);

    HttpClient client = new DefaultHttpClient();
    InputStream in = null;
    try {
        HttpResponse response = client.execute(post, httpContext);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        in = new BufferedHttpEntity(entity).getContent();
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            body.write(chunk, 0, count);
        }
        return new String(body.toByteArray(), "UTF-8");
    } finally {
        // Release the stream, the request and the per-call client on every path.
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            post.abort();
            client.getConnectionManager().shutdown();
        }
    }
}
/**
 * Sends a form-encoded POST request and returns the response body; the given charset is
 * used both to encode the form fields and to decode the response.
 *
 * @param url address to post to
 * @param params form fields; {@code null} is treated as no fields
 * @param charset name of the charset for the form body and the response body
 * @return the response body, or {@code null} when the response has no entity
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException when the request or reading the response fails
 */
public static String sendPost(String url, Map<String, String> params,String charset) throws ClientProtocolException, IOException{
    HttpPost post = new HttpPost(url);

    // Build the form body from the parameter map.
    List<NameValuePair> qparams = new ArrayList<NameValuePair>();
    if (params != null) {
        for (Map.Entry<String, String> field : params.entrySet()) {
            qparams.add(new BasicNameValuePair(field.getKey(), field.getValue()));
        }
    }
    post.setEntity(new UrlEncodedFormEntity(qparams, charset));

    HttpContext httpContext = new BasicHttpContext();
    if (mCookieStore == null) {
        mCookieStore = new BasicCookieStore();
    }
    httpContext.setAttribute(ClientContext.COOKIE_STORE, mCookieStore);

    HttpClient client = new DefaultHttpClient();
    InputStream in = null;
    try {
        HttpResponse response = client.execute(post, httpContext);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        in = new BufferedHttpEntity(entity).getContent();
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            body.write(chunk, 0, count);
        }
        return new String(body.toByteArray(), charset);
    } finally {
        // Release the stream, the request and the per-call client on every path.
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            post.abort();
            client.getConnectionManager().shutdown();
        }
    }
}
/**
 * Reads the stream until end of input and returns the bytes read.
 *
 * An {@code IOException} during reading is printed and ends the read; the bytes
 * collected up to that point are returned. The stream is not closed here.
 *
 * @param is stream to read from
 * @return the bytes read from the stream
 */
private static byte[] readStream(InputStream is)
{
    ByteArrayOutputStream collected = new ByteArrayOutputStream();
    byte[] chunk = new byte[1024];
    try {
        for (int count = is.read(chunk); count != -1; count = is.read(chunk))
        {
            collected.write(chunk, 0, count);
        }
    } catch (IOException readError) {
        readError.printStackTrace();
    }
    return collected.toByteArray();
}
/**
 * Sends a GET request whose query string is built from the given parameters.
 *
 * Each entry is appended as {@code key=value}; the first one follows a {@code ?} and
 * the others follow a {@code &}. Keys and values are appended as they are, without URL
 * encoding. With an empty map the URL is requested unchanged.
 *
 * @param url address without a query string
 * @param params query parameters; must not be {@code null}
 * @return the response body as returned by {@link #sendGet(String)}
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException when the request or reading the response fails
 */
public static String sendGet(String url, Map<String, String> params) throws ClientProtocolException, IOException {
    StringBuilder query = new StringBuilder();
    for (Map.Entry<String, String> pair : params.entrySet()) {
        // The first pair opens the query string, later pairs are chained to it.
        if (query.length() == 0) {
            query.append("?");
        } else {
            query.append("&");
        }
        query.append(pair.getKey());
        query.append("=");
        query.append(pair.getValue());
    }
    String fullUrl = url + query;
    return sendGet(fullUrl);
}
/* public static HttpClient getHttpClient(Context paramContext) { BasicHttpParams localBasicHttpParams = new BasicHttpParams(); HttpConnectionParams.setConnectionTimeout(localBasicHttpParams, 50000); HttpConnectionParams.setSoTimeout(localBasicHttpParams, 200000); DefaultHttpClient localDefaultHttpClient = new DefaultHttpClient(localBasicHttpParams); if (!((WifiManager)paramContext.getSystemService("wifi")).isWifiEnabled()) { Uri localUri = Uri.parse("content://telephony/carriers/preferapn"); Cursor localCursor = paramContext.getContentResolver().query(localUri, null, null, null, null); if ((localCursor != null) && (localCursor.moveToFirst())) { String str = localCursor.getString(localCursor.getColumnIndex("proxy")); if ((str != null) && (str.trim().length() > 0)) { HttpHost localHttpHost = new HttpHost(str, 80); localDefaultHttpClient.getParams().setParameter("http.route.default-proxy", localHttpHost); } localCursor.close(); } } return localDefaultHttpClient; } */
/**
 * Uploads a file together with text fields as a multipart POST request and returns the
 * response body.
 *
 * The request goes through the shared client returned by {@code getHttpClient()} and
 * uses the class-wide cookie store.
 *
 * @param url address to post to
 * @param fileName name of the multipart part that carries the file
 * @param file file to upload
 * @param params additional text fields; {@code null} is treated as no fields
 * @param charset name of the charset used to decode the response body
 * @return the response body, or {@code null} when the response has no entity
 * @throws IOException when the request or reading the response fails
 */
public static String uploadFile(String url,String fileName,File file, Map<String, String> params, String charset) throws IOException
{
    MultipartEntityBuilder meb = MultipartEntityBuilder.create();
    meb.addBinaryBody(fileName, file);
    if (params != null) {
        for (Map.Entry<String, String> field : params.entrySet()) {
            meb.addTextBody(field.getKey(), field.getValue());
        }
    }

    HttpClient client = getHttpClient();
    HttpPost post = new HttpPost(url);
    post.setEntity(meb.build());

    HttpContext httpContext = new BasicHttpContext();
    if (mCookieStore == null) {
        mCookieStore = new BasicCookieStore();
    }
    httpContext.setAttribute(ClientContext.COOKIE_STORE, mCookieStore);

    InputStream in = null;
    try {
        // A failed request propagates as IOException, which the signature declares.
        // Swallowing it here would leave no response and end in a NullPointerException.
        HttpResponse response = client.execute(post, httpContext);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        in = new BufferedHttpEntity(entity).getContent();
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        int count;
        while ((count = in.read(chunk)) != -1) {
            body.write(chunk, 0, count);
        }
        return new String(body.toByteArray(), charset);
    } finally {
        // Release the stream and the request on every path.
        try {
            if (in != null) {
                in.close();
            }
        } finally {
            post.abort();
        }
    }
}
}
需要的库主要是 httpclient(上传文件用到的 MultipartEntityBuilder 还需要 httpmime)、jsoup、fastjson、gson。