从Webcollector + Spring + MVC 搭建应用初探(一)的代码内容中已经完成了基本的数据抓取
部分,但由于Webcollector的抓取能力过于“强大”,若按与前文相同的抓取速率对up主信息进行
爬取会被封禁,所以将前述depth 3 部分改用单线程,并使用Spring初探(七)中的
时间调度部分进行调度运行。
下面是代码,可以与Webcollector + Spring + MVC 搭建应用初探(一)中depth 3部分相对照:
package CrawlerGroup.crawl.AnchorCrawler;
import CrawlerGroup.crawl.JdbcManager.AnchorJDBCTemplate;
import org.apache.http.*;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import CrawlerGroup.crawl.JedisManager.RedisAPI;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import java.util.ArrayList;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.json.JSONObject;
/**
 * Single-threaded, cron-driven crawler for BiliBili uploader ("anchor") profiles.
 *
 * <p>On every tick one member id is taken from the Redis sorted set
 * {@code Anchor:BiliBili}, the member-info endpoint is queried for that id, and the
 * returned name, fan count and play count are stored through
 * {@link AnchorJDBCTemplate#create}.
 *
 * Created by admin on 2016/12/27.
 */
@Component
public class ScheduleTask {
    /** Endpoint that returns a member's profile as JSON for a POSTed {@code mid}. */
    private static final String MEMBER_INFO_URL = "http://space.bilibili.com/ajax/member/GetInfo";

    private JedisPool pool = RedisAPI.getPool();
    private final String anchorZSet = "Anchor:BiliBili";
    private AnchorJDBCTemplate anchorJDBCTemplate;

    /**
     * Setter used by the Spring XML configuration to inject the DAO.
     *
     * @param anchorJDBCTemplate DAO used to persist crawled profiles
     */
    public void setAnchorJDBCTemplate(AnchorJDBCTemplate anchorJDBCTemplate) {
        this.anchorJDBCTemplate = anchorJDBCTemplate;
    }

    /**
     * Crawls at most one member per invocation; scheduled every 5 seconds.
     *
     * <p>Does nothing when the sorted set is empty or when the endpoint answers
     * with a status other than 200.
     *
     * @throws Exception on Redis, HTTP, JSON or number-parsing failures
     */
    @Scheduled(cron="0/5 * * * * ? ")
    public void runCrawlerInstance() throws Exception {
        String mid = pollNextMid();
        if (mid == null) {
            return;
        }
        String result = fetchMemberInfo(mid);
        if (result == null) {
            return;
        }
        // Look the "data" object up once instead of once per field.
        JSONObject data = new JSONObject(result).getJSONObject("data");
        String anchor = data.getString("name");
        int i_mid = Integer.parseInt(mid);
        int fan_num = data.getInt("fans");
        int play_num = data.getInt("playNum");
        anchorJDBCTemplate.create(anchor, i_mid, play_num, fan_num);
    }

    /**
     * Removes and returns the first member id of the sorted set.
     *
     * <p>The id is removed before it is crawled, so an id whose fetch later fails
     * is not retried. The Redis connection is always handed back to the pool;
     * previously it was never released and the pool leaked one connection per tick.
     *
     * @return the member id, or {@code null} when the set is empty
     */
    private String pollNextMid() {
        Jedis jedis = pool.getResource();
        try {
            Object[] mid_array = jedis.zrange(anchorZSet, 0, 0).toArray();
            if (mid_array.length == 0) {
                return null;
            }
            String mid = (String) mid_array[0];
            jedis.zrem(anchorZSet, mid);
            return mid;
        } finally {
            // For a pooled instance close() returns the connection to the pool.
            jedis.close();
        }
    }

    /**
     * POSTs {@code mid} to the member-info endpoint and returns the response body.
     *
     * @param mid member id to look up
     * @return the response body, or {@code null} when the status code is not 200
     * @throws Exception on request or I/O failure
     */
    private String fetchMemberInfo(String mid) throws Exception {
        HttpPost httppost = new HttpPost(MEMBER_INFO_URL);
        ArrayList<BasicNameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("mid", mid));
        httppost.setEntity(new UrlEncodedFormEntity(params, HTTP.UTF_8));
        httppost.setHeader("Host", "space.bilibili.com");
        httppost.setHeader("Origin", "http://space.bilibili.com");
        httppost.setHeader("Referer", String.format("http://space.bilibili.com/%s/", mid));

        DefaultHttpClient client = new DefaultHttpClient();
        try {
            HttpResponse response = client.execute(httppost);
            if (response.getStatusLine().getStatusCode() != 200) {
                return null;
            }
            // Fall back to UTF-8 when the response declares no charset, so that
            // non-ASCII names are not decoded as ISO-8859-1.
            return EntityUtils.toString(response.getEntity(), HTTP.UTF_8);
        } finally {
            // A fresh client is created per call; release its connections every time.
            client.getConnectionManager().shutdown();
        }
    }
}
package CrawlerGroup.crawl.JdbcManager;
import cn.edu.hfut.dmic.webcollector.fetcher.Executor;
import org.springframework.jdbc.core.JdbcTemplate;
import javax.sql.DataSource;
import java.util.List;
/**
 * {@link JdbcTemplate}-backed implementation of {@link AnchorDAO} operating on the
 * {@code Anchor} table (columns NAME, TID, PLAYNUM, FANNUM, CHANGETIME).
 *
 * Created by admin on 2016/12/26.
 */
public class AnchorJDBCTemplate implements AnchorDAO {
    private DataSource dataSource;
    private JdbcTemplate jdbcTemplateObject;

    /**
     * Stores the data source and builds the {@link JdbcTemplate} used by every query.
     *
     * @param dataSource JDBC data source to run statements against
     */
    public void setDataSource(DataSource dataSource) {
        this.dataSource = dataSource;
        this.jdbcTemplateObject = new JdbcTemplate(dataSource);
    }

    /**
     * Inserts a row for {@code tid}, or refreshes the counters when a row with that
     * TID already exists (in which case {@code name} is not rewritten).
     *
     * @param name     uploader name
     * @param tid      uploader id stored in the TID column
     * @param play_num play count
     * @param fan_num  fan count
     */
    public void create(String name, Integer tid, Integer play_num, Integer fan_num) {
        int count = jdbcTemplateObject.queryForObject("select count(*) from Anchor where TID=?;", new Object[] { tid }, Integer.class);
        if (count != 0) {
            update(tid, play_num, fan_num);
            return;
        }
        String SQL = "insert into Anchor (NAME, TID, PLAYNUM, FANNUM, CHANGETIME) values (?,?,?,?,NOW())";
        jdbcTemplateObject.update(SQL, name, tid, play_num, fan_num);
        System.out.println("Created Record Name = " + name + " Tid = " + tid);
    }

    /**
     * Loads the row whose TID equals {@code tid}, mapped by {@code AnchorManager}.
     *
     * @param tid uploader id
     * @return the mapped anchor
     */
    public Anchor getAnchor(Integer tid) {
        String SQL = "select * from Anchor where TID = ?";
        Anchor anchor = jdbcTemplateObject.queryForObject(SQL, new Object[]{tid}, new AnchorManager());
        return anchor;
    }

    /**
     * Loads every row of the table, mapped by {@code AnchorManager}.
     *
     * @return all anchors
     */
    public List<Anchor> listAnchors() {
        String SQL = "select * from Anchor";
        List<Anchor> anchors = jdbcTemplateObject.query(SQL, new AnchorManager());
        return anchors;
    }

    /**
     * Deletes the rows whose TID equals {@code tid}.
     *
     * @param tid uploader id
     */
    public void delete(Integer tid) {
        String SQL = "delete from Anchor where TID = ?";
        jdbcTemplateObject.update(SQL, tid);
    }

    /**
     * Updates the play count, fan count and change time of the row with {@code tid}.
     *
     * @param tid      uploader id
     * @param play_num new play count
     * @param fan_num  new fan count
     */
    public void update(Integer tid, Integer play_num, Integer fan_num) {
        // Assignments in SET are separated by commas. The previous statement joined
        // them with AND, which made the whole right-hand side one boolean expression
        // assigned to PLAYNUM and left FANNUM and CHANGETIME untouched.
        String SQL = "update Anchor set PLAYNUM = ?, FANNUM = ?, CHANGETIME = NOW() where TID = ?";
        jdbcTemplateObject.update(SQL, play_num, fan_num, tid);
    }
}
启动代码:
package CrawlerGroup.crawl.AnchorCrawler;
import org.springframework.beans.factory.BeanFactory;
import org.springframework.context.support.ClassPathXmlApplicationContext;
/**
 * Entry point: loads {@code Beans.xml} from the classpath, which starts the
 * scheduled crawler declared there.
 *
 * Created by ehang on 2016/12/27.
 */
public class Application {
    /**
     * Boots the Spring context and leaves it running.
     *
     * @param args unused
     * @throws Exception if the context cannot be created
     */
    public static void main(String[] args) throws Exception {
        ClassPathXmlApplicationContext context = new ClassPathXmlApplicationContext("Beans.xml");
        // Close the context (and with it schedulers and singleton beans) when the
        // JVM exits, instead of abandoning it.
        context.registerShutdownHook();
    }
}
配置文件:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring configuration for the scheduled anchor crawler: scheduling support,
     the crawler task, the MySQL data source and the JDBC DAO. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:context="http://www.springframework.org/schema/context"
xmlns:task="http://www.springframework.org/schema/task"
xsi:schemaLocation="http://www.springframework.org/schema/beans
http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
http://www.springframework.org/schema/task
http://www.springframework.org/schema/task/spring-task-3.0.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-3.0.xsd">
<!-- Turns on processing of @Scheduled methods. -->
<task:annotation-driven/>
<!-- Annotation support for beans declared in this file. -->
<context:annotation-config/>
<!-- NOTE(review): annotation-config normally registers this post-processor already,
     so this explicit declaration looks redundant; confirm before removing. -->
<bean class="org.springframework.beans.factory.annotation.AutowiredAnnotationBeanPostProcessor"/>
<!-- Scans the crawler package for annotated components such as ScheduleTask. -->
<context:component-scan base-package="CrawlerGroup.crawl.AnchorCrawler"/>
<!-- Explicit definition of the crawler task so the DAO can be injected through its setter.
     NOTE(review): ScheduleTask is also annotated with @Component; presumably this
     definition replaces the scanned one because the bean name matches. Verify that
     only one instance ends up scheduled. -->
<bean id="scheduleTask" class="CrawlerGroup.crawl.AnchorCrawler.ScheduleTask" >
<property name="anchorJDBCTemplate" ref="anchorJDBCTemplate"/>
</bean>
<!-- MySQL connection settings: local "test" database, user root, empty password. -->
<bean id="dataSource" class = "org.springframework.jdbc.datasource.DriverManagerDataSource" >
<property name = "driverClassName" value = "com.mysql.jdbc.Driver"/>
<property name = "url" value = "jdbc:mysql://localhost:3306/test"/>
<property name = "username" value = "root"/>
<property name = "password" value = ""/>
</bean>
<!-- DAO for the Anchor table, wired to the data source above. -->
<bean id="anchorJDBCTemplate" class = "CrawlerGroup.crawl.JdbcManager.AnchorJDBCTemplate" >
<property name = "dataSource" ref = "dataSource"/>
</bean>
</beans>
至此数据准备部分结束,后续任务主要围绕Spring Web Service完成页面及数据交互部分,
参见后续的该类别文章及Spring初探系列。(Webcollector + Spring + MVC 搭建应用初探(三))