- 代码清单:
- ==============================SQL====================================
- 计划任务表
- ==============================定时任务模块类====================================
- 计划管理DAO接口 CmsSchedulerDao.java
- 计划管理DAO接口实现类 CmsSchedulerDaoImpl.java
- 计划任务管理服务接口 CmsSchedulerMng.java
- 计划任务管理服务接口实现类 CmsSchedulerMngImpl.java
- 定时任务管理接口 SchedulerTaskManageSvc.java
- 定时任务管理接口实现类 SchedulerTaskManageSvcImpl.java
- 定时任务接口 SchedulerTaskSvc.java
- 定时任务抽象实现类 AbstractSchedulerTaskSvc.java
- 定时任务接口-采集器实现类-多线程版 SchedulerAcquisitionSvcImpl.java
- 定时服务关联任务bean SchedulerTaskBean.java
- 计划任务Controller CmsSchedulerAct.java
- 持久对象基类 BaseCmsScheduler.java
- 持久对象 CmsScheduler.java
- HBM文件 CmsScheduler.hbm.xml
- ==============================定时任务模块相关辅助类====================================
- 计划框架
- 计划框架-任务调度 Scheduler.java
- 计划框架-时间生成器接口 ScheduleIterator.java
- 计划任务抽象类 SchedulerTask.java
- 计划框架-时间生成器接口实现类 SimpleScheduleIterator.java
- 时间计划参数bean ScheduleParamBean.java
- 采集相关
- HTML解析工具类接口 ParseHtmlTool.java
- HTML解析工具,HtmlParser实现类 HtmlParserImpl.java
- 采集参数封装bean ParamBean.java
- 队列 Queue.java
- URL队列 UrlQueue.java
- 接下来是XML配置
- ==============================定时任务模块XML配置====================================
- dao配置
- <bean id="cmsSchedulerDao" class="com.jeecms.cms.dao.assist.impl.CmsSchedulerDaoImpl"/>
- manage配置
- <bean id="cmsSchedulerMng" class="com.jeecms.cms.manager.assist.impl.CmsSchedulerMngImpl"/>
- SERVICE配置
- <bean id="schedulerAcquisitionSvc" class="com.jeecms.cms.service.scheduler.SchedulerAcquisitionSvcImpl"/>
- <bean id="schedulerTaskManageSvc" class="com.jeecms.cms.service.scheduler.SchedulerTaskManageSvcImpl"/>
- 接下来是messages_zh_CN.properties 添加了常量
- ==============================messages_zh_CN.properties====================================
- messages_zh_CN.properties
- 接下来是模板
- ==============================模板====================================
- generate_left.html 有修改
- scheduler/add.html
- scheduler/edit.html
- scheduler/list.html
具体代码如下:
- ==============================SQL====================================
- 1:计划任务表
- /*
- MySQL Data Transfer
- Source Host: localhost
- Source Database: jeecms
- Target Host: localhost
- Target Database: jeecms
- Date: 2011-11-8 11:36:55
- */
- SET FOREIGN_KEY_CHECKS=0;
- -- ----------------------------
- -- Table structure for jc_scheduler
- -- ----------------------------
- -- Scheduled-task table: one row per plan.
- -- `module_type` holds the name of the task service that runs the plan and `associate_id` the id of the item it runs.
- -- `expression` holds six comma-separated fields, read by SchedulerTaskManageSvcImpl in this order:
- -- week of month, day of week, day of month, hour of day, minute, second. Non-numeric fields such as '*' are left unset.
- CREATE TABLE `jc_scheduler` (
- `scheduler_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '任务主键',
- `site_id` int(11) DEFAULT NULL,
- `associate_id` int(11) DEFAULT NULL COMMENT '相关ID',
- `module_type` varchar(100) DEFAULT NULL COMMENT '模块类型',
- `name` varchar(100) DEFAULT NULL COMMENT '任务名称',
- `start_time` datetime DEFAULT NULL COMMENT '开始时间',
- `end_time` datetime DEFAULT NULL COMMENT '结束时间',
- `status` int(1) NOT NULL DEFAULT '0' COMMENT '当前状态(0:静止;1:采集)',
- `expression` varchar(50) NOT NULL COMMENT '计划表达式',
- PRIMARY KEY (`scheduler_id`)
- ) ENGINE=InnoDB AUTO_INCREMENT=10 DEFAULT CHARSET=utf8;
- -- ----------------------------
- -- Records
- -- ----------------------------
- INSERT INTO `jc_scheduler` VALUES ('4', '1', '1', 'schedulerAcquisitionSvc', '测试', '2011-11-07 18:02:30', '2011-11-07 18:04:00', '0', '*,*,*,*,3,0');
- INSERT INTO `jc_scheduler` VALUES ('8', '1', '5', 'schedulerAcquisitionSvc', '测试采集java', '2011-11-08 10:25:15', '2011-11-08 10:27:04', '0', '*,*,*,*,26,0');
- INSERT INTO `jc_scheduler` VALUES ('9', '1', '1', 'schedulerAcquisitionSvc', '测试采集新闻', '2011-11-08 10:37:58', '2011-11-08 10:38:11', '0', '*,*,*,*,38,0');
- ==============================定时任务模块类====================================
- 计划管理DAO接口 CmsSchedulerDao.java
- package com.jeecms.cms.dao.assist;
- import java.util.List;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- import com.jeecms.common.hibernate3.Updater;
- /**
- * DAO interface for persisting scheduled tasks ({@link CmsScheduler}).
- * @author javacoo
- * @since 2011-11-07
- */
- public interface CmsSchedulerDao {
- /** Returns every scheduled task. */
- public List<CmsScheduler> getList();
- /**
- * Returns the scheduled tasks matching the given example bean.
- * CmsSchedulerDaoImpl filters on module type and site id when both are set on the bean.
- */
- public List<CmsScheduler> getListBy(CmsScheduler bean);
- /** Looks up a scheduled task by its id. */
- public CmsScheduler findById(Integer id);
- /** Persists a new scheduled task and returns it. */
- public CmsScheduler save(CmsScheduler bean);
- /** Applies the changes carried by the updater and returns the updated task. */
- public CmsScheduler updateByUpdater(Updater<CmsScheduler> updater);
- /** Deletes the scheduled task with the given id and returns the entity that was looked up for deletion. */
- public CmsScheduler deleteById(Integer id);
- }
- 计划管理DAO接口实现类 CmsSchedulerDaoImpl.java
- package com.jeecms.cms.dao.assist.impl;
- import java.util.List;
- import org.apache.commons.lang.StringUtils;
- import org.springframework.stereotype.Repository;
- import com.jeecms.cms.dao.assist.CmsSchedulerDao;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- import com.jeecms.common.hibernate3.Finder;
- import com.jeecms.common.hibernate3.HibernateBaseDao;
- @Repository
- public class CmsSchedulerDaoImpl extends
- HibernateBaseDao<CmsScheduler, Integer> implements CmsSchedulerDao {
- @SuppressWarnings("unchecked")
- public List<CmsScheduler> getList() {
- Finder f = Finder.create("from CmsScheduler bean order by bean.id asc");
- return find(f);
- }
- @SuppressWarnings("unchecked")
- public List<CmsScheduler> getListBy(CmsScheduler bean) {
- Finder f = Finder.create("from CmsScheduler bean");
- if(StringUtils.isNotEmpty(bean.getModuleType()) && bean.getSite().getId() != null) {
- f.append(" where bean.moduleType=:moduleType and bean.site.id=:siteId");
- f.setParam("moduleType", bean.getModuleType());
- f.setParam("siteId", bean.getSite().getId());
- }
- f.append(" order by bean.id asc");
- return find(f);
- }
- public CmsScheduler findById(Integer id) {
- CmsScheduler entity = get(id);
- return entity;
- }
- public CmsScheduler save(CmsScheduler bean) {
- getSession().save(bean);
- return bean;
- }
- public CmsScheduler deleteById(Integer id) {
- CmsScheduler entity = super.get(id);
- if (entity != null) {
- getSession().delete(entity);
- }
- return entity;
- }
- @Override
- protected Class<CmsScheduler> getEntityClass() {
- return CmsScheduler.class;
- }
- }
- 计划任务管理服务接口 CmsSchedulerMng.java
- package com.jeecms.cms.manager.assist;
- import java.util.List;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- /**
- * Management service interface for scheduled tasks.
- * @author javacoo
- * @since 2011-11-07
- * @version 1.0
- */
- public interface CmsSchedulerMng {
- /**
- * Returns all scheduled tasks.
- * @return all scheduled tasks
- */
- List<CmsScheduler> getList();
- /**
- * Returns the scheduled tasks of the site and module carried by the given bean.
- * @param bean scheduled-task bean holding the site and module type to filter on
- * @return the matching scheduled tasks
- */
- List<CmsScheduler> getListBy(CmsScheduler bean);
- /**
- * Looks up a scheduled task by id.
- * @param id task id
- * @return the scheduled task
- */
- CmsScheduler findById(Integer id);
- /**
- * Stops the given scheduled task (CmsSchedulerMngImpl flips a started task to the stopped status).
- * @param id task id
- */
- void stop(Integer id);
- /**
- * Starts the given scheduled task (CmsSchedulerMngImpl marks it started, stamps the start time and clears the end time).
- * @param id task id
- * @return the started task, or null when CmsSchedulerMngImpl finds no task with that id
- */
- CmsScheduler start(Integer id);
- /**
- * Ends the given scheduled task (CmsSchedulerMngImpl marks it stopped and stamps the end time).
- * @param id task id
- */
- void end(Integer id);
- /**
- * Saves a new scheduled task.
- * @param bean task to save
- * @return the saved task
- */
- CmsScheduler save(CmsScheduler bean);
- /**
- * Updates a scheduled task.
- * @param bean task carrying the new values
- * @return the updated task
- */
- CmsScheduler update(CmsScheduler bean);
- /**
- * Deletes a scheduled task.
- * @param id id of the task to delete
- * @return the deleted task
- */
- CmsScheduler deleteById(Integer id);
- /**
- * Deletes several scheduled tasks.
- * @param ids ids of the tasks to delete
- * @return the deleted tasks, in the order of the given ids
- */
- CmsScheduler[] deleteByIds(Integer[] ids);
- }
- 计划任务管理服务接口实现类 CmsSchedulerMngImpl.java
- package com.jeecms.cms.manager.assist.impl;
- import java.util.Date;
- import java.util.List;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Service;
- import org.springframework.transaction.annotation.Transactional;
- import com.jeecms.cms.dao.assist.CmsSchedulerDao;
- import com.jeecms.cms.entity.assist.CmsAcquisition;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- import com.jeecms.cms.manager.assist.CmsSchedulerMng;
- import com.jeecms.common.hibernate3.Updater;
- /**
-  * Management service implementation for scheduled tasks.
-  * <p>
-  * State changes are made on the entity returned by the DAO inside the class-level
-  * transaction; no explicit update call is issued here.
-  * @author javacoo
-  * @since 2011-11-07
-  * @version 1.0
-  */
- @Service
- @Transactional
- public class CmsSchedulerMngImpl implements CmsSchedulerMng{
-     private CmsSchedulerDao dao;
-     @Autowired
-     public void setDao(CmsSchedulerDao dao) {
-         this.dao = dao;
-     }
-     /** Returns all scheduled tasks. */
-     @Transactional(readOnly = true)
-     public List<CmsScheduler> getList() {
-         return dao.getList();
-     }
-     /** Returns the scheduled tasks matching the given example bean. */
-     @Transactional(readOnly = true)
-     public List<CmsScheduler> getListBy(CmsScheduler bean) {
-         return dao.getListBy(bean);
-     }
-     /** Looks up a scheduled task by id. */
-     @Transactional(readOnly = true)
-     public CmsScheduler findById(Integer id) {
-         return dao.findById(id);
-     }
-     /**
-      * Flips a started task back to the stopped status; does nothing when the
-      * task does not exist or is not currently started.
-      */
-     public void stop(Integer id) {
-         CmsScheduler scheduler = findById(id);
-         if (scheduler == null) {
-             return;
-         }
-         if (scheduler.getStatus() == CmsScheduler.START) {
-             scheduler.setStatus(CmsScheduler.STOP);
-         }
-     }
-     /**
-      * Marks the task started, stamps the start time and clears the end time.
-      * @return the task, or null when no task has the given id
-      */
-     public CmsScheduler start(Integer id) {
-         CmsScheduler scheduler = findById(id);
-         if (scheduler == null) {
-             return scheduler;
-         }
-         // Use the scheduler's own status constants, as stop() does, instead of
-         // borrowing the ones declared on CmsAcquisition.
-         scheduler.setStatus(CmsScheduler.START);
-         scheduler.setStartTime(new Date());
-         scheduler.setEndTime(null);
-         return scheduler;
-     }
-     /**
-      * Marks the task stopped and stamps the end time; does nothing when the
-      * task does not exist.
-      */
-     public void end(Integer id) {
-         CmsScheduler scheduler = findById(id);
-         if (scheduler == null) {
-             return;
-         }
-         scheduler.setStatus(CmsScheduler.STOP);
-         scheduler.setEndTime(new Date());
-     }
-     /** Initialises and saves a new scheduled task. */
-     public CmsScheduler save(CmsScheduler bean) {
-         bean.init();
-         dao.save(bean);
-         return bean;
-     }
-     /** Updates a scheduled task through an {@link Updater}. */
-     public CmsScheduler update(CmsScheduler bean) {
-         Updater<CmsScheduler> updater = new Updater<CmsScheduler>(bean);
-         return dao.updateByUpdater(updater);
-     }
-     /** Deletes the scheduled task with the given id. */
-     public CmsScheduler deleteById(Integer id) {
-         return dao.deleteById(id);
-     }
-     /**
-      * Deletes the scheduled tasks with the given ids.
-      * @return the deleted tasks, positionally matching {@code ids}
-      */
-     public CmsScheduler[] deleteByIds(Integer[] ids) {
-         CmsScheduler[] beans = new CmsScheduler[ids.length];
-         for (int i = 0, len = ids.length; i < len; i++) {
-             beans[i] = deleteById(ids[i]);
-         }
-         return beans;
-     }
- }
- 定时任务管理接口 SchedulerTaskManageSvc.java
- package com.jeecms.cms.service.scheduler;
- import java.util.List;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- /**
- * Management interface for running scheduled tasks.
- * @author javacoo
- * @since 2011-11-07
- */
- public interface SchedulerTaskManageSvc {
- /**
- * Starts a scheduled task.
- * @param scheduler task to start
- * @return true/false
- */
- boolean start(CmsScheduler scheduler);
- /**
- * Stops a scheduled task.
- * @param scheduler task to stop
- * @return true/false
- */
- boolean stop(CmsScheduler scheduler);
- /**
- * Returns the items that can be associated with the given scheduled task.
- * @param scheduler task whose module type selects the task service
- * @return list of associable items
- */
- List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler);
- }
- 定时任务管理接口实现类 SchedulerTaskManageSvcImpl.java
- package com.jeecms.cms.service.scheduler;
- import java.util.List;
- import java.util.Map;
- import java.util.concurrent.ConcurrentHashMap;
- import org.apache.commons.lang.StringUtils;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Service;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- import com.jeecms.common.scheduling.core.Scheduler;
- import com.jeecms.common.scheduling.core.SchedulerTask;
- import com.jeecms.common.scheduling.impl.ScheduleParamBean;
- import com.jeecms.common.scheduling.impl.SimpleScheduleIterator;
- /**
-  * Scheduled-task management service implementation.
-  * <p>
-  * Looks up the {@link SchedulerTaskSvc} registered under the plan's module type,
-  * schedules it according to the plan expression and remembers the running plan by
-  * id so that it can be cancelled later.
-  * @author javacoo
-  * @since 2011-11-07
-  */
- @Service
- public class SchedulerTaskManageSvcImpl implements SchedulerTaskManageSvc {
-     /** Number of comma-separated fields a plan expression must have. */
-     private static final int EXPRESSION_FIELD_COUNT = 6;
-     /** Running task managers keyed by plan id. */
-     private static Map<Integer,TaskManage> taskManageMap = new ConcurrentHashMap<Integer, TaskManage>();
-     /** Task services keyed by name; the plan's module type is used as the key. */
-     @Autowired
-     private Map<String,SchedulerTaskSvc> schedulerTaskSvcMap;
-     /**
-      * Couples one plan with its own {@link Scheduler} and the service that executes it.
-      * @author javacoo
-      * @since 2011-11-07
-      */
-     private class TaskManage{
-         /** Scheduler driving this plan. */
-         private final Scheduler scheduler = new Scheduler();
-         /** Time parameters parsed from the plan expression. */
-         private ScheduleParamBean scheduleParamBean;
-         /** Service executed on every firing. */
-         private final SchedulerTaskSvc schedulerTaskSvc;
-         private final CmsScheduler cmsScheduler;
-         public TaskManage(SchedulerTaskSvc schedulerSvc,CmsScheduler cmsScheduler){
-             this.schedulerTaskSvc = schedulerSvc;
-             this.cmsScheduler = cmsScheduler;
-         }
-         /**
-          * Parses the plan expression into {@link #scheduleParamBean}.
-          * The expression must consist of six comma-separated fields (week of month,
-          * day of week, day of month, hour of day, minute, second); a field that is
-          * not a number, e.g. {@code *}, is left unset.
-          * @return true when the expression is well-formed
-          */
-         private boolean parseSchedulerParam(){
-             scheduleParamBean = new ScheduleParamBean();
-             String expression = cmsScheduler.getExpression();
-             System.out.println("计划表达式:"+expression);
-             if (StringUtils.isEmpty(expression) || !expression.contains(",")) {
-                 return false;
-             }
-             String[] fields = expression.split(",");
-             if (fields.length != EXPRESSION_FIELD_COUNT) {
-                 return false;
-             }
-             try {
-                 if (isNumber(fields[0])) {
-                     scheduleParamBean.setWeekOfMonth(Integer.valueOf(fields[0]));
-                 }
-                 if (isNumber(fields[1])) {
-                     scheduleParamBean.setDayOfWeek(Integer.valueOf(fields[1]));
-                 }
-                 if (isNumber(fields[2])) {
-                     scheduleParamBean.setDayOfMonth(Integer.valueOf(fields[2]));
-                 }
-                 if (isNumber(fields[3])) {
-                     scheduleParamBean.setHourOfDay(Integer.valueOf(fields[3]));
-                 }
-                 if (isNumber(fields[4])) {
-                     scheduleParamBean.setMinute(Integer.valueOf(fields[4]));
-                 }
-                 if (isNumber(fields[5])) {
-                     scheduleParamBean.setSecond(Integer.valueOf(fields[5]));
-                 }
-             } catch (NumberFormatException e) {
-                 // All digits but outside the int range: treat as a malformed expression.
-                 return false;
-             }
-             return true;
-         }
-         /**
-          * Schedules the plan.
-          * @return false when the plan expression could not be parsed and nothing was scheduled
-          */
-         public boolean start() {
-             if (!parseSchedulerParam()) {
-                 return false;
-             }
-             scheduler.schedule(new SchedulerTask() {
-                 public void run() {
-                     System.out.println("============开始执行计划任务=================");
-                     schedulerTaskSvc.start(cmsScheduler);
-                 }
-             }, new SimpleScheduleIterator(scheduleParamBean));
-             return true;
-         }
-         /**
-          * Stops the task service and cancels the scheduler.
-          */
-         public void cancel() {
-             schedulerTaskSvc.stop(cmsScheduler);
-             scheduler.cancel();
-         }
-     }
-     /**
-      * Tells whether an expression field carries a number.
-      * commons-lang 2.x {@code StringUtils.isNumeric("")} answers true, so the empty
-      * string has to be rejected explicitly before it reaches {@code Integer.valueOf}.
-      */
-     private static boolean isNumber(String field) {
-         return StringUtils.isNotEmpty(field) && StringUtils.isNumeric(field);
-     }
-     /**
-      * Starts executing a plan.
-      * @param scheduler plan to start
-      * @return true when the plan was scheduled; false when no task service is
-      *         registered for its module type or its expression is malformed
-      */
-     public boolean start(CmsScheduler scheduler) {
-         SchedulerTaskSvc schedulerSvc = getSchedulerTaskSvcByModuleType(scheduler.getModuleType());
-         if (schedulerSvc == null) {
-             return false;
-         }
-         TaskManage taskManage = new TaskManage(schedulerSvc,scheduler);
-         if (!taskManage.start()) {
-             return false;
-         }
-         // Cancel a still-registered run of the same plan; otherwise its timer would
-         // keep firing with no way left to reach and stop it.
-         TaskManage previous = taskManageMap.put(scheduler.getId(), taskManage);
-         if (previous != null) {
-             previous.cancel();
-         }
-         return true;
-     }
-     /**
-      * Stops executing a plan and forgets it.
-      * @param scheduler plan to stop
-      * @return true when a running plan was cancelled; false when the plan is not running here
-      */
-     public boolean stop(CmsScheduler scheduler) {
-         TaskManage taskManage = taskManageMap.remove(scheduler.getId());
-         if (taskManage == null) {
-             return false;
-         }
-         taskManage.cancel();
-         return true;
-     }
-     /**
-      * Returns the items that can be associated with the plan, as reported by the
-      * task service of its module type.
-      * @param scheduler plan whose module type selects the task service
-      * @return the associable items; empty when no task service matches the module type
-      */
-     public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler) {
-         SchedulerTaskSvc schedulerSvc = getSchedulerTaskSvcByModuleType(scheduler.getModuleType());
-         if (schedulerSvc == null) {
-             return new java.util.ArrayList<SchedulerTaskBean>();
-         }
-         return schedulerSvc.associateTaskList(scheduler);
-     }
-     /**
-      * Returns the task service registered under the given module type.
-      * @param moduleType module type
-      * @return the task service, or null when none is registered under that name
-      */
-     private SchedulerTaskSvc getSchedulerTaskSvcByModuleType(String moduleType){
-         return schedulerTaskSvcMap.get(moduleType);
-     }
- }
- 定时任务接口 SchedulerTaskSvc.java
- package com.jeecms.cms.service.scheduler;
- import java.util.List;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- /**
- * Interface of a task that can be run on a schedule.
- * @author javacoo
- * @since 2011-11-04
- */
- public interface SchedulerTaskSvc {
- /**
- * Starts the scheduled task.
- * @param cmsScheduler task to start
- * @return true/false
- */
- boolean start(CmsScheduler cmsScheduler);
- /**
- * Stops the scheduled task.
- * @param cmsScheduler task to stop
- * @return true/false
- */
- boolean stop(CmsScheduler cmsScheduler);
- /**
- * Returns the items that can be associated with the given scheduled task.
- * @param cmsScheduler scheduled task
- * @return list of associable items
- */
- List<SchedulerTaskBean> associateTaskList(CmsScheduler cmsScheduler);
- }
- 定时任务抽象实现类 AbstractSchedulerTaskSvc.java
- package com.jeecms.cms.service.scheduler;
- import java.util.List;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- /**
-  * Base class for scheduled-task services: subclasses only supply {@link #execute}.
-  * @author javacoo
-  * @since 2011-11-08
-  */
- public abstract class AbstractSchedulerTaskSvc implements SchedulerTaskSvc{
-     /**
-      * Starts the scheduled task by delegating to {@link #execute}.
-      * @return the result of {@link #execute}
-      */
-     public boolean start(CmsScheduler scheduler){
-         return execute(scheduler);
-     }
-     /**
-      * Stops the scheduled task; the default implementation does nothing.
-      * @return always true
-      */
-     public boolean stop(CmsScheduler scheduler){
-         return true;
-     }
-     /**
-      * Returns the items that can be associated with the task; the default
-      * implementation has none.
-      * @return an empty list (never null, so callers can iterate it directly)
-      */
-     public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler){
-         return new java.util.ArrayList<SchedulerTaskBean>();
-     }
-     /**
-      * Performs the actual work of the scheduled task.
-      * @param scheduler task being executed
-      * @return true/false
-      */
-     protected abstract boolean execute(CmsScheduler scheduler);
- }
- 定时任务接口-采集器实现类-多线程版 SchedulerAcquisitionSvcImpl.java
- package com.jeecms.cms.service.scheduler;
- import java.io.IOException;
- import java.net.URI;
- import java.net.URISyntaxException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.concurrent.CountDownLatch;
- import java.util.concurrent.ExecutorService;
- import java.util.concurrent.Executors;
- import org.apache.commons.lang.StringUtils;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpHost;
- import org.apache.http.HttpResponse;
- import org.apache.http.StatusLine;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.HttpClient;
- import org.apache.http.client.HttpResponseException;
- import org.apache.http.client.ResponseHandler;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.conn.params.ConnRoutePNames;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.util.EntityUtils;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Service;
- import com.jeecms.cms.entity.assist.CmsAcquisition;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- import com.jeecms.cms.entity.main.Content;
- import com.jeecms.cms.manager.assist.CmsAcquisitionMng;
- import com.jeecms.common.crawler.UrlQueue;
- import com.jeecms.common.crawler.util.HtmlParserImpl;
- import com.jeecms.common.crawler.util.ParseHtmlTool;
- /**
-  * Scheduled-task implementation that runs a content acquisition with several threads.
-  * <p>
-  * Every execution starts one coordinator thread. The coordinator queues the plan
-  * URLs of the acquisition, lets one worker collect the links found on those pages
-  * and then lets {@code THREAD_NUM} workers fetch the linked pages.
-  * @author javacoo
-  * @since 2011-11-02
-  * @version 1.0
-  */
- @Service
- public class SchedulerAcquisitionSvcImpl extends AbstractSchedulerTaskSvc {
-     private static final Logger log = LoggerFactory.getLogger(SchedulerAcquisitionSvcImpl.class);
-     /** Number of content-fetching worker threads. */
-     private static final int THREAD_NUM = 2;
-     /** Pause of each worker between two requests, in milliseconds. */
-     private static final int SLEEP_TIME = 100;
-     /** Map key of the link. */
-     private static final String LINK_KEY = "linkKey";
-     /** Map key of the title. */
-     private static final String TITLE_KEY = "titleKey";
-     // NOTE(review): the proxy and the base URL below are the values hard-coded in the
-     // original code; they look environment-specific and should probably become configuration.
-     /** Proxy host every request is routed through. */
-     private static final String PROXY_HOST = "128.160.64.5";
-     /** Proxy port every request is routed through. */
-     private static final int PROXY_PORT = 1235;
-     /** Prefix put in front of links that are not absolute http(s) URLs. */
-     private static final String RELATIVE_URL_BASE = "http://localhost/v7/";
-     /** Acquisition management service. */
-     private CmsAcquisitionMng cmsAcquisitionMng;
-     /** HttpClient of the current coordinator thread. */
-     private static ThreadLocal<HttpClient> httpClientThreadLocal = new ThreadLocal<HttpClient>();
-     /** ParseHtmlTool of the current coordinator thread. */
-     private static ThreadLocal<ParseHtmlTool> parseHtmlToolThreadLocal = new ThreadLocal<ParseHtmlTool>();
-     /** Queue of content links of the current coordinator thread. */
-     private static ThreadLocal<UrlQueue> urlQueueThreadLocal = new ThreadLocal<UrlQueue>();
-     /** Queue of plan URLs of the current coordinator thread. */
-     private static ThreadLocal<UrlQueue> planUrlQueueThreadLocal = new ThreadLocal<UrlQueue>();
-     @Autowired
-     public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
-         this.cmsAcquisitionMng = cmsAcquisitionMng;
-     }
-     /**
-      * Starts the acquisition referenced by the task's associate id on a new thread.
-      * @param scheduler task being executed
-      * @return false when the referenced acquisition does not exist, true once the thread is started
-      */
-     @Override
-     protected boolean execute(CmsScheduler scheduler) {
-         CmsAcquisition acqu = cmsAcquisitionMng.findById(scheduler.getAssociateId());
-         if (acqu == null) {
-             return false;
-         }
-         System.out.println("===============开始执行采集任务");
-         new Thread(new MainThreadProcesser(acqu)).start();
-         return true;
-     }
-     /**
-      * Returns the acquisitions of the task's site as associable items.
-      * @return one bean (id and name) per acquisition
-      */
-     public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler){
-         List<CmsAcquisition> list = cmsAcquisitionMng.getList(scheduler.getSite().getId());
-         List<SchedulerTaskBean> resultList = new ArrayList<SchedulerTaskBean>();
-         for(CmsAcquisition acquisition : list){
-             SchedulerTaskBean schedulerTaskBean = new SchedulerTaskBean();
-             schedulerTaskBean.setId(acquisition.getId());
-             schedulerTaskBean.setName(acquisition.getName());
-             resultList.add(schedulerTaskBean);
-         }
-         return resultList;
-     }
-     /**
-      * Coordinator of one acquisition run.
-      * @author javacoo
-      * @since 2011-11-02
-      */
-     private class MainThreadProcesser implements Runnable {
-         private final CmsAcquisition acqu;
-         /**
-          * Opened by the link-collecting worker when it is done. A latch cannot miss
-          * the signal the way a bare wait()/notifyAll() pair does when the collector
-          * finishes before the content workers start waiting.
-          */
-         private final CountDownLatch urlsCollected = new CountDownLatch(1);
-         public MainThreadProcesser(CmsAcquisition acqu) {
-             this.acqu = acqu;
-         }
-         public void run() {
-             long tStart = System.currentTimeMillis();
-             System.out.println("主线程:"+Thread.currentThread().getName() + "开始...");
-             ExecutorService exec = Executors.newCachedThreadPool();
-             try {
-                 // One link collector plus THREAD_NUM content workers count down, so the
-                 // latch must be sized for all of them; otherwise the HttpClient is shut
-                 // down while a worker is still using it.
-                 CountDownLatch latch = new CountDownLatch(THREAD_NUM + 1);
-                 HttpClient httpClient = getHttpClient();
-                 httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new HttpHost(PROXY_HOST, PROXY_PORT));
-                 CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());
-                 // Queue every plan URL of the acquisition.
-                 getAllPlans(acqu,getPlanUrlQueue());
-                 // One worker collects the links found on the plan pages.
-                 exec.execute(new FetchUrlThread(latch,httpClient,getPlanUrlQueue(),getUrlQueue(),getParseHtmlTool(acqu),handler,urlsCollected));
-                 // THREAD_NUM workers fetch the collected links.
-                 for(int i=0;i<THREAD_NUM;i++){
-                     exec.execute(new FetchContentThread(acqu,latch,httpClient,getUrlQueue(),getParseHtmlTool(acqu),handler,urlsCollected));
-                 }
-                 latch.await();
-             } catch (InterruptedException e) {
-                 log.warn("Acquisition coordinator interrupted", e);
-                 Thread.currentThread().interrupt();
-             } finally{
-                 exec.shutdown();
-                 HttpClient client = httpClientThreadLocal.get();
-                 if (client != null) {
-                     client.getConnectionManager().shutdown();
-                 }
-                 httpClientThreadLocal.remove();
-                 parseHtmlToolThreadLocal.remove();
-                 urlQueueThreadLocal.remove();
-                 planUrlQueueThreadLocal.remove();
-                 long tEnd = System.currentTimeMillis();
-                 System.out.println("主线程:"+Thread.currentThread().getName() + "结束...");
-                 System.out.println("主线程:"+Thread.currentThread().getName() + "总共用时:" + (tEnd - tStart) + "ms");
-             }
-         }
-     }
-     /**
-      * Worker that downloads every plan page and queues the links found on it.
-      * @author javacoo
-      * @since 2011-11-04
-      */
-     private class FetchUrlThread implements Runnable{
-         private final CountDownLatch latch;
-         private final UrlQueue urlQueue;
-         private final UrlQueue planUrlQueue;
-         private final HttpClient httpClient;
-         private final ParseHtmlTool parseHtmlTool;
-         private final CharsetHandler handler;
-         private final CountDownLatch urlsCollected;
-         public FetchUrlThread(CountDownLatch latch,HttpClient httpClient,UrlQueue planUrlQueue,UrlQueue urlQueue,ParseHtmlTool parseHtmlTool,CharsetHandler handler,CountDownLatch urlsCollected){
-             this.latch = latch;
-             this.urlQueue = urlQueue;
-             this.planUrlQueue = planUrlQueue;
-             this.httpClient = httpClient;
-             this.parseHtmlTool = parseHtmlTool;
-             this.handler = handler;
-             this.urlsCollected = urlsCollected;
-         }
-         public void run() {
-             System.out.println("======================采集URL子线程:"+Thread.currentThread().getName() + "开始...");
-             try {
-                 Map<String,String> urlMap;
-                 while((urlMap = pollUrlAndTitleMap(planUrlQueue)) != null) {
-                     try {
-                         getAllUrls(httpClient,parseHtmlTool,handler,urlQueue,urlMap);
-                     } catch (URISyntaxException e) {
-                         // A single bad plan URL must not abort the remaining plans.
-                         log.warn("Invalid plan URL " + urlMap.get(LINK_KEY), e);
-                     } catch (IOException e) {
-                         log.warn("Failed to collect links from " + urlMap.get(LINK_KEY), e);
-                     }
-                     Thread.sleep(SLEEP_TIME);
-                 }
-             } catch (InterruptedException e) {
-                 log.warn("Link collector interrupted", e);
-                 Thread.currentThread().interrupt();
-             }finally {
-                 System.out.println("======================采集URL子线程:"+Thread.currentThread().getName() + "结束.");
-                 // Release the content workers, then report completion to the coordinator.
-                 urlsCollected.countDown();
-                 latch.countDown();
-             }
-         }
-     }
-     /**
-      * Worker that fetches the queued content links.
-      * @author javacoo
-      * @since 2011-11-02
-      */
-     private class FetchContentThread implements Runnable {
-         private final CmsAcquisition acqu;
-         private final CountDownLatch latch;
-         private final UrlQueue urlQueue;
-         private final HttpClient httpClient;
-         private final ParseHtmlTool parseHtmlTool;
-         private final CharsetHandler handler;
-         private final CountDownLatch urlsCollected;
-         public FetchContentThread(CmsAcquisition acqu,CountDownLatch latch,HttpClient httpClient,UrlQueue urlQueue,ParseHtmlTool parseHtmlTool,CharsetHandler handler,CountDownLatch urlsCollected) {
-             this.acqu = acqu;
-             this.latch = latch;
-             this.urlQueue = urlQueue;
-             this.httpClient = httpClient;
-             this.parseHtmlTool = parseHtmlTool;
-             this.handler = handler;
-             this.urlsCollected = urlsCollected;
-         }
-         public void run() {
-             System.out.println("======================采集内容子线程:"+Thread.currentThread().getName() + "开始...");
-             try {
-                 // Wait until the link collector has finished.
-                 urlsCollected.await();
-                 Map<String,String> urlMap;
-                 while((urlMap = pollUrlAndTitleMap(urlQueue)) != null) {
-                     saveContent(acqu,httpClient,parseHtmlTool,handler,urlMap);
-                     Thread.sleep(SLEEP_TIME);
-                 }
-             } catch (InterruptedException e) {
-                 log.warn("Content worker interrupted", e);
-                 Thread.currentThread().interrupt();
-             } catch (Exception e) {
-                 log.warn("Content worker failed", e);
-             } finally {
-                 System.out.println("======================采集内容子线程:"+Thread.currentThread().getName() + "结束.");
-                 log.info("Acquisition#{} complete", acqu.getId());
-                 latch.countDown();
-             }
-         }
-     }
-     /**
-      * Returns the HttpClient bound to the calling thread, creating it on first use.
-      * @return the calling thread's HttpClient
-      */
-     private static HttpClient getHttpClient(){
-         HttpClient client = httpClientThreadLocal.get();
-         if(client == null){
-             client = new DefaultHttpClient();
-             httpClientThreadLocal.set(client);
-         }
-         return client;
-     }
-     /**
-      * Returns the content-link queue bound to the calling thread, creating it on first use.
-      * @return the calling thread's content-link queue
-      */
-     private static UrlQueue getUrlQueue(){
-         UrlQueue urlQueue = urlQueueThreadLocal.get();
-         if(urlQueue == null){
-             urlQueue = new UrlQueue();
-             urlQueueThreadLocal.set(urlQueue);
-         }
-         return urlQueue;
-     }
-     /**
-      * Returns the plan-URL queue bound to the calling thread, creating it on first use.
-      * @return the calling thread's plan-URL queue
-      */
-     private static UrlQueue getPlanUrlQueue(){
-         UrlQueue urlQueue = planUrlQueueThreadLocal.get();
-         if(urlQueue == null){
-             urlQueue = new UrlQueue();
-             planUrlQueueThreadLocal.set(urlQueue);
-         }
-         return urlQueue;
-     }
-     /**
-      * Returns the ParseHtmlTool bound to the calling thread, creating it on first use.
-      * @param acqu acquisition the tool is created for
-      * @return the calling thread's ParseHtmlTool
-      */
-     private static ParseHtmlTool getParseHtmlTool(CmsAcquisition acqu){
-         ParseHtmlTool parseHtmlTool = parseHtmlToolThreadLocal.get();
-         if(parseHtmlTool == null){
-             parseHtmlTool = new HtmlParserImpl(acqu);
-             parseHtmlToolThreadLocal.set(parseHtmlTool);
-         }
-         return parseHtmlTool;
-     }
-     /**
-      * Puts a link/title map into the queue.
-      * @param map link/title map
-      * @param urlQueue queue to add to
-      */
-     private synchronized void addUrlAndTitleMap(Map<String,String> map,UrlQueue urlQueue){
-         System.out.println("====线程:"+Thread.currentThread().getName() + ",添加 urlQueue:"+urlQueue);
-         urlQueue.addUnVisitedUrl(map);
-     }
-     /**
-      * Takes the next link/title map from the queue. The emptiness check and the
-      * removal happen under one lock so that two workers cannot both pass the
-      * check and then compete for the last element.
-      * @param urlQueue queue to take from
-      * @return the next link/title map, or null when the queue is empty
-      */
-     private synchronized Map<String,String> pollUrlAndTitleMap(UrlQueue urlQueue){
-         System.out.println("====线程:"+Thread.currentThread().getName() + ",取得 urlQueue:"+urlQueue);
-         if (urlQueue.isEmpty()) {
-             return null;
-         }
-         return urlQueue.unVisitedUrlDeQueue();
-     }
-     /**
-      * Queues every plan URL of the acquisition.
-      * @param acqu acquisition whose plans are queued
-      * @param urlQueue queue to add to
-      */
-     private void getAllPlans(CmsAcquisition acqu,UrlQueue urlQueue){
-         String[] plans = acqu.getAllPlans();
-         for (int i = plans.length - 1; i >= 0; i--) {
-             // A fresh map per plan: reusing one map made every queued entry
-             // point at the URL written last.
-             Map<String,String> planMap = new HashMap<String,String>();
-             planMap.put(LINK_KEY, plans[i]);
-             planMap.put(TITLE_KEY, acqu.getName());
-             addUrlAndTitleMap(planMap,urlQueue);
-         }
-         System.out.println("=======当前线程:"+Thread.currentThread().getName() + "计划URL连接数:"+urlQueue.getUnVisitedUrlNum());
-     }
-     /**
-      * Downloads one plan page and queues every link/title map the parser finds on it.
-      * @param httpClient client used for the request
-      * @param parseHtmlTool parser extracting the links
-      * @param handler response handler decoding the page
-      * @param urlQueue queue the found links are added to
-      * @param map link/title map of the plan page
-      * @throws URISyntaxException when the plan link is not a valid URI
-      * @throws ClientProtocolException on an HTTP protocol error
-      * @throws IOException when the request fails
-      */
-     private void getAllUrls(HttpClient httpClient,ParseHtmlTool parseHtmlTool,CharsetHandler handler,UrlQueue urlQueue,Map<String,String> map) throws URISyntaxException, ClientProtocolException, IOException{
-         HttpGet httpGet = new HttpGet(new URI(map.get(LINK_KEY).trim()));
-         String html = httpClient.execute(httpGet, handler);
-         for(Map<String,String> planMap : parseHtmlTool.getUrlAndTitleMap(html)){
-             addUrlAndTitleMap(planMap,urlQueue);
-         }
-         System.out.println("=======当前线程:"+Thread.currentThread().getName() + "URL连接数:"+urlQueue.getUnVisitedUrlNum());
-     }
-     /**
-      * Fetches one content page and extracts its text.
-      * NOTE(review): persisting the content is disabled in the original code (the
-      * cmsAcquisitionMng.saveContent call is commented out), so this always returns null.
-      * @param acqu acquisition being run
-      * @param httpClient client used for the request
-      * @param parseHtmlTool parser extracting the content
-      * @param handler response handler decoding the page
-      * @param map link/title map of the page
-      * @return always null
-      */
-     private synchronized Content saveContent(CmsAcquisition acqu,HttpClient httpClient,ParseHtmlTool parseHtmlTool,CharsetHandler handler,Map<String,String> map) {
-         try {
-             String link = map.get(LINK_KEY).trim();
-             // Absolute links are used as they are (https included); anything else
-             // is resolved against the base URL.
-             boolean absolute = link.regionMatches(true, 0, "http://", 0, 7)
-                     || link.regionMatches(true, 0, "https://", 0, 8);
-             HttpGet httpGet = new HttpGet(new URI(absolute ? link : RELATIVE_URL_BASE + link));
-             String html = httpClient.execute(httpGet, handler);
-             System.out.println("=============================子线程:"+Thread.currentThread().getName() + "执行");
-             String txt = parseHtmlTool.getHtml(html);
-             //return cmsAcquisitionMng.saveContent(map.get(TITLE_KEY), txt,acqu.getId());
-             return null;
-         } catch (Exception e) {
-             log.warn("Failed to fetch content", e);
-             return null;
-         }
-     }
-     /**
-      * Response handler that decodes the body with a configured charset.
-      * @author Administrator
-      */
-     private class CharsetHandler implements ResponseHandler<String> {
-         private final String charset;
-         public CharsetHandler(String charset) {
-             this.charset = charset;
-         }
-         /**
-          * Returns the response body as text.
-          * @return the body decoded with the configured charset (or the default of
-          *         {@code EntityUtils} when none is configured); null when the response has no entity
-          * @throws HttpResponseException when the status code is 300 or above
-          */
-         public String handleResponse(HttpResponse response)
-                 throws ClientProtocolException, IOException {
-             StatusLine statusLine = response.getStatusLine();
-             if (statusLine.getStatusCode() >= 300) {
-                 throw new HttpResponseException(statusLine.getStatusCode(),
-                         statusLine.getReasonPhrase());
-             }
-             HttpEntity entity = response.getEntity();
-             if (entity == null) {
-                 return null;
-             }
-             if (StringUtils.isBlank(charset)) {
-                 return EntityUtils.toString(entity);
-             }
-             return EntityUtils.toString(entity, charset);
-         }
-     }
- }
- 定时服务关联任务bean SchedulerTaskBean.java
- package com.jeecms.cms.service.scheduler;
- /**
-  * Simple value holder describing an item that a scheduled task can be associated with.
-  * @author javacoo
-  * @since 2011-11-07
-  */
- public class SchedulerTaskBean {
-     /** Primary key of the associated item. */
-     private Integer id;
-     /** Display name of the associated item. */
-     private String name;
-
-     /** Sets the primary key. */
-     public void setId(Integer id) {
-         this.id = id;
-     }
-
-     /** Returns the primary key. */
-     public Integer getId() {
-         return this.id;
-     }
-
-     /** Sets the display name. */
-     public void setName(String name) {
-         this.name = name;
-     }
-
-     /** Returns the display name. */
-     public String getName() {
-         return this.name;
-     }
- }
- 计划任务Controller CmsSchedulerAct.java
- package com.jeecms.cms.action.admin.assist;
- import java.util.List;
- import javax.servlet.http.HttpServletRequest;
- import javax.servlet.http.HttpServletResponse;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.beans.factory.annotation.Autowired;
- import org.springframework.stereotype.Controller;
- import org.springframework.ui.ModelMap;
- import org.springframework.web.bind.annotation.RequestMapping;
- import com.jeecms.cms.entity.assist.CmsAcquisition;
- import com.jeecms.cms.entity.assist.CmsScheduler;
- import com.jeecms.cms.entity.main.CmsSite;
- import com.jeecms.cms.manager.assist.CmsSchedulerMng;
- import com.jeecms.cms.manager.main.CmsLogMng;
- import com.jeecms.cms.service.scheduler.SchedulerTaskManageSvc;
- import com.jeecms.cms.service.scheduler.SchedulerTaskBean;
- import com.jeecms.cms.web.CmsUtils;
- import com.jeecms.cms.web.WebErrors;
- /**
- * 计划任务Controller
- * @author javacoo
- * @since 2011-11-7
- */
- @Controller
- public class CmsSchedulerAct {
- private static final Logger log = LoggerFactory
- .getLogger(CmsSchedulerAct.class);
- /**日志服务*/
- @Autowired
- private CmsLogMng cmsLogMng;
- /**计划管理服务*/
- @Autowired
- private CmsSchedulerMng manager;
- /**计划任务管理服务*/
- @Autowired
- private SchedulerTaskManageSvc schedulerTaskManageSvc;
- @RequestMapping("/scheduler/v_list.do")
- public String list(HttpServletRequest request, ModelMap model) {
- List<CmsScheduler> list = manager.getList();
- model.addAttribute("list", list);
- return "scheduler/list";
- }
- @RequestMapping("/scheduler/v_listBy.do")
- public String listBy(String moduleType,HttpServletRequest request, ModelMap model) {
- CmsSite site = CmsUtils.getSite(request);
- CmsScheduler scheduler = new CmsScheduler();
- scheduler.setModuleType(moduleType);
- scheduler.setSite(site);
- List<CmsScheduler> list = manager.getListBy(scheduler);
- model.addAttribute("list", list);
- model.addAttribute("moduleType", moduleType);
- return "scheduler/list";
- }
- @RequestMapping("/scheduler/v_add.do")
- public String add(String moduleType,HttpServletRequest request, ModelMap model) {
- CmsSite site = CmsUtils.getSite(request);
- CmsScheduler scheduler = new CmsScheduler();
- scheduler.setModuleType(moduleType);
- scheduler.setSite(site);
- List<SchedulerTaskBean> schedulerTaskList = schedulerTaskManageSvc.associateTaskList(scheduler);
- model.addAttribute("schedulerTaskList", schedulerTaskList);
- model.addAttribute("moduleType", moduleType);
- return "scheduler/add";
- }
- @RequestMapping("/scheduler/v_edit.do")
- public String edit(Integer id, HttpServletRequest request, ModelMap model) {
- WebErrors errors = validateEdit(id, request);
- if (errors.hasErrors()) {
- return errors.showErrorPage(model);
- }
- CmsSite site = CmsUtils.getSite(request);
- CmsScheduler scheduler = manager.findById(id);
- scheduler.setSite(site);
- List<SchedulerTaskBean> schedulerTaskList = schedulerTaskManageSvc.associateTaskList(scheduler);
- model.addAttribute("schedulerTaskList", schedulerTaskList);
- model.addAttribute("cmsScheduler", scheduler);
- return "scheduler/edit";
- }
- @RequestMapping("/scheduler/o_save.do")
- public String save(CmsScheduler bean,HttpServletRequest request, ModelMap model) {
- CmsSite site = CmsUtils.getSite(request);
- bean.setSite(site);
- bean = manager.save(bean);
- model.addAttribute("moduleType", bean.getModuleType());
- log.info("save CmsScheduler id={}", bean.getId());
- cmsLogMng.operating(request, "cmsAcquisition.log.save", "id="
- + bean.getId() + ";name=" + bean.getName());
- return "redirect:v_listBy.do";
- }
- @RequestMapping("/scheduler/o_update.do")
- public String update(CmsScheduler bean, HttpServletRequest request, ModelMap model) {
- WebErrors errors = validateUpdate(bean.getId(), request);
- if (errors.hasErrors()) {
- return errors.showErrorPage(model);
- }
- bean = manager.update(bean);
- log.info("update CmsAcquisition id={}.", bean.getId());
- cmsLogMng.operating(request, "cmsAcquisition.log.update", "id="
- + bean.getId() + ";name=" + bean.getName());
- return listBy(bean.getModuleType(),request, model);
- }
- @RequestMapping("/scheduler/o_delete.do")
- public String delete(String moduleType,Integer[] ids, HttpServletRequest request,
- ModelMap model) {
- WebErrors errors = validateDelete(ids, request);
- if (errors.hasErrors()) {
- return errors.showErrorPage(model);
- }
- CmsScheduler[] beans = manager.deleteByIds(ids);
- for (CmsScheduler bean : beans) {
- log.info("delete CmsAcquisition id={}", bean.getId());
- cmsLogMng.operating(request, "cmsScheduler.log.delete", "id="
- + bean.getId() + ";name=" + bean.getName());
- }
- return listBy(moduleType,request, model);
- }
- @RequestMapping("/scheduler/o_start.do")
- public String start(Integer id, HttpServletRequest request,
- HttpServletResponse response, ModelMap model) {
- CmsScheduler scheduler = manager.findById(id);
- schedulerTaskManageSvc.start(scheduler);
- manager.start(id);
- model.addAttribute("moduleType", scheduler.getModuleType());
- log.info("start CmsAcquisition id={}", id);
- return "redirect:v_listBy.do";
- }
- @RequestMapping("/scheduler/o_end.do")
- public String end(Integer id, HttpServletRequest request,
- HttpServletResponse response, ModelMap model) {
- manager.end(id);
- CmsScheduler scheduler = manager.findById(id);
- schedulerTaskManageSvc.stop(scheduler);
- model.addAttribute("moduleType", scheduler.getModuleType());
- log.info("end CmsScheduler id={}", id);
- return "redirect:v_listBy.do";
- }
- private WebErrors validateEdit(Integer id, HttpServletRequest request) {
- WebErrors errors = WebErrors.create(request);
- CmsSite site = CmsUtils.getSite(request);
- if (vldExist(id, site.getId(), errors)) {
- return errors;
- }
- return errors;
- }
- private WebErrors validateUpdate(Integer id, HttpServletRequest request) {
- WebErrors errors = WebErrors.create(request);
- CmsSite site = CmsUtils.getSite(request);
- if (vldExist(id, site.getId(), errors)) {
- return errors;
- }
- return errors;
- }
- private WebErrors validateDelete(Integer[] ids, HttpServletRequest request) {
- WebErrors errors = WebErrors.create(request);
- CmsSite site = CmsUtils.getSite(request);
- if (errors.ifEmpty(ids, "ids")) {
- return errors;
- }
- for (Integer id : ids) {
- vldExist(id, site.getId(), errors);
- }
- return errors;
- }
- private boolean vldExist(Integer id, Integer siteId, WebErrors errors) {
- if (errors.ifNull(id, "id")) {
- return true;
- }
- CmsScheduler entity = manager.findById(id);
- if (errors.ifNotExist(entity, CmsAcquisition.class, id)) {
- return true;
- }
- return false;
- }
- }
- 持久对象基类 BaseCmsScheduler.java
- package com.jeecms.cms.entity.assist.base;
- import java.io.Serializable;
- import java.util.Date;
- /**
-  * Base class holding the persistent state of a schedule: id, name, start/end time,
-  * status, associated record id, module type, schedule expression and owning site.
-  * Concrete behaviour lives in the subclass.
-  */
- public abstract class BaseCmsScheduler implements Serializable {
-     // Entity and property names. Declared final so that these shared constants
-     // cannot be reassigned at runtime.
-     public static final String REF = "CmsScheduler";
-     public static final String PROP_ID = "id";
-     public static final String PROP_SITE = "site";
-     public static final String PROP_ASSOCIATE_ID = "associateId";
-     public static final String PROP_MODULE_TYPE = "moduleType";
-     public static final String PROP_NAME = "name";
-     public static final String PROP_START_TIME = "startTime";
-     public static final String PROP_END_TIME = "endTime";
-     public static final String PROP_STATUS = "status";
-     public static final String PROP_EXPRESSION = "expression";
-     /** Cached hash value exposed through {@link #getHashCode()}; starts at Integer.MIN_VALUE. */
-     private int hashCode = Integer.MIN_VALUE;
-     /** Primary key. */
-     private java.lang.Integer id;
-     private java.lang.String name;
-     private java.util.Date startTime;
-     private java.util.Date endTime;
-     private java.lang.Integer status;
-     private java.lang.Integer associateId;
-     private java.lang.String moduleType;
-     private java.lang.String expression;
-     private com.jeecms.cms.entity.main.CmsSite site;
-     /** Creates an empty instance and runs {@link #initialize()}. */
-     public BaseCmsScheduler () {
-         initialize();
-     }
-     /**
-      * Constructor for primary key; runs {@link #initialize()} after setting the id.
-      *
-      * @param id primary key
-      */
-     public BaseCmsScheduler (java.lang.Integer id) {
-         this.setId(id);
-         initialize();
-     }
-     /**
-      * Creates a fully populated instance. Unlike the other constructors this one does
-      * not call {@link #initialize()}.
-      */
-     public BaseCmsScheduler(Integer id,String name, Date startTime, Date endTime,
-             Integer status, Integer associateId, String moduleType, String expression,com.jeecms.cms.entity.main.CmsSite site) {
-         super();
-         this.id = id;
-         this.name = name;
-         this.startTime = startTime;
-         this.endTime = endTime;
-         this.status = status;
-         this.associateId = associateId;
-         this.moduleType = moduleType;
-         this.expression = expression;
-         this.site = site;
-     }
-     /** Initialisation hook for subclasses; does nothing here. */
-     protected void initialize () {}
-     public int getHashCode() {
-         return hashCode;
-     }
-     public void setHashCode(int hashCode) {
-         this.hashCode = hashCode;
-     }
-     public java.lang.Integer getId() {
-         return id;
-     }
-     public void setId(java.lang.Integer id) {
-         this.id = id;
-     }
-     public java.lang.String getName() {
-         return name;
-     }
-     public void setName(java.lang.String name) {
-         this.name = name;
-     }
-     public java.util.Date getStartTime() {
-         return startTime;
-     }
-     public void setStartTime(java.util.Date startTime) {
-         this.startTime = startTime;
-     }
-     public java.util.Date getEndTime() {
-         return endTime;
-     }
-     public void setEndTime(java.util.Date endTime) {
-         this.endTime = endTime;
-     }
-     public java.lang.Integer getStatus() {
-         return status;
-     }
-     public void setStatus(java.lang.Integer status) {
-         this.status = status;
-     }
-     public java.lang.Integer getAssociateId() {
-         return associateId;
-     }
-     public void setAssociateId(java.lang.Integer associateId) {
-         this.associateId = associateId;
-     }
-     public java.lang.String getModuleType() {
-         return moduleType;
-     }
-     public void setModuleType(java.lang.String moduleType) {
-         this.moduleType = moduleType;
-     }
-     public java.lang.String getExpression() {
-         return expression;
-     }
-     public void setExpression(java.lang.String expression) {
-         this.expression = expression;
-     }
-     public com.jeecms.cms.entity.main.CmsSite getSite() {
-         return site;
-     }
-     public void setSite(com.jeecms.cms.entity.main.CmsSite site) {
-         this.site = site;
-     }
- }
- 持久对象 CmsScheduler.java
- package com.jeecms.cms.entity.assist;
- import java.util.Date;
- import com.jeecms.cms.entity.assist.base.BaseCmsScheduler;
- /**
- * 计划持久对象
- * @author javacoo
- * @since 2011-11-07
- */
- public class CmsScheduler extends BaseCmsScheduler {
- private static final long serialVersionUID = 1L;
- /**
- * 停止状态
- */
- public static final int STOP = 0;
- /**
- * 采集状态
- */
- public static final int START = 1;
- /**
- * 是否停止
- *
- * @return
- */
- public boolean isStop() {
- int status = getStatus();
- return status == 0;
- }
- public void init() {
- if (getStatus() == null) {
- setStatus(STOP);
- }
- }
- public CmsScheduler(){
- super();
- }
- public CmsScheduler(java.lang.Integer id){
- super(id);
- }
- public CmsScheduler(Integer id,String name, Date startTime, Date endTime,
- Integer status, Integer associateId, String moduleType, String expression,com.jeecms.cms.entity.main.CmsSite site) {
- super(id,name,startTime,endTime,status,associateId,moduleType,expression,site);
- }
- }
- HBM文件 CmsScheduler.hbm.xml
- <?xml version="1.0"?>
- <!DOCTYPE hibernate-mapping PUBLIC "-//Hibernate/Hibernate Mapping DTD//EN" "http://hibernate.sourceforge.net/hibernate-mapping-3.0.dtd">
- <!-- Hibernate mapping of the schedule entity CmsScheduler to table jc_scheduler. -->
- <hibernate-mapping package="com.jeecms.cms.entity.assist">
- <class name="CmsScheduler" table="jc_scheduler">
- <meta attribute="sync-DAO">false</meta>
- <!-- Primary key: column scheduler_id, generated with the "identity" strategy. -->
- <id name="id" type="integer" column="scheduler_id"><generator class="identity"/></id>
- <!-- Required columns (not-null="true"): associate_id, status, expression. -->
- <property name="associateId" column="associate_id" type="integer" not-null="true" length="11"/>
- <property name="moduleType" column="module_type" type="string" not-null="false" length="100"/>
- <property name="name" column="name" type="string" not-null="false" length="100"/>
- <property name="startTime" column="start_time" type="timestamp" not-null="false" length="19"/>
- <property name="endTime" column="end_time" type="timestamp" not-null="false" length="19"/>
- <property name="status" column="status" type="integer" not-null="true" length="1"/>
- <property name="expression" column="expression" type="string" not-null="true" length="50"/>
- <!-- Owning site: mandatory many-to-one reference through column site_id. -->
- <many-to-one name="site" column="site_id" class="com.jeecms.cms.entity.main.CmsSite" not-null="true"></many-to-one>
- </class>
- </hibernate-mapping>
- ==============================定时任务模块相关互助类====================================
- 计划框架
- 计划框架-任务调度 Scheduler.java
- package com.jeecms.common.scheduling.core;
- import java.util.Date;
- import java.util.Timer;
- import java.util.TimerTask;
- /**
-  * Scheduling framework: the task scheduler.
-  * <p>
-  * Every Scheduler owns one {@link Timer} that provides the underlying timing. It chains
-  * one-shot timer tasks so that a {@link SchedulerTask} runs at each of the times
-  * produced by a {@link ScheduleIterator}.
-  *
-  * @author javacoo
-  * @since 2011-11-02
-  */
- public class Scheduler {
-     /** Timer that executes all tasks of this scheduler. */
-     private final Timer timer = new Timer();
-     /**
-      * One-shot timer task that runs the wrapped SchedulerTask once and then asks the
-      * scheduler to plan the next execution.
-      *
-      * @author javacoo
-      * @since 2011-11-02
-      */
-     class SchedulerTimerTask extends TimerTask {
-         private final SchedulerTask schedulerTask;
-         private final ScheduleIterator iterator;
-         public SchedulerTimerTask(SchedulerTask schedulerTask,
-                 ScheduleIterator iterator) {
-             this.schedulerTask = schedulerTask;
-             this.iterator = iterator;
-         }
-         @Override
-         public void run() {
-             try {
-                 schedulerTask.run();
-             } catch (RuntimeException e) {
-                 // An exception escaping a TimerTask terminates the Timer thread, which
-                 // would silently stop every task of this scheduler. Report the failure
-                 // through the thread's uncaught-exception handler and keep scheduling.
-                 Thread current = Thread.currentThread();
-                 current.getUncaughtExceptionHandler().uncaughtException(current, e);
-             }
-             reschedule(schedulerTask, iterator);
-         }
-     }
-     public Scheduler() {
-     }
-     /**
-      * Cancels the underlying timer; no task of this scheduler is started afterwards.
-      */
-     public void cancel() {
-         timer.cancel();
-     }
-     /**
-      * Entry point for scheduling a task.
-      * <p>
-      * Asks the iterator for the first execution time and registers a one-shot timer
-      * task for it. When that timer task fires it runs the SchedulerTask and then uses
-      * the same iterator to plan the following execution. A {@code null} time from the
-      * iterator cancels the task immediately.
-      *
-      * @param schedulerTask task to execute; must not have been scheduled or cancelled before
-      * @param iterator      source of the execution times
-      * @throws IllegalStateException if the task was already scheduled or cancelled
-      */
-     public void schedule(SchedulerTask schedulerTask, ScheduleIterator iterator) {
-         Date time = iterator.next();
-         if (time == null) {
-             schedulerTask.cancel();
-             return;
-         }
-         synchronized (schedulerTask.lock) {
-             if (schedulerTask.state != SchedulerTask.VIRGIN) {
-                 throw new IllegalStateException("任务已经执行/取消");
-             }
-             schedulerTask.state = SchedulerTask.SCHEDULED;
-             schedulerTask.timerTask = new SchedulerTimerTask(schedulerTask, iterator);
-             timer.schedule(schedulerTask.timerTask, time);
-         }
-     }
-     /**
-      * Plans the next execution of a task that has just run.
-      *
-      * @param schedulerTask task to run again
-      * @param iterator      source of the execution times; {@code null} from next() cancels the task
-      */
-     private void reschedule(SchedulerTask schedulerTask,
-             ScheduleIterator iterator) {
-         Date time = iterator.next();
-         if (time == null) {
-             schedulerTask.cancel();
-             return;
-         }
-         synchronized (schedulerTask.lock) {
-             if (schedulerTask.state != SchedulerTask.CANCELLED) {
-                 SchedulerTimerTask nextRun = new SchedulerTimerTask(schedulerTask, iterator);
-                 try {
-                     timer.schedule(nextRun, time);
-                     schedulerTask.timerTask = nextRun;
-                 } catch (IllegalStateException e) {
-                     // The timer was cancelled (see cancel()) while the task was running:
-                     // nothing can be scheduled any more, so mark the task as cancelled.
-                     schedulerTask.cancel();
-                 }
-             }
-         }
-     }
- }
- 计划框架-时间生成器接口 ScheduleIterator.java
- package com.jeecms.common.scheduling.core;
- import java.util.Date;
- /**
- * Scheduling framework: execution-time generator.
- * <p>
- * Describes the planned execution times of a SchedulerTask as a series of
- * java.util.Date objects. next() yields the dates in chronological order; a null
- * return value cancels the task, i.e. it will never run again.
- * @author javacoo
- * @since 2011-11-02
- */
- public interface ScheduleIterator {
- /**
- * Returns the next planned execution time.
- * @return the next planned execution time, or null to cancel the task
- */
- Date next();
- }
- 计划任务抽象类 SchedulerTask.java
- package com.jeecms.common.scheduling.core;
- import java.util.TimerTask;
- /**
-  * Base class for tasks executed by the scheduling framework.
-  * <p>
-  * A task moves through three states. It starts as VIRGIN (never scheduled), becomes
-  * SCHEDULED once it has been handed to a Scheduler, and ends as CANCELLED after
-  * {@link #cancel()}. Every operation that may change the state synchronizes on the
-  * task's lock object so that, for example, a task cannot be scheduled twice.
-  *
-  * @author javacoo
-  * @since 2011-11-02
-  */
- public abstract class SchedulerTask implements Runnable {
-     /** State: never scheduled. */
-     static final int VIRGIN = 0;
-     /** State: scheduled. */
-     static final int SCHEDULED = 1;
-     /** State: cancelled. */
-     static final int CANCELLED = 2;
-     /** Monitor guarding {@code state} and {@code timerTask}. */
-     final Object lock = new Object();
-     /** Current life-cycle state. */
-     int state = VIRGIN;
-     /** Timer task of the pending execution, if any. */
-     TimerTask timerTask;
-     protected SchedulerTask() {
-     }
-     /** The work to perform; supplied by subclasses. */
-     public abstract void run();
-     /**
-      * Cancels the task. It will not be started again, but an execution that is already
-      * in progress runs to completion.
-      *
-      * @return {@code true} if the task was in the SCHEDULED state when cancelled
-      */
-     public boolean cancel() {
-         synchronized (lock) {
-             final boolean wasScheduled = state == SCHEDULED;
-             final TimerTask pending = timerTask;
-             if (pending != null) {
-                 pending.cancel();
-             }
-             state = CANCELLED;
-             return wasScheduled;
-         }
-     }
-     /**
-      * @return the scheduled execution time reported by the current timer task,
-      *         or 0 when no timer task has been assigned yet
-      */
-     public long scheduledExecutionTime() {
-         synchronized (lock) {
-             if (timerTask == null) {
-                 return 0;
-             }
-             return timerTask.scheduledExecutionTime();
-         }
-     }
- }
- 计划框架-时间生成器接口实现类 SimpleScheduleIterator.java
- package com.jeecms.common.scheduling.impl;
- import java.util.Calendar;
- import java.util.Date;
- import java.util.GregorianCalendar;
- import com.jeecms.common.scheduling.core.ScheduleIterator;
- /**
- * 计划框架-时间生成器接口实现类
- * <li>返回 月/周/天/小时/分钟/秒 计划的下一次执行时间</li>
- * <li>约定:参数以逗号分隔,*号表示无值</li>
- * <li>参数解释:
- * <br>第一位:每个月的第几周</br>
- * <br>第二位:每周的第几天</br>
- * <br>第三位:天(几号)</br>
- * <br>第四位:小时(24小时制)</br>
- * <br>第五位:分钟</br>
- * <br>第六位:秒</br>
- * </li>
- * <li>参数样例:
- * <br> 1,6,4,15,20,30 表示 从今天的15:20:30开始,每隔一个月执行一次,即下次执行时间是 下个月的第一周的第6天的15:20:30</br>
- * <br> *,6,4,15,20,30 表示 从今天的15:20:30开始,每隔一周执行一次,即下次执行时间是 下一周的第6天的15:20:30</br>
- * <br> *,*,4,15,20,30 表示 从今天的15:20:30开始,每隔一天执行一次,即下次执行时间是 下一天的15:20:30</br>
- * <br> *,*,*,15,20,30 表示 从今天的15:20:30开始,每隔一小时执行一次,即下次执行时间是 16:20:30</br>
- * <br> *,*,*,*,20,30 表示 从这个小时的20:30开始,每隔一分钟执行一次,即下次执行时间是 *:21:30</br>
- * <br> *,*,*,*,*,30 表示 从当前时间的30秒开始,每隔一秒执行一次,即下次执行时间是 *:*:31</br>
- * </li>
- * @author javacoo
- * @since 2011-11-03
- */
- public class SimpleScheduleIterator implements ScheduleIterator {
- private final ScheduleParamBean scheduleParamBean;
- private final Calendar calendar = Calendar.getInstance();
- private final Calendar orginCalendar = Calendar.getInstance();
- public SimpleScheduleIterator(final ScheduleParamBean scheduleParamBean) {
- this(scheduleParamBean, new Date());
- }
- public SimpleScheduleIterator(final ScheduleParamBean scheduleParamBean, Date date) {
- this.scheduleParamBean = scheduleParamBean;
- orginCalendar.setTime(date);
- calendar.setTime(date);
- if(null != scheduleParamBean.getWeekOfMonth()){
- calendar.set(Calendar.WEEK_OF_MONTH, scheduleParamBean.getWeekOfMonth());
- }
- //如果设置了每周的第几天和一个月的第几天,则忽略一个月的第几天
- if(null != scheduleParamBean.getDayOfWeek()){
- calendar.set(Calendar.DAY_OF_WEEK, scheduleParamBean.getDayOfWeek());
- }else if(null != scheduleParamBean.getDayOfMonth()){
- calendar.set(Calendar.DAY_OF_MONTH, scheduleParamBean.getDayOfMonth());
- }
- if(null != scheduleParamBean.getHourOfDay()){
- calendar.set(Calendar.HOUR_OF_DAY, scheduleParamBean.getHourOfDay());
- }
- if(null != scheduleParamBean.getMinute()){
- calendar.set(Calendar.MINUTE, scheduleParamBean.getMinute());
- }
- if(null != scheduleParamBean.getSecond()){
- calendar.set(Calendar.SECOND, scheduleParamBean.getSecond());
- }
- calendar.set(Calendar.MILLISECOND, 0);
- //如果设置时间 大于当前时间
- if (!calendar.getTime().before(date)) {
- System.out.println(calendar.getTime() +"大于当前时间:"+date);
- if(null != scheduleParamBean.getWeekOfMonth()){
- calendar.add(Calendar.MONTH, -1);
- }else if(null != scheduleParamBean.getDayOfWeek()){
- calendar.add(Calendar.DAY_OF_WEEK, -6);
- }else if(null != scheduleParamBean.getDayOfMonth()){
- calendar.add(Calendar.DAY_OF_MONTH, -1);
- }else if(null != scheduleParamBean.getHourOfDay()){
- calendar.add(Calendar.HOUR_OF_DAY, -1);
- }else if(null != scheduleParamBean.getMinute()){
- calendar.add(Calendar.MINUTE, -1);
- }else if(null != scheduleParamBean.getSecond()){
- calendar.add(Calendar.SECOND, -1);
- }
- }else{//如果小于,则会一下执行多次,所以在天,小时,分钟,秒 都加上相应时间差
- System.out.println(calendar.getTime() +"小于当前时间:"+date);
- if(null != scheduleParamBean.getDayOfMonth()){
- calendar.add(Calendar.DAY_OF_MONTH, orginCalendar.get(Calendar.DAY_OF_MONTH) - scheduleParamBean.getDayOfMonth());
- }else if(null != scheduleParamBean.getHourOfDay()){
- calendar.add(Calendar.HOUR_OF_DAY, orginCalendar.get(Calendar.HOUR_OF_DAY) - scheduleParamBean.getHourOfDay());
- }else if(null != scheduleParamBean.getMinute()){
- calendar.add(Calendar.MINUTE, orginCalendar.get(Calendar.MINUTE) - scheduleParamBean.getMinute());
- }else if(null != scheduleParamBean.getSecond()){
- calendar.add(Calendar.SECOND, orginCalendar.get(Calendar.SECOND) - scheduleParamBean.getSecond());
- }
- }
- }
- public Date next() {
- if(null != scheduleParamBean.getWeekOfMonth()){
- calendar.add(Calendar.MONTH, 1);
- }else if(null != scheduleParamBean.getDayOfWeek()){
- calendar.add(Calendar.DAY_OF_WEEK, 6);
- }else if(null != scheduleParamBean.getDayOfMonth()){
- calendar.add(Calendar.DAY_OF_MONTH, 1);
- }else if(null != scheduleParamBean.getHourOfDay()){
- calendar.add(Calendar.HOUR_OF_DAY, 1);
- }else if(null != scheduleParamBean.getMinute()){
- calendar.add(Calendar.MINUTE, 1);
- }else if(null != scheduleParamBean.getSecond()){
- calendar.add(Calendar.SECOND, 1);
- }
- System.out.println("下次执行时间:"+calendar.getTime());
- return calendar.getTime();
- }
- }
- 时间计划参数bean ScheduleParamBean.java
- package com.jeecms.common.scheduling.impl;
- /**
- * 时间计划参数bean
- * @author javacoo
- * @since 2011-11-04
- */
- public class ScheduleParamBean {
- /**每个月的第几周,每周的第几天,每个月的第几天,小时(24小时制),分钟,秒*/
- private Integer weekOfMonth,dayOfWeek,dayOfMonth,hourOfDay, minute, second;
- public ScheduleParamBean(){
- }
- public ScheduleParamBean(Integer weekOfMonth, Integer dayOfWeek,
- Integer dayOfMonth, Integer hourOfDay, Integer minute,
- Integer second) {
- super();
- this.weekOfMonth = weekOfMonth;
- this.dayOfWeek = dayOfWeek;
- this.dayOfMonth = dayOfMonth;
- this.hourOfDay = hourOfDay;
- this.minute = minute;
- this.second = second;
- }
- public Integer getWeekOfMonth() {
- return weekOfMonth;
- }
- public void setWeekOfMonth(Integer weekOfMonth) {
- this.weekOfMonth = weekOfMonth;
- }
- public Integer getDayOfWeek() {
- return dayOfWeek;
- }
- public void setDayOfWeek(Integer dayOfWeek) {
- this.dayOfWeek = dayOfWeek;
- }
- public Integer getDayOfMonth() {
- return dayOfMonth;
- }
- public void setDayOfMonth(Integer dayOfMonth) {
- this.dayOfMonth = dayOfMonth;
- }
- public Integer getHourOfDay() {
- return hourOfDay;
- }
- public void setHourOfDay(Integer hourOfDay) {
- this.hourOfDay = hourOfDay;
- }
- public Integer getMinute() {
- return minute;
- }
- public void setMinute(Integer minute) {
- this.minute = minute;
- }
- public Integer getSecond() {
- return second;
- }
- public void setSecond(Integer second) {
- this.second = second;
- }
- @Override
- public String toString() {
- return "ScheduleParamBean [dayOfMonth=" + dayOfMonth + ", dayOfWeek="
- + dayOfWeek + ", hourOfDay=" + hourOfDay + ", minute=" + minute
- + ", second=" + second + ", weekOfMonth=" + weekOfMonth + "]";
- }
- }
- 采集相关
- HTML解析工具类接口 ParseHtmlTool.java
- package com.jeecms.common.crawler.util;
- import java.util.List;
- import java.util.Map;
- /**
- * Contract for the HTML parsing helper used by the content collector.
- * @author javacoo
- * @since 2011-10-31
- */
- public interface ParseHtmlTool {
- /**
- * Returns the links found in the page.
- * @param orginHtml original HTML
- * @return list of links
- */
- List<String> getUrlList( String orginHtml);
- /**
- * Returns the titles found in the page.
- * @param orginHtml original HTML
- * @return list of titles
- */
- List<String> getTitleList(String orginHtml);
- /**
- * Returns the HTML of the configured region of the page.
- * @param orginHtml original HTML
- * @return HTML of the configured region
- */
- String getHtml(String orginHtml);
- /**
- * Returns the link/title pairs found in the page.
- * @param orginHtml original HTML
- * @return list of maps, each describing one link and its title
- */
- List<Map<String,String>> getUrlAndTitleMap(String orginHtml);
- }
- HTML解析工具,HtmlParser实现类 HtmlParserImpl.java
- package com.jeecms.common.crawler.util;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.io.InputStreamReader;
- import java.net.URISyntaxException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.List;
- import java.util.Map;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.commons.lang.StringUtils;
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.nodes.RemarkNode;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- import com.jeecms.cms.entity.assist.CmsAcquisition;
- import com.jeecms.common.crawler.ParamBean;
- /**
- * HTML解析工具,HtmlParser实现类
- * @author javacoo
- * @since 2011-10-31
- */
- public class HtmlParserImpl implements ParseHtmlTool{
- /**连接集合标志*/
- private static String LINK_KEY = "linkKey";
- /**标题集合标志*/
- private static String TITLE_KEY = "titleKey";
- /**单标签标志*/
- private static String SINGLE_TAG = "singleTag";
- /**连接正则表达式*/
- private static String LINK_REGX = "<a.*href=\"(.*?)\".*>(.*?)</a>";
- /**正则表达式对象*/
- private Pattern pt = Pattern.compile(LINK_REGX);
- /**采集参数bean*/
- private ParamBean paramBean;
- /**
- * Creates a parser whose selectors are taken from the given acquisition rule.
- * @param acqu acquisition rule; its link-set and content start/end settings are parsed into the internal ParamBean
- */
- public HtmlParserImpl(CmsAcquisition acqu){
- parseRequestParam(acqu);
- }
- /**
-  * Collects the titles of all links inside the configured link area.
-  * @param orginHtml original HTML
-  * @return the titles, or {@code null} when the link area is empty
-  */
- public List<String> getTitleList(String orginHtml) {
-     final String linkArea = getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(), orginHtml);
-     if (StringUtils.isEmpty(linkArea)) {
-         return null;
-     }
-     return getUrlOrTitleListByType(linkArea, TITLE_KEY);
- }
- /**
-  * Collects the href values of all links inside the configured link area.
-  * @param orginHtml original HTML
-  * @return the links, or {@code null} when the link area is empty
-  */
- public List<String> getUrlList(String orginHtml) {
-     final String linkArea = getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(), orginHtml);
-     if (StringUtils.isEmpty(linkArea)) {
-         return null;
-     }
-     return getUrlOrTitleListByType(linkArea, LINK_KEY);
- }
- /**
-  * Returns the HTML of the configured content area.
-  * @param orginHtml original HTML
-  * @return HTML selected by the content-start settings with the content-end matches removed
-  */
- public String getHtml(String orginHtml) {
-     return getHtmlByFilter(paramBean.getContentStartMap(), paramBean.getContentEndMap(), orginHtml);
- }
- /**
- * Returns the link/title pairs of the configured link area.
- * Delegates to the private getUrlAandTitleMap helper.
- * @param orginHtml original HTML
- * @return one map per link, holding its href and its title
- */
- public List<Map<String,String>> getUrlAndTitleMap(String orginHtml){
- return getUrlAandTitleMap(orginHtml);
- }
- /**
- * 解析采集参数,并封装到ParamBean
- * @param acqu 原始采集参数
- * @return 采集参数封装bean
- */
- private void parseRequestParam(CmsAcquisition acqu){
- paramBean = new ParamBean();
- if(!StringUtils.isEmpty(acqu.getLinksetStart())){
- paramBean.setLinksetStartMap(populateParamMap(acqu.getLinksetStart()));
- }
- if(!StringUtils.isEmpty(acqu.getLinksetEnd())){
- paramBean.setLinksetEndMap(populateParamMap(acqu.getLinksetEnd()));
- }
- if(!StringUtils.isEmpty(acqu.getContentStart())){
- paramBean.setContentStartMap(populateParamMap(acqu.getContentStart()));
- }
- if(!StringUtils.isEmpty(acqu.getContentEnd())){
- paramBean.setContentEndMap(populateParamMap(acqu.getContentEnd()));
- }
- }
- /**
- * 得到连接标题MAP
- * @param html html内容
- * @return 连接或者标题集合
- */
- private List<Map<String,String>> getUrlAandTitleMap(String html) {
- html = getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(),html);
- List<Map<String,String>> resultMapList = new ArrayList<Map<String,String>>();
- Map<String,String> resultMap = null;
- Matcher m = pt.matcher(html);
- while (m.find()) {
- if(StringUtils.isNotEmpty(m.group(1)) && StringUtils.isNotEmpty(m.group(2))){
- resultMap = new HashMap<String, String>();
- resultMap.put(LINK_KEY, m.group(1));
- resultMap.put(TITLE_KEY, m.group(2));
- resultMapList.add(resultMap);
- }
- }
- return resultMapList;
- }
- /**
- * 得到地址集
- * @param html html内容
- * @param type 1 :取得连接集合,2:取得标题集合
- * @return 连接或者标题集合
- */
- private List<String> getUrlOrTitleListByType(String html, String type) {
- List<String> resultList = new ArrayList<String>();
- Matcher m = pt.matcher(html);
- String result = "";
- int pos = 1;
- if(TITLE_KEY.equals(type)){
- pos = 2;
- }
- while (m.find()) {
- result = m.group(pos);
- resultList.add(result);
- }
- return resultList;
- }
- /**
-  * Selects a region of a page and strips unwanted parts from it.
-  * <ol>
-  * <li>Concatenates the HTML of every node matched by the selectors in {@code tagMap}.</li>
-  * <li>Removes from that HTML every node matched by the selectors in {@code removeTagMap}.</li>
-  * <li>Removes HTML comments.</li>
-  * </ol>
-  * A map value may list several alternatives separated by {@code |}. A {@code null} map
-  * is treated as "no selectors" - the maps stay unset when the acquisition rule leaves
-  * the corresponding setting empty.
-  * @param tagMap selectors of the nodes to keep; may be {@code null}
-  * @param removeTagMap selectors of the nodes to remove; may be {@code null}
-  * @param orginHtml original HTML; {@code null} yields an empty result
-  * @return the resulting HTML, or an empty string when parsing fails
-  */
- private String getHtmlByFilter(Map<String, String> tagMap,
-         Map<String, String> removeTagMap, String orginHtml) {
-     if (orginHtml == null) {
-         return "";
-     }
-     try {
-         Parser parser = new Parser();
-         parser.setInputHTML(orginHtml);
-         // Step 1: collect the HTML of all nodes matching the "keep" selectors.
-         StringBuilder selected = new StringBuilder();
-         if (tagMap != null) {
-             for (Map.Entry<String, String> entry : tagMap.entrySet()) {
-                 for (String value : entry.getValue().split("\\|")) {
-                     appendHtmlByFilter(parser, populateFilter(entry.getKey(), value), selected);
-                 }
-             }
-         }
-         // Step 2: remove everything matching the "remove" selectors.
-         String contentHtml = selected.toString();
-         if (removeTagMap != null) {
-             for (Map.Entry<String, String> entry : removeTagMap.entrySet()) {
-                 for (String value : entry.getValue().split("\\|")) {
-                     contentHtml = removeHtmlByFilter(parser, populateFilter(entry.getKey(), value), contentHtml);
-                 }
-             }
-         }
-         // Step 3: remove HTML comments.
-         contentHtml = removeHtmlByFilter(parser, new NodeClassFilter(RemarkNode.class), contentHtml);
-         System.out.println("=================================结果=======================================");
-         System.out.println(contentHtml);
-         return contentHtml;
-     } catch (ParserException e) {
-         // Best effort: report the failure and fall through to the empty result.
-         e.printStackTrace();
-     }
-     return "";
- }
- /**
-  * Parses a comma separated selector string into a parameter map. Attribute selectors
-  * and tag names may be mixed:
-  * <li>attribute=value form, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN</li>
-  * <li>tag-name form, e.g. div,p,span</li>
-  * <li>mixed form, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span</li>
-  * Attribute selectors are stored as attribute name to value; all tag names are joined
-  * with {@code |} and stored under SINGLE_TAG. Only the first {@code =} of an item
-  * separates name and value, so values may themselves contain {@code =}. Items with an
-  * empty name or an empty value (such as {@code class=}) are ignored instead of
-  * raising an ArrayIndexOutOfBoundsException.
-  * @param paramStr selector string
-  * @return the parsed parameter map (never {@code null})
-  */
- private Map<String, String> populateParamMap(String paramStr) {
-     Map<String, String> paramMap = new HashMap<String, String>();
-     StringBuilder tagNames = new StringBuilder();
-     for (String item : paramStr.split(",")) {
-         int eq = item.indexOf('=');
-         if (eq >= 0) {
-             String name = item.substring(0, eq);
-             String value = item.substring(eq + 1);
-             if (StringUtils.isNotEmpty(name) && StringUtils.isNotEmpty(value)) {
-                 paramMap.put(name, value);
-             }
-         } else if (StringUtils.isNotEmpty(item)) {
-             if (tagNames.length() > 0) {
-                 tagNames.append("|");
-             }
-             tagNames.append(item);
-         }
-     }
-     if (tagNames.length() > 0) {
-         paramMap.put(SINGLE_TAG, tagNames.toString());
-     }
-     return paramMap;
- }
- /**
-  * Builds the node filter for one selector.
-  * @param key SINGLE_TAG for a tag-name selector, otherwise the attribute name
-  * @param value tag name, or the attribute value to match
-  * @return a TagNameFilter for tag-name selectors, otherwise a HasAttributeFilter
-  */
- private NodeFilter populateFilter(String key,String value) {
-     if (SINGLE_TAG.equals(key)) {
-         return new TagNameFilter(value);
-     }
-     return new HasAttributeFilter(key, value);
- }
- /**
- * 过滤指定属性标签HTML
- * @param parser 解析器
- * @param filter 属性过滤器
- * @param orginHtml 原始HTML
- * @return 过滤后HTML
- * @throws ParserException
- */
- private String removeHtmlByFilter(Parser parser, NodeFilter filter,String orginHtml) throws ParserException {
- parser.setInputHTML(orginHtml);
- NodeList nodes = parser.extractAllNodesThatMatch(filter);
- for (int i = 0; i < nodes.size(); i++) {
- Node textnode = (Node) nodes.elementAt(i);
- orginHtml = StringUtils.remove(orginHtml, textnode.toHtml());
- }
- return orginHtml;
- }
- /**
-  * Appends the HTML of every node that matches the filter to the buffer.
-  * <p>
-  * The parser is reset first: extracting nodes consumes the parser's input, so without
-  * the reset only the first of several filters applied to the same parser would find
-  * any nodes.
-  * @param parser parser holding the page
-  * @param filter selects the nodes to collect
-  * @param sb buffer receiving the HTML of the matching nodes
-  * @throws ParserException if the HTML cannot be parsed
-  */
- private void appendHtmlByFilter(Parser parser, NodeFilter filter,
-         StringBuilder sb) throws ParserException {
-     parser.reset();
-     NodeList nodes = parser.extractAllNodesThatMatch(filter);
-     for (int i = 0; i < nodes.size(); i++) {
-         sb.append(nodes.elementAt(i).toHtml());
-     }
- }
- /**
-  * Parses a selector string and adds the result to an existing map. The accepted
-  * format is the one of {@link #populateParamMap(String)}, which does the parsing:
-  * <li>attribute=value form, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN</li>
-  * <li>tag-name form, e.g. div,p,span</li>
-  * <li>mixed form, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span</li>
-  * @param paramMap map that receives the parsed entries
-  * @param paramStr selector string
-  */
- private void populateParamMap(Map<String, String> paramMap,String paramStr) {
-     // Single implementation of the parsing rules; this overload only merges the result.
-     paramMap.putAll(populateParamMap(paramStr));
- }
- /**
-  * Test helper: reads a whole text file.
-  * Every line is terminated with "\n" in the result. The file is closed in all cases,
-  * including when reading fails.
-  * @param szFileName absolute path of the file
-  * @param charset name of the character set used to decode the file
-  * @return the file content, or an empty string when the file cannot be read
-  */
- public static String openFile(String szFileName,String charset) {
-     FileInputStream in = null;
-     try {
-         in = new FileInputStream(new File(szFileName));
-         BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
-         StringBuilder szContent = new StringBuilder();
-         String szTemp;
-         while ((szTemp = reader.readLine()) != null) {
-             szContent.append(szTemp).append("\n");
-         }
-         return szContent.toString();
-     } catch (Exception e) {
-         // Deliberate best effort for this test helper: any failure yields "".
-         return "";
-     } finally {
-         if (in != null) {
-             try {
-                 in.close();
-             } catch (IOException ignored) {
-                 // Nothing useful can be done when closing fails.
-             }
-         }
-     }
- }
- /**
-  * Manual test: prints the links and titles found in the "m_list" area of a local file.
-  * @throws ParserException declared for callers; parsing problems inside getHtmlByFilter are handled there
-  */
- public void testFetchLinkAndTitle() throws ParserException{
-     final String page = openFile("F:\\4.htm","UTF-8");
-     final Map<String, String> keep = new HashMap<String, String>();
-     keep.put("class", "m_list");
-     final Map<String, String> drop = new HashMap<String, String>();
-     //drop.put("class", "atc_ic_f");
-     final String area = getHtmlByFilter(keep, drop, page);
-     System.out.println("=============================result============================");
-     System.out.println(area);
-     System.out.println("==========================================================");
-     final Matcher anchors = Pattern.compile("<a.*href=\"(.*?)\".*>(.*?)</a>").matcher(area);
-     while (anchors.find()) {
-         final String link = anchors.group(1);
-         final String title = anchors.group(2);
-         if (StringUtils.isEmpty(link)) {
-             continue;
-         }
-         System.out.println("url : " + link);
-         System.out.println("title : " + title);
-     }
- }
- /**
- * Manual test: fetches the article content from a sample page.
- * <p>
- * Reads {@code F:\6.shtml} as GB2312 and calls {@code getHtmlByFilter} with
- * {@code id=artibody} as the include attribute and a set of tag names,
- * types, classes, inline styles and ids as the exclude map. The returned
- * markup is discarded.
- *
- * @throws ParserException declared for the filtering step
- */
- public void testFetchContent() throws ParserException{
- String html = openFile("F:\\6.shtml","GB2312");
- // Region to keep.
- Map<String, String> keepAttrs = new HashMap<String, String>();
- keepAttrs.put("id", "artibody");
- // Elements to drop from that region; '|' separates alternative values.
- Map<String, String> dropAttrs = new HashMap<String, String>();
- dropAttrs.put(SINGLE_TAG, "style|script");
- dropAttrs.put("type", "text/javascript");
- dropAttrs.put("class", "icon_fx|blkComment otherContent_01");
- dropAttrs.put("style", "text-align: right;padding-right:10px;|margin-top:6px;|font-size: 12px ! important;|font-size:12px");
- dropAttrs.put("id", "fxwb|fxMSN|fxMSN|comment_t_show_top");
- getHtmlByFilter(keepAttrs,dropAttrs,html);
- }
- /**
- * Manual test: parses a sample filter expression and prints the result.
- * <p>
- * Feeds {@code "class=articleList|tips,p,div"} to {@code populateParamMap}
- * and prints one {@code tempKey}/{@code value} pair for every
- * {@code |}-separated value of every key in the resulting map.
- */
- public void testParseParam(){
- Map<String, String> parsed = new HashMap<String, String>();
- populateParamMap(parsed,"class=articleList|tips,p,div");
- for (Map.Entry<String, String> entry : parsed.entrySet()) {
- String joined = entry.getValue();
- // A value without '|' is treated as a single alternative.
- String[] alternatives = joined.contains("|")
- ? joined.split("\\|")
- : new String[]{joined};
- for (int i = 0; i < alternatives.length; i++) {
- System.out.println("tempKey:" + entry.getKey());
- System.out.println("value:" + alternatives[i]);
- }
- }
- }
- /**
- * Manual test: filters nodes of type {@code RemarkNode} out of a sample page.
- * <p>
- * Reads {@code F:\6.shtml} as GB2312, prints it, passes it through
- * {@code removeHtmlByFilter} with a {@code NodeClassFilter} for
- * {@code RemarkNode}, and prints the returned markup.
- *
- * @throws ParserException declared for the filtering step
- */
- public void testRemarkFilter() throws ParserException{
- final String before = openFile("F:\\6.shtml","GB2312");
- System.out.println("=========================过滤注释前HTML==================================");
- System.out.println(before);
- NodeFilter remarkFilter = new NodeClassFilter(RemarkNode.class);
- final String after = removeHtmlByFilter(new Parser(), remarkFilter, before);
- System.out.println("=========================过滤注释后HTML==================================");
- System.out.println(after);
- }
- /**
- * Manual entry point for the test methods of this class.
- * <p>
- * Builds an {@code HtmlParserImpl} around a fresh {@code CmsAcquisition};
- * un-comment one of the calls below to run the corresponding manual test.
- *
- * @param args unused
- */
- public static void main(String[] args) throws ParserException,
- URISyntaxException, IOException {
- CmsAcquisition blankAcquisition = new CmsAcquisition();
- HtmlParserImpl parseHtmlTool = new HtmlParserImpl(blankAcquisition);
- //parseHtmlTool.testParseParam();
- //parseHtmlTool.testFetchLinkAndTitle();
- //parseHtmlTool.testFetchContent();
- //parseHtmlTool.testRemarkFilter();
- }
- }
- 采集参数封装bean ParamBean.java
- package com.jeecms.common.crawler;
- import java.util.HashMap;
- import java.util.Map;
- /**
- * Bean bundling the four attribute maps used as crawl parameters: the
- * attributes of the link region and of the content region to collect, each
- * paired with a map of attributes to filter out of that region.
- * <p>
- * A new instance starts with four empty maps; the setters store the given
- * reference as-is (no copy, no null check).
- *
- * @author javacoo
- * @since 2011-10-31
- */
- public class ParamBean {
- /** Attribute map of the link region to collect. */
- private Map<String, String> linksetStartMap;
- /** Filter attribute map of the link region to collect. */
- private Map<String, String> linksetEndMap;
- /** Attribute map of the content region to collect. */
- private Map<String, String> contentStartMap;
- /** Filter attribute map of the content region to collect. */
- private Map<String, String> contentEndMap;
- /** Creates a bean whose four maps are empty {@link HashMap}s. */
- public ParamBean() {
- linksetStartMap = new HashMap<String, String>();
- linksetEndMap = new HashMap<String, String>();
- contentStartMap = new HashMap<String, String>();
- contentEndMap = new HashMap<String, String>();
- }
- /** @return the attribute map of the link region */
- public Map<String, String> getLinksetStartMap() {
- return linksetStartMap;
- }
- /** @param linksetStartMap the attribute map of the link region */
- public void setLinksetStartMap(Map<String, String> linksetStartMap) {
- this.linksetStartMap = linksetStartMap;
- }
- /** @return the filter attribute map of the link region */
- public Map<String, String> getLinksetEndMap() {
- return linksetEndMap;
- }
- /** @param linksetEndMap the filter attribute map of the link region */
- public void setLinksetEndMap(Map<String, String> linksetEndMap) {
- this.linksetEndMap = linksetEndMap;
- }
- /** @return the attribute map of the content region */
- public Map<String, String> getContentStartMap() {
- return contentStartMap;
- }
- /** @param contentStartMap the attribute map of the content region */
- public void setContentStartMap(Map<String, String> contentStartMap) {
- this.contentStartMap = contentStartMap;
- }
- /** @return the filter attribute map of the content region */
- public Map<String, String> getContentEndMap() {
- return contentEndMap;
- }
- /** @param contentEndMap the filter attribute map of the content region */
- public void setContentEndMap(Map<String, String> contentEndMap) {
- this.contentEndMap = contentEndMap;
- }
- }
- 队列 Queue.java
- package com.jeecms.common.crawler;
- import java.util.LinkedList;
- /**
- * Minimal first-in, first-out queue backed by a {@link LinkedList}.
- * <p>
- * The class adds no synchronization of its own.
- *
- * @author javacoo
- * @since 2011-11-01
- * @param <T> type of the queued elements
- */
- public class Queue<T> {
- /** Elements in arrival order; the head is the next element to leave. */
- private final LinkedList<T> elements = new LinkedList<T>();
- /**
- * Appends an element to the tail of the queue.
- * @param t element to enqueue
- */
- public void enQueue(T t){
- elements.add(t);
- }
- /**
- * Removes and returns the element at the head of the queue.
- * @return the oldest element
- * @throws java.util.NoSuchElementException if the queue is empty
- */
- public T deQueue(){
- return elements.remove();
- }
- /**
- * Tells whether the queue holds no elements.
- * @return {@code true} when the queue is empty
- */
- public boolean isEmpty(){
- return elements.size() == 0;
- }
- /**
- * Tells whether the queue holds an element equal to {@code t}.
- * @param t element to look for
- * @return {@code true} when an equal element is queued
- */
- public boolean contains(T t){
- return elements.indexOf(t) >= 0;
- }
- /**
- * Returns the number of queued elements.
- * @return current queue size
- */
- public int getSize(){
- return elements.size();
- }
- }
- URL队列 UrlQueue.java
- package com.jeecms.common.crawler;
- import java.util.HashSet;
- import java.util.Map;
- import java.util.Set;
- import org.springframework.util.CollectionUtils;
- /**
- * URL bookkeeping for the crawler: a set of URLs already visited and a FIFO
- * queue of URLs still to visit. Every URL is represented as a
- * {@code Map<String, String>}.
- * <p>
- * Membership tests rely on the maps' {@code equals}/{@code hashCode}.
- * The class adds no synchronization of its own.
- *
- * @author javacoo
- * @since 2011-11-01
- */
- public class UrlQueue {
- /** URLs already visited. */
- private final Set<Map<String, String>> visited = new HashSet<Map<String, String>>();
- /** URLs waiting to be visited. */
- private final Queue<Map<String, String>> pending = new Queue<Map<String, String>>();
- /**
- * Returns the queue of URLs waiting to be visited.
- * @return the live pending queue (not a copy)
- */
- public Queue<Map<String, String>> getUnVisitedUrl() {
- return pending;
- }
- /**
- * Removes and returns the next URL waiting to be visited.
- * @return the head of the pending queue
- */
- public Map<String, String> unVisitedUrlDeQueue() {
- return pending.deQueue();
- }
- /**
- * Enqueues a URL for visiting, so that every URL is visited only once:
- * the URL is ignored when {@code CollectionUtils.isEmpty} reports it as
- * empty, when it is already pending, or when it has already been visited.
- * @param urlMap URL to enqueue
- */
- public void addUnVisitedUrl(Map<String, String> urlMap) {
- if (CollectionUtils.isEmpty(urlMap)) {
- return;
- }
- if (pending.contains(urlMap) || visited.contains(urlMap)) {
- return;
- }
- pending.enQueue(urlMap);
- }
- /**
- * Tells whether no URL is waiting to be visited.
- * @return {@code true} when the pending queue is empty
- */
- public boolean isEmpty(){
- return pending.isEmpty();
- }
- /**
- * Returns the number of URLs waiting to be visited.
- * @return size of the pending queue
- */
- public int getUnVisitedUrlNum(){
- return pending.getSize();
- }
- /**
- * Records a URL as visited.
- * @param urlMap URL that has been visited
- */
- public void addVisitedUrl(Map<String, String> urlMap){
- visited.add(urlMap);
- }
- /**
- * Removes a URL from the visited set.
- * @param urlMap URL to forget
- */
- public void removeVisitedUrl(Map<String, String> urlMap){
- visited.remove(urlMap);
- }
- /**
- * Returns the number of URLs recorded as visited.
- * @return size of the visited set
- */
- public int getVisitedUrlNum(){
- return visited.size();
- }
- }
- 接下来是XML配置
- ==============================定时任务模块XML配置====================================
- dao配置
- <bean id="cmsSchedulerDao" class="com.jeecms.cms.dao.assist.impl.CmsSchedulerDaoImpl"/>
- manage配置
- <bean id="cmsSchedulerMng" class="com.jeecms.cms.manager.assist.impl.CmsSchedulerMngImpl"/>
- SERVICE配置
- <bean id="schedulerAcquisitionSvc" class="com.jeecms.cms.service.scheduler.SchedulerAcquisitionSvcImpl"/>
- <bean id="schedulerTaskManageSvc" class="com.jeecms.cms.service.scheduler.SchedulerTaskManageSvcImpl"/>
- 接下来是messages_zh_CN.properties 添加了常量
- ==============================messages_zh_CN.properties====================================
- cmsScheduler.acquisition.function=\u91C7\u96C6\u4EFB\u52A1\u7BA1\u7406
- cmsScheduler.name=\u4EFB\u52A1\u540D\u79F0
- cmsScheduler.expression=\u8BA1\u5212\u8868\u8FBE\u5F0F
- cmsScheduler.expression.help=\u53C2\u6570\u4EE5\u9017\u53F7\u5206\u9694,*\u53F7\u8868\u793A\u65E0\u503C,\u51716\u4F4D\:\u6BCF\u4E2A\u6708\u7684\u7B2C\u51E0\u5468,\u6BCF\u5468\u7684\u7B2C\u51E0\u5929,\u5929(\u51E0\u53F7),\u5C0F\u65F6(24\u5C0F\u65F6\u5236),\u5206\u949F,\u79D2\u3002\u5982\uFF1A1,6,4,15,20,30 \u8868\u793A \u4ECE\u4ECA\u5929\u768415\:20\:30\u5F00\u59CB\uFF0C\u6BCF\u9694\u4E00\u4E2A\u6708\u6267\u884C\u4E00\u6B21,\u5373\u4E0B\u6B21\u6267\u884C\u65F6\u95F4\u662F \u4E0B\u4E2A\u6708\u7684\u7B2C\u4E00\u5468\u7684\u7B2C6\u5929\u768415\:20\:30
- cmsScheduler.associate=\u5173\u8054\u4EFB\u52A1
- cmsScheduler.status.0=\u505C\u6B62
- cmsScheduler.status.1=\u8FD0\u884C
- cmsScheduler.opt.start=\u5F00\u59CB
- cmsScheduler.opt.end=\u505C\u6B62
- cmsScheduler.status=\u72B6\u6001
- cmsScheduler.startTime=\u5F00\u59CB\u65F6\u95F4
- cmsScheduler.endTime=\u7ED3\u675F\u65F6\u95F4
- cmsScheduler.log.delete=\u5220\u9664\u4EFB\u52A1
- ==============================模板====================================
- scheduler/add.html
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <#-- Scheduler "add" page: form posting name, associateId and expression (plus the hidden moduleType) to o_save.do. -->
- <#-- Reads from the model: moduleType (optional) and schedulerTaskList (options of the associateId select). -->
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
- <title></title>
- <#include "/jeecms_sys/head.html"/>
- <script type="text/javascript">
- <#-- Custom validator: rejects a select whose chosen option carries the "sel-disabled" class. -->
- $.validator.methods.leafChannel = function(value, element, param) {
- var i = element.selectedIndex;
- return $(element.options[i]).attr("class")!="sel-disabled";
- };
- <#-- NOTE(review): the rule below targets a field named channelId, but no such field is declared in this template; -->
- <#-- it looks carried over from the acquisition form (message key cmsAcquisition.error.notLeafChannel) - confirm before removing. -->
- $(function() {
- $("#jvForm").validate({
- rules: {
- channelId: {
- required: true,
- leafChannel: true
- }
- },
- messages:{
- channelId: {
- leafChannel: "<@s.m "cmsAcquisition.error.notLeafChannel"/>"
- }
- }
- });
- });
- </script>
- <style type="text/css">
- .sel-disabled{background-color:#ccc}
- </style>
- </head>
- <body>
- <div class="body-box">
- <div class="rhead">
- <div class="rpos"><@s.m "global.position"/>: <@s.m "cmsScheduler.acquisition.function"/> - <@s.m "global.add"/></div>
- <#-- "Back to list" form: the submit button sets the action to v_listBy.do and carries moduleType along. -->
- <form class="ropt">
- <input type="hidden" name="moduleType" value="${moduleType!}" />
- <input type="submit" value="<@s.m "global.backToList"/>" onclick="this.form.action='v_listBy.do';"/>
- </form>
- <div class="clear"></div>
- </div>
- <@p.form id="jvForm" action="o_save.do" labelWidth="12">
- <input type="hidden" name="moduleType" value="${moduleType!}" />
- <@p.text colspan="1" width="50" label="cmsScheduler.name" name="name" required="true" class="required" maxlength="50"/>
- <@p.td colspan="1" width="50" label="cmsScheduler.associate" required="true">
- <@p.select list=schedulerTaskList name="associateId" listKey="id" listValue="name"/>
- </@p.td><@p.tr/>
- <@p.textarea colspan="2" label="cmsScheduler.expression" name="expression" help="cmsScheduler.expression.help" helpPosition="3" rows="1" cols="70" required="true" class="required" /><@p.tr/>
- <@p.td colspan="2"><@p.submit code="global.submit"/> <@p.reset code="global.reset"/></@p.td>
- </@p.form>
- </div>
- </body>
- </html>
- scheduler/edit.html
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <#-- Scheduler "edit" page: form posting id, name, associateId and expression (plus the hidden moduleType) to o_update.do. -->
- <#-- Reads from the model: cmsScheduler (the record being edited) and schedulerTaskList (options of the associateId select). -->
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
- <title></title>
- <#include "/jeecms_sys/head.html"/>
- <script type="text/javascript">
- <#-- Custom validator: rejects a select whose chosen option carries the "sel-disabled" class. -->
- $.validator.methods.leafChannel = function(value, element, param) {
- var i = element.selectedIndex;
- return $(element.options[i]).attr("class")!="sel-disabled";
- };
- <#-- NOTE(review): the rule below targets a field named channelId, but no such field is declared in this template; -->
- <#-- it looks carried over from the acquisition form (message key cmsAcquisition.error.notLeafChannel) - confirm before removing. -->
- $(function() {
- $("#jvForm").validate({
- rules: {
- channelId: {
- required: true,
- leafChannel: true
- }
- },
- messages:{
- channelId: {
- leafChannel: "<@s.m "cmsAcquisition.error.notLeafChannel"/>"
- }
- }
- });
- });
- </script>
- <style type="text/css">
- .sel-disabled{background-color:#ccc}
- </style>
- </head>
- <body>
- <div class="body-box">
- <div class="rhead">
- <div class="rpos"><@s.m "global.position"/>: <@s.m "cmsScheduler.acquisition.function"/> - <@s.m "global.edit"/></div>
- <#-- "Back to list" here is a plain button that goes back in browser history (no request parameters are sent). -->
- <form class="ropt">
- <input type="button" value="<@s.m "global.backToList"/>" onclick="history.back();"/>
- </form>
- <div class="clear"></div>
- </div>
- <@p.form id="jvForm" action="o_update.do" labelWidth="12">
- <input type="hidden" name="moduleType" value="${cmsScheduler.moduleType!}" />
- <@p.text colspan="1" width="50" label="cmsScheduler.name" name="name" value=cmsScheduler.name required="true" class="required" maxlength="50"/>
- <@p.td colspan="1" width="50" label="cmsScheduler.associate" required="true">
- <@p.select list=schedulerTaskList name="associateId" value=cmsScheduler.associateId listKey="id" listValue="name"/>
- </@p.td><@p.tr/>
- <@p.textarea colspan="2" label="cmsScheduler.expression" name="expression" rows="1" help="cmsScheduler.expression.help" helpPosition="3" value=cmsScheduler.expression required="true" class="required" cols="70" /><@p.tr/>
- <@p.td colspan="2">
- <@p.hidden name="id" value=cmsScheduler.id/>
- <@p.submit code="global.submit"/> <@p.reset code="global.reset"/>
- </@p.td>
- </@p.form>
- </div>
- </body>
- </html>
- scheduler/list.html
- <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml">
- <head>
- <#-- Scheduler list page: one row per cmsScheduler in "list" with start / stop / edit / delete actions, plus batch delete. -->
- <#-- Reads from the model: list, pageNo (optional) and moduleType (optional). -->
- <#-- Start is a link only while status==0 and stop is a link only while status==1; otherwise the label is plain text. -->
- <#-- The per-row delete link escapes '&' as '&amp;' (XHTML attribute value) and tolerates a missing moduleType via '!'. -->
- <#-- NOTE(review): the batch-delete form (tableForm) does not submit moduleType while the per-row delete link does - confirm against the controller. -->
- <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
- <title></title>
- <#include "/jeecms_sys/head.html"/>
- <script type="text/javascript">
- function getTableForm() {
- return document.getElementById('tableForm');
- }
- <#-- Batch delete: requires at least one checked "ids" box and a confirmation, then posts tableForm to o_delete.do. -->
- function optDelete() {
- if(Pn.checkedCount('ids')<=0) {
- alert("<@s.m 'error.checkRecord'/>");
- return;
- }
- if(!confirm("<@s.m 'global.confirm.delete'/>")) {
- return;
- }
- var f = getTableForm();
- f.action="o_delete.do";
- f.submit();
- }
- </script>
- </head>
- <body>
- <div class="body-box">
- <div class="rhead">
- <div class="rpos"><@s.m "global.position"/>: <@s.m "cmsScheduler.acquisition.function"/> - <@s.m "global.list"/></div>
- <form class="ropt">
- <input type="hidden" name="moduleType" value="${moduleType!}" />
- <input type="submit" value="<@s.m "global.add"/>" onclick="this.form.action='v_add.do';"/>
- </form>
- <div class="clear"></div>
- </div>
- <form id="tableForm" method="post">
- <input type="hidden" name="pageNo" value="${pageNo!}"/>
- <@p.table value=list;cmsScheduler,i,has_next><#rt/>
- <@p.column title="<input type='checkbox' onclick='Pn.checkbox(\"ids\",this.checked)'/>" width="20">
- <input type='checkbox' name='ids' value='${cmsScheduler.id}'/><#t/>
- </@p.column><#t/>
- <@p.column title="ID">${cmsScheduler.id}</@p.column><#t/>
- <@p.column code="cmsScheduler.name">${cmsScheduler.name}</@p.column><#t/>
- <@p.column code="cmsScheduler.status" align="center"><#if cmsScheduler.status==1><strong style="color:red"></#if><@s.m "cmsScheduler.status."+cmsScheduler.status/><#if cmsScheduler.status==1></strong></#if></@p.column><#t/>
- <@p.column code="cmsScheduler.startTime" align="center">${(cmsScheduler.startTime?string('yyyy-MM-dd HH:mm:ss'))!}</@p.column><#t/>
- <@p.column code="cmsScheduler.endTime" align="center">${(cmsScheduler.endTime?string('yyyy-MM-dd HH:mm:ss'))!}</@p.column><#t/>
- <@p.column code="global.operate" align="center">
- <#if cmsScheduler.status==0>
- <a href="o_start.do?id=${cmsScheduler.id}" class="pn-opt"><@s.m "cmsScheduler.opt.start"/></a> | <#rt/>
- <#else>
- <@s.m "cmsScheduler.opt.start"/> | <#rt/>
- </#if>
- <#if cmsScheduler.status==1>
- <a href="o_end.do?id=${cmsScheduler.id}" class="pn-opt"><@s.m "cmsScheduler.opt.end"/></a> | <#rt/>
- <#else>
- <@s.m "cmsScheduler.opt.end"/> | <#rt/>
- </#if>
- <a href="v_edit.do?id=${cmsScheduler.id}" class="pn-opt"><@s.m "global.edit"/></a> | <#rt/>
- <a href="o_delete.do?ids=${cmsScheduler.id}&amp;moduleType=${cmsScheduler.moduleType!}" class="pn-opt" onclick="if(!confirm('<@s.m "global.confirm.delete"/>')) {return false;}"><@s.m "global.delete"/></a><#t/>
- </@p.column><#t/>
- </@p.table>
- <div><input type="button" value="<@s.m "global.delete"/>" onclick="optDelete();"/></div>
- </form>
- </div>
- <#include "/common/alert_message.html"/>
- </body>
- </html>
- generate_left.html 有修改
- 加上
- <@cms_perm url="/scheduler/v_listBy.do">
- <li><a href="../scheduler/v_listBy.do?moduleType=schedulerAcquisitionSvc" target="rightFrame"><@s.m "cmsScheduler.acquisition.function"/></a></li>
- </@cms_perm>