Heritrix Basics Tutorial, Part 4: A Full Code Walkthrough of Starting a Crawl

Date: 2024-04-28 12:07:36

Once a job has been created, the next step is to run it. The full flow, from clicking Start in the console to the crawl actually running, is traced below:

1. Start the job from the web UI


2. index.jsp

Looking at the source behind that page, the Start link is generated (inside a Java string in the JSP, hence the quoting) as:

<a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

3. action.jsp

    String sAction = request.getParameter("action");
    if(sAction != null)
    {
        // Need to handle an action
        if(sAction.equalsIgnoreCase("start"))
        {
            // Tell handler to start crawl job
            handler.startCrawler();
        } else if(sAction.equalsIgnoreCase("stop")) {
            // Tell handler to stop crawl job
            handler.stopCrawler();
        } else if(sAction.equalsIgnoreCase("terminate")) {
            // Delete current job
            if(handler.getCurrentJob() != null) {
                handler.deleteJob(handler.getCurrentJob().getUID());
            }
        } else if(sAction.equalsIgnoreCase("pause")) {
            // Tell handler to pause crawl job
            handler.pauseJob();
        } else if(sAction.equalsIgnoreCase("resume")) {
            // Tell handler to resume crawl job
            handler.resumeJob();
        } else if(sAction.equalsIgnoreCase("checkpoint")) {
            // Checkpoint the running job, if any
            if(handler.getCurrentJob() != null) {
                handler.checkpointJob();
            }
        }
    }
    response.sendRedirect(request.getContextPath() + "/index.jsp");
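Two things are worth noting. The page maps the action query parameter onto CrawlJobHandler calls, then unconditionally redirects back to index.jsp, so refreshing the console never replays an action. The dispatch itself is a plain string-to-method mapping; here is a standalone sketch of the same idea (the class below is hypothetical, and only the action names and handler calls come from the JSP above):

    import java.util.HashMap;
    import java.util.Map;

    public class ActionDispatch {
        public static void main(String[] args) {
            // Each console action mapped to what action.jsp does for it.
            Map<String, Runnable> actions = new HashMap<>();
            actions.put("start",      () -> System.out.println("handler.startCrawler()"));
            actions.put("stop",       () -> System.out.println("handler.stopCrawler()"));
            actions.put("pause",      () -> System.out.println("handler.pauseJob()"));
            actions.put("resume",     () -> System.out.println("handler.resumeJob()"));
            actions.put("checkpoint", () -> System.out.println("handler.checkpointJob()"));

            String sAction = "start"; // in the JSP: request.getParameter("action")
            Runnable r = (sAction == null) ? null : actions.get(sAction.toLowerCase());
            if (r != null) {
                r.run();
            }
        }
    }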

4. CrawlJobHandler.java

The handler object used in action.jsp is the console's CrawlJobHandler instance; the methods involved in starting a crawl follow.

(1)

    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && isCrawling() == false) {
            // Ok, can just start the next job
            startNextJob();
        }
    }
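startCrawler() itself does very little: it flips the running flag and defers to startNextJob() only when a job is queued and nothing is crawling yet. That guard is the entire scheduling policy; a toy model of it, with hypothetical names, looks like this:

    import java.util.ArrayDeque;
    import java.util.Queue;

    public class StartGate {
        private final Queue<String> pendingCrawlJobs = new ArrayDeque<>();
        private boolean crawling = false;

        public synchronized void addJob(String job) {
            pendingCrawlJobs.add(job);
        }

        // Mirrors the guard above: only launch when a job is pending
        // and nothing is currently running.
        public synchronized void startCrawler() {
            if (!pendingCrawlJobs.isEmpty() && !crawling) {
                crawling = true;
                System.out.println("starting " + pendingCrawlJobs.poll());
            }
        }

        public static void main(String[] args) {
            StartGate gate = new StartGate();
            gate.addJob("job-1");
            gate.startCrawler(); // starts job-1
            gate.startCrawler(); // no-op: already crawling
        }
    }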

(2)

    protected final void startNextJob() {
        synchronized (this) {
            if(startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }
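The synchronized block plus join() is what keeps two quick Start clicks from interleaving: a second caller waits for any in-flight startup thread to die before spawning its own, so job startups are strictly serialized. A self-contained sketch of the same pattern (class and method names are illustrative):

    public class SerializedStarter {
        private Thread startingNextJob;

        public void startNextJob(Runnable startTask) {
            synchronized (this) {
                if (startingNextJob != null) {
                    try {
                        // Wait for the previous startup to finish first.
                        startingNextJob.join();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
                startingNextJob = new Thread(startTask, "StartNextJob");
                startingNextJob.start();
            }
        }

        public static void main(String[] args) {
            SerializedStarter starter = new SerializedStarter();
            // The second startup cannot begin before the first
            // startup thread has finished.
            starter.startNextJob(() -> System.out.println("starting job 1"));
            starter.startNextJob(() -> System.out.println("starting job 2"));
        }
    }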

(3)

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = (CrawlJob)pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in the crawlEnding and update the list of completed
            // jobs. Also, crawlEnded can startup next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // now, actually start
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }
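Two details matter here. First, the handler registers itself as a crawl status listener before calling requestCrawlStart(); that is what later lets its crawlEnded() callback clear currentJob, record the completed job, and chain straight into the next pending one. Second, if initialization fails, the job is reloaded from its state file and startNextJobInternal() recurses to try the next candidate. A simplified sketch of the callback chaining (the listener interface below is reduced; Heritrix's real CrawlStatusListener has more callbacks):

    public class ChainingListenerDemo {
        // Reduced stand-in for Heritrix's CrawlStatusListener.
        interface StatusListener {
            void crawlEnded(String exitMessage);
        }

        private String currentJob = "job-1";

        // Roughly what CrawlJobHandler does in its own crawlEnded():
        // clear the current job, then chain to the next pending one.
        final StatusListener listener = exitMessage -> {
            System.out.println(currentJob + " ended: " + exitMessage);
            currentJob = null;
            // ...startNextJobInternal() runs here in the real handler.
        };

        public static void main(String[] args) {
            ChainingListenerDemo demo = new ChainingListenerDemo();
            demo.listener.crawlEnded("Finished");
        }
    }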

5. CrawlController.java

requestCrawlStart() is defined on CrawlController, which startNextJobInternal() reaches through currentJob.getController():

    public void requestCrawlStart() {
        runProcessorInitialTasks();

        sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
        String jobState;
        state = RUNNING;
        jobState = CrawlJob.STATUS_RUNNING;
        sendCrawlStateChangeEvent(this.state, jobState);

        // A proper exit will change this value.
        this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

        Thread statLogger = new Thread(statistics);
        statLogger.setName("StatLogger");
        statLogger.start();

        frontier.start();
    }
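requestCrawlStart() does four things in order: runs each processor's initial tasks, announces the STARTED and then RUNNING states to every registered listener, presets the exit status to FINISHED_ABNORMAL (a clean shutdown overwrites it later), and finally spins up the statistics thread and the frontier, at which point URIs begin to flow. The state announcements are a plain listener fan-out; here is a minimal sketch of what sendCrawlStateChangeEvent amounts to (all names other than that method name are illustrative):

    import java.util.ArrayList;
    import java.util.List;

    public class StateChangeFanOut {
        interface CrawlStateListener {
            void stateChanged(String state, String jobState);
        }

        private final List<CrawlStateListener> listeners = new ArrayList<>();

        public void addListener(CrawlStateListener l) {
            listeners.add(l);
        }

        // Roughly what sendCrawlStateChangeEvent does: notify every
        // registered listener of the controller's new state.
        public void sendCrawlStateChangeEvent(String state, String jobState) {
            for (CrawlStateListener l : listeners) {
                l.stateChanged(state, jobState);
            }
        }

        public static void main(String[] args) {
            StateChangeFanOut controller = new StateChangeFanOut();
            controller.addListener((state, job) -> System.out.println(state + " -> " + job));
            controller.sendCrawlStateChangeEvent("RUNNING", "STATUS_RUNNING");
        }
    }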