Heritrix Basics Tutorial, Part 4: A Full Code Walkthrough of Starting a Crawl

Date: 2024-04-28 12:07:36

Once a job has been created, the next step is to run it. The full flow, from clicking Start in the console to the crawl actually running, is traced below:

1. Start the job from the web UI


2. index.jsp

Looking at the source behind that page, the Start link is generated (inside a Java string in the JSP, hence the quoting) as:

<a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

3. action.jsp

    String sAction = request.getParameter("action");
    if(sAction != null)
    {
        // Need to handle an action
        if(sAction.equalsIgnoreCase("start"))
        {
            // Tell handler to start crawl job
            handler.startCrawler();
        } else if(sAction.equalsIgnoreCase("stop")) {
            // Tell handler to stop crawl job
            handler.stopCrawler();
        } else if(sAction.equalsIgnoreCase("terminate")) {
            // Delete current job
            if(handler.getCurrentJob() != null) {
                handler.deleteJob(handler.getCurrentJob().getUID());
            }
        } else if(sAction.equalsIgnoreCase("pause")) {
            // Tell handler to pause crawl job
            handler.pauseJob();
        } else if(sAction.equalsIgnoreCase("resume")) {
            // Tell handler to resume crawl job
            handler.resumeJob();
        } else if(sAction.equalsIgnoreCase("checkpoint")) {
            // Checkpoint the running job, if any
            if(handler.getCurrentJob() != null) {
                handler.checkpointJob();
            }
        }
    }
    response.sendRedirect(request.getContextPath() + "/index.jsp");
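Two things are worth noting. The page maps the action query parameter onto CrawlJobHandler calls, then unconditionally redirects back to index.jsp, so refreshing the console never replays an action. The dispatch itself is a plain string-to-method mapping; here is a standalone sketch of the same idea (the class below is hypothetical, and only the action names and handler calls come from the JSP above):

    import java.util.HashMap;
    import java.util.Map;

    public class ActionDispatch {
        public static void main(String[] args) {
            // Each console action mapped to what action.jsp does for it.
            Map<String, Runnable> actions = new HashMap<>();
            actions.put("start",      () -> System.out.println("handler.startCrawler()"));
            actions.put("stop",       () -> System.out.println("handler.stopCrawler()"));
            actions.put("pause",      () -> System.out.println("handler.pauseJob()"));
            actions.put("resume",     () -> System.out.println("handler.resumeJob()"));
            actions.put("checkpoint", () -> System.out.println("handler.checkpointJob()"));

            String sAction = "start"; // in the JSP: request.getParameter("action")
            Runnable r = (sAction == null) ? null : actions.get(sAction.toLowerCase());
            if (r != null) {
                r.run();
            }
        }
    }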

4. CrawlJobHandler.java

The handler object used in action.jsp is the console's CrawlJobHandler instance; the methods involved in starting a crawl follow.

(1)

    public void startCrawler() {
        running = true;
        if (pendingCrawlJobs.size() > 0 && isCrawling() == false) {
            // Ok, can just start the next job
            startNextJob();
        }
    }
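startCrawler() itself does very little: it flips the running flag and defers to startNextJob() only when a job is queued and nothing is crawling yet. That guard is the entire scheduling policy; a toy model of it, with hypothetical names, looks like this:

    import java.util.ArrayDeque;
    import java.util.Queue;

    public class StartGate {
        private final Queue<String> pendingCrawlJobs = new ArrayDeque<>();
        private boolean crawling = false;

        public synchronized void addJob(String job) {
            pendingCrawlJobs.add(job);
        }

        // Mirrors the guard above: only launch when a job is pending
        // and nothing is currently running.
        public synchronized void startCrawler() {
            if (!pendingCrawlJobs.isEmpty() && !crawling) {
                crawling = true;
                System.out.println("starting " + pendingCrawlJobs.poll());
            }
        }

        public static void main(String[] args) {
            StartGate gate = new StartGate();
            gate.addJob("job-1");
            gate.startCrawler(); // starts job-1
            gate.startCrawler(); // no-op: already crawling
        }
    }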

(2)

    protected final void startNextJob() {
        synchronized (this) {
            if(startingNextJob != null) {
                try {
                    startingNextJob.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                    return;
                }
            }
            startingNextJob = new Thread(new Runnable() {
                public void run() {
                    startNextJobInternal();
                }
            }, "StartNextJob");
            startingNextJob.start();
        }
    }
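The synchronized block plus join() is what keeps two quick Start clicks from interleaving: a second caller waits for any in-flight startup thread to die before spawning its own, so job startups are strictly serialized. A self-contained sketch of the same pattern (class and method names are illustrative):

    public class SerializedStarter {
        private Thread startingNextJob;

        public void startNextJob(Runnable startTask) {
            synchronized (this) {
                if (startingNextJob != null) {
                    try {
                        // Wait for the previous startup to finish first.
                        startingNextJob.join();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
                startingNextJob = new Thread(startTask, "StartNextJob");
                startingNextJob.start();
            }
        }

        public static void main(String[] args) {
            SerializedStarter starter = new SerializedStarter();
            // The second startup cannot begin before the first
            // startup thread has finished.
            starter.startNextJob(() -> System.out.println("starting job 1"));
            starter.startNextJob(() -> System.out.println("starting job 2"));
        }
    }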

(3)

    protected void startNextJobInternal() {
        if (pendingCrawlJobs.size() == 0 || isCrawling()) {
            // No job ready or already crawling.
            return;
        }
        this.currentJob = (CrawlJob)pendingCrawlJobs.first();
        assert pendingCrawlJobs.contains(currentJob) :
            "pendingCrawlJobs is in an illegal state";
        pendingCrawlJobs.remove(currentJob);
        try {
            this.currentJob.setupForCrawlStart();
            // This is ugly but needed so I can clear the currentJob
            // reference in the crawlEnding and update the list of completed
            // jobs. Also, crawlEnded can startup next job.
            this.currentJob.getController().addCrawlStatusListener(this);
            // now, actually start
            this.currentJob.getController().requestCrawlStart();
        } catch (InitializationException e) {
            loadJob(getStateJobFile(this.currentJob.getDirectory()));
            this.currentJob = null;
            startNextJobInternal(); // Load the next job if there is one.
        }
    }
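Two details matter here. First, the handler registers itself as a crawl status listener before calling requestCrawlStart(); that is what later lets its crawlEnded() callback clear currentJob, record the completed job, and chain straight into the next pending one. Second, if initialization fails, the job is reloaded from its state file and startNextJobInternal() recurses to try the next candidate. A simplified sketch of the callback chaining (the listener interface below is reduced; Heritrix's real CrawlStatusListener has more callbacks):

    public class ChainingListenerDemo {
        // Reduced stand-in for Heritrix's CrawlStatusListener.
        interface StatusListener {
            void crawlEnded(String exitMessage);
        }

        private String currentJob = "job-1";

        // Roughly what CrawlJobHandler does in its own crawlEnded():
        // clear the current job, then chain to the next pending one.
        final StatusListener listener = exitMessage -> {
            System.out.println(currentJob + " ended: " + exitMessage);
            currentJob = null;
            // ...startNextJobInternal() runs here in the real handler.
        };

        public static void main(String[] args) {
            ChainingListenerDemo demo = new ChainingListenerDemo();
            demo.listener.crawlEnded("Finished");
        }
    }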

5. CrawlController.java

requestCrawlStart() is defined on CrawlController, which startNextJobInternal() reaches through currentJob.getController():

    public void requestCrawlStart() {
        runProcessorInitialTasks();

        sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
        String jobState;
        state = RUNNING;
        jobState = CrawlJob.STATUS_RUNNING;
        sendCrawlStateChangeEvent(this.state, jobState);

        // A proper exit will change this value.
        this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

        Thread statLogger = new Thread(statistics);
        statLogger.setName("StatLogger");
        statLogger.start();

        frontier.start();
    }
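requestCrawlStart() does four things in order: runs each processor's initial tasks, announces the STARTED and then RUNNING states to every registered listener, presets the exit status to FINISHED_ABNORMAL (a clean shutdown overwrites it later), and finally spins up the statistics thread and the frontier, at which point URIs begin to flow. The state announcements are a plain listener fan-out; here is a minimal sketch of what sendCrawlStateChangeEvent amounts to (all names other than that method name are illustrative):

    import java.util.ArrayList;
    import java.util.List;

    public class StateChangeFanOut {
        interface CrawlStateListener {
            void stateChanged(String state, String jobState);
        }

        private final List<CrawlStateListener> listeners = new ArrayList<>();

        public void addListener(CrawlStateListener l) {
            listeners.add(l);
        }

        // Roughly what sendCrawlStateChangeEvent does: notify every
        // registered listener of the controller's new state.
        public void sendCrawlStateChangeEvent(String state, String jobState) {
            for (CrawlStateListener l : listeners) {
                l.stateChanged(state, jobState);
            }
        }

        public static void main(String[] args) {
            StateChangeFanOut controller = new StateChangeFanOut();
            controller.addListener((state, job) -> System.out.println(state + " -> " + job));
            controller.sendCrawlStateChangeEvent("RUNNING", "STATUS_RUNNING");
        }
    }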