1. RDD action -> SparkContext.runJob -> DAGScheduler.runJob
def runJob[T, U: ClassTag](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: String,
    allowLocal: Boolean,
    resultHandler: (Int, U) => Unit,
    properties: Properties = null)
{
  val waiter = submitJob(rdd, func, partitions, callSite, allowLocal, resultHandler, properties)
  waiter.awaitResult() match {
    case JobSucceeded => {}
    case JobFailed(exception: Exception) =>
      logInfo("Failed to run " + callSite)
      throw exception
  }
}
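For reference, this is roughly how a driver program reaches this code path: an action such as count() or collect() calls SparkContext.runJob, which forwards to the DAGScheduler.runJob above. A minimal hedged sketch (the local[2] master, app name, and partition count are arbitrary):

import org.apache.spark.{SparkConf, SparkContext}

object RunJobDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("runJob-demo").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 100, numSlices = 4)

    // count() internally calls sc.runJob(rdd, func, ...), which forwards to
    // DAGScheduler.runJob / submitJob shown in this section.
    val counts: Array[Int] = sc.runJob(rdd, (iter: Iterator[Int]) => iter.size)
    println(counts.sum)   // same value as rdd.count()

    sc.stop()
  }
}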
2. submitJob
/**
 * Submit a job to the job scheduler and get a JobWaiter object back. The JobWaiter object
 * can be used to block until the job finishes executing or can be used to cancel the job.
 */
def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: String,
    allowLocal: Boolean,
    resultHandler: (Int, U) => Unit,
    properties: Properties = null): JobWaiter[U] =
{
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
      "Total number of partitions: " + maxPartitions)
  }

  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }

  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  // Send a JobSubmitted message to eventProcessActor. A question: how does the RDD end up
  // in a message here? Is its metadata (partition/location information) serialized?
  eventProcessActor ! JobSubmitted(
    jobId, rdd, func2, partitions.toArray, allowLocal, callSite, waiter, properties)
  waiter
}
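On the question in the comment above: JobSubmitted is simply a case class event that carries a reference to the RDD. Since eventProcessActor lives in the driver's own ActorSystem, the message is delivered in-process and nothing is serialized at this point; the RDD and closure are only serialized later, when tasks are built in submitMissingTasks. The shape of the event, approximated from the Spark 1.x source (not shown in this post):

private[scheduler] case class JobSubmitted(
    jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    allowLocal: Boolean,
    callSite: String,
    listener: JobListener,
    properties: Properties = null)
  extends DAGSchedulerEvent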
3. DAGSchedulerEventProcessActor
private[scheduler] class DAGSchedulerEventProcessActor(dagScheduler: DAGScheduler)
  extends Actor with Logging {

  override def preStart() {
    // set DAGScheduler for taskScheduler to ensure eventProcessActor is always
    // valid when the messages arrive
    dagScheduler.taskScheduler.setDAGScheduler(dagScheduler)
  }

  /**
   * The main event loop of the DAG scheduler.
   */
  def receive = {
    case JobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite, listener, properties) =>
      dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, allowLocal, callSite,
        listener, properties)

    case StageCancelled(stageId) =>
      dagScheduler.handleStageCancellation(stageId)

    case JobCancelled(jobId) =>
      dagScheduler.handleJobCancellation(jobId)

    case JobGroupCancelled(groupId) =>
      dagScheduler.handleJobGroupCancelled(groupId)

    case AllJobsCancelled =>
      dagScheduler.doCancelAllJobs()

    // ... remaining cases (executor events, task completion, failures, etc.) omitted ...
  }
}
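The point of funneling everything through a single actor is that the scheduler's mutable state (stageIdToStage, runningStages, and so on) is only ever touched from the actor's message-processing thread, so the handlers do not need to synchronize. A standalone sketch of the pattern; the event names here are made up for illustration, not Spark's:

import akka.actor.{Actor, ActorSystem, Props}

sealed trait Event
case class Submitted(jobId: Int) extends Event
case class Cancelled(jobId: Int) extends Event

class EventLoop extends Actor {
  // Messages are dequeued and handled one at a time, so these handlers never race.
  def receive = {
    case Submitted(id) => println(s"handle submitted job $id")
    case Cancelled(id) => println(s"handle cancelled job $id")
  }
}

object EventLoopDemo extends App {
  val system = ActorSystem("demo")
  val loop = system.actorOf(Props[EventLoop], "eventLoop")
  loop ! Submitted(0)   // fire-and-forget, like eventProcessActor ! JobSubmitted(...)
  Thread.sleep(500)
  system.shutdown()     // Akka 2.2/2.3-era API, as used by Spark 1.x; newer Akka uses terminate()
}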
4. The actor calls handleJobSubmitted
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    allowLocal: Boolean,
    callSite: String,
    listener: JobListener,
    properties: Properties = null)
{
  var finalStage: Stage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    finalStage = newStage(finalRDD, partitions.size, None, jobId, Some(callSite))
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }
  if (finalStage != null) {
    val job = new ActiveJob(jobId, finalStage, func, partitions, callSite, listener, properties)
    clearCacheLocs()
    logInfo("Got job %s (%s) with %d output partitions (allowLocal=%s)".format(
      job.jobId, callSite, partitions.length, allowLocal))
    logInfo("Final stage: " + finalStage + "(" + finalStage.name + ")")
    logInfo("Parents of final stage: " + finalStage.parents)
    logInfo("Missing parents: " + getMissingParentStages(finalStage))
    if (allowLocal && finalStage.parents.size == 0 && partitions.length == 1) {
      // Compute very short actions like first() or take() with no parent stages locally.
      // If there is only one partition, no parent stages, and local execution is allowed,
      // the job runs on a separate driver-side thread (see runLocally below).
      listenerBus.post(SparkListenerJobStart(job.jobId, Array[Int](), properties))
      runLocally(job)
    } else {
      jobIdToActiveJob(jobId) = job
      activeJobs += job
      resultStageToJob(finalStage) = job
      listenerBus.post(SparkListenerJobStart(job.jobId, jobIdToStageIds(jobId).toArray,
        properties))
      submitStage(finalStage)
    }
  }
  submitWaitingStages()
}
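As a concrete example of what the "Parents of final stage" log above refers to: a reduceByKey introduces a ShuffleDependency, so the final result stage gets a parent ShuffleMapStage. A hedged driver-side sketch, assuming an existing SparkContext named sc:

import org.apache.spark.SparkContext._   // needed in Spark 1.x for reduceByKey on pair RDDs

val words  = sc.parallelize(Seq("a b a", "b c"))
val counts = words.flatMap(_.split(" ")).map(w => (w, 1)).reduceByKey(_ + _)  // ShuffleDependency
counts.collect()
// collect() -> runJob -> handleJobSubmitted:
//   final stage  = result Stage over the post-shuffle RDD
//   parent stage = the ShuffleMapStage for the map side of reduceByKey,
//                  reported as missing by getMissingParentStages on the first run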
/**
 * Create a Stage -- either directly for use as a result stage, or as part of the (re)-creation
 * of a shuffle map stage in newOrUsedStage. The stage will be associated with the provided
 * jobId. Production of shuffle map stages should always use newOrUsedStage, not newStage
 * directly.
 */
private def newStage(
    rdd: RDD[_],
    numTasks: Int,
    shuffleDep: Option[ShuffleDependency[_,_]],
    jobId: Int,
    callSite: Option[String] = None)
  : Stage =
{
  val id = nextStageId.getAndIncrement()
  val stage =
    new Stage(id, rdd, numTasks, shuffleDep, getParentStages(rdd, jobId), jobId, callSite)
  stageIdToStage(id) = stage
  updateJobIdStageIdMaps(jobId, stage)
  stageToInfos(stage) = StageInfo.fromStage(stage)
  stage
}
/**
 * Run a job on an RDD locally, assuming it has only a single partition and no dependencies.
 * We run the operation in a separate thread just in case it takes a bunch of time, so that we
 * don't block the DAGScheduler event loop or other concurrent jobs.
 */
protected def runLocally(job: ActiveJob) {
  logInfo("Computing the requested partition locally")
  new Thread("Local computation of job " + job.jobId) {
    override def run() {
      runLocallyWithinThread(job)
    }
  }.start()
}
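For context on which user calls actually hit this local path: in this Spark version, first() and take() pass allowLocal = true, so on a one-partition RDD with no parent stages they run in the thread above rather than going through submitStage. A small hedged example, assuming an existing SparkContext sc:

val small = sc.parallelize(Seq(1, 2, 3), numSlices = 1)  // single partition, no parents
small.first()    // allowLocal = true  -> handled by runLocally on a driver-side thread
small.collect()  // allowLocal = false -> normal submitStage / task scheduling path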
5. submitStage: if any parent stages are missing, recursively submit them first
/** Submits stage, but first recursively submits any missing parents. */
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id)
      logDebug("missing: " + missing)
      if (missing == Nil) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
        runningStages += stage
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id)
  }
}
private def getMissingParentStages(stage: Stage): List[Stage] = {
  val missing = new HashSet[Stage]
  val visited = new HashSet[RDD[_]]
  def visit(rdd: RDD[_]) {
    if (!visited(rdd)) {
      visited += rdd
      if (getCacheLocs(rdd).contains(Nil)) { // if any cache location is Nil, this RDD counts as missing
        for (dep <- rdd.dependencies) {
          dep match { // two cases: a ShuffleDependency yields a shuffle map stage; a NarrowDependency is visited recursively
            case shufDep: ShuffleDependency[_,_] =>
              val mapStage = getShuffleMapStage(shufDep, stage.jobId)
              if (!mapStage.isAvailable) {
                missing += mapStage
              }
            case narrowDep: NarrowDependency[_] =>
              visit(narrowDep.rdd)
          }
        }
      }
    }
  }
  visit(stage.rdd)
  missing.toList
}
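Taken together, submitStage and getMissingParentStages amount to a post-order walk of the stage DAG: parents are submitted before their child, and a child with missing parents is parked in waitingStages until they finish. A self-contained sketch of that idea using made-up types (Node, done), not Spark's:

import scala.collection.mutable

case class Node(id: Int, parents: List[Node]) { var done: Boolean = false }

def submit(node: Node, run: Node => Unit, waiting: mutable.Set[Node]): Unit = {
  val missing = node.parents.filterNot(_.done)
  if (missing.isEmpty) {
    run(node)                                      // no missing parents: runnable right away
  } else {
    missing.foreach(p => submit(p, run, waiting))  // submit the parents first
    waiting += node                                // park the child until its parents finish
  }
}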
6. submitMissingTasks
/** Called when stage's parents are available and we can now do its task. */
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // Get our pending tasks and remember them in our pendingTasks entry
  val myPending = pendingTasks.getOrElseUpdate(stage, new HashSet)
  myPending.clear()
  var tasks = ArrayBuffer[Task[_]]()
  if (stage.isShuffleMap) {
    // Create a ShuffleMapTask for every partition of the stage whose outputLocs is still Nil.
    for (p <- 0 until stage.numPartitions if stage.outputLocs(p) == Nil) {
      val locs = getPreferredLocs(stage.rdd, p)
      tasks += new ShuffleMapTask(stage.id, stage.rdd, stage.shuffleDep.get, p, locs)
    }
  } else {
    // This is a final stage; figure out its job's missing partitions
    val job = resultStageToJob(stage)
    for (id <- 0 until job.numPartitions if !job.finished(id)) {
      val partition = job.partitions(id)
      val locs = getPreferredLocs(stage.rdd, partition)
      tasks += new ResultTask(stage.id, stage.rdd, job.func, partition, locs, id) // create a ResultTask
    }
  }

  val properties = if (jobIdToActiveJob.contains(jobId)) {
    jobIdToActiveJob(stage.jobId).properties
  } else {
    // this stage will be assigned to "default" pool
    null
  }

  // The listener bus must be notified before a possible NotSerializableException:
  // "StageSubmitted" should be posted first and then "JobEnded".
  listenerBus.post(SparkListenerStageSubmitted(stageToInfos(stage), properties))

  if (tasks.size > 0) {
    // Preemptively serialize a task to make sure it can be serialized. We are catching this
    // exception here because it would be fairly hard to catch the non-serializable exception
    // down the road, where we have several different implementations for local scheduler and
    // cluster schedulers.
    try {
      SparkEnv.get.closureSerializer.newInstance().serialize(tasks.head)
    } catch {
      case e: NotSerializableException =>
        abortStage(stage, "Task not serializable: " + e.toString)
        runningStages -= stage
        return
    }

    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    myPending ++= tasks
    logDebug("New pending tasks: " + myPending)
    // Wrap the tasks into a TaskSet and hand it to taskScheduler.submitTasks.
    taskScheduler.submitTasks(
      new TaskSet(tasks.toArray, stage.id, stage.newAttemptId(), stage.jobId, properties))
    stageToInfos(stage).submissionTime = Some(System.currentTimeMillis())
  } else {
    logDebug("Stage " + stage + " is actually done; %b %d %d".format(
      stage.isAvailable, stage.numAvailableOutputs, stage.numPartitions))
    runningStages -= stage
  }
}
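The "preemptively serialize a task" step exists so that a NotSerializableException surfaces on the driver immediately, instead of only when executors try to deserialize tasks. A minimal standalone sketch of the same fail-fast idea using plain Java serialization (Spark itself uses its configured closureSerializer):

import java.io.{ByteArrayOutputStream, NotSerializableException, ObjectOutputStream}

// Try serializing the object once; fail fast with a clear message if it cannot be serialized.
def checkSerializable(obj: AnyRef): Unit = {
  try {
    val out = new ObjectOutputStream(new ByteArrayOutputStream())
    out.writeObject(obj)
    out.close()
  } catch {
    case e: NotSerializableException =>
      throw new IllegalArgumentException("Task not serializable: " + e, e)
  }
}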
7. TaskSet: the steps above find an RDD's missing partitions, generate a task for each of them, and schedule those tasks together as one TaskSet.
/**
 * A set of tasks submitted together to the low-level TaskScheduler, usually representing
 * missing partitions of a particular stage.
 */
private[spark] class TaskSet(
    val tasks: Array[Task[_]],
    val stageId: Int,
    val attempt: Int,
    val priority: Int,
    val properties: Properties) {
  val id: String = stageId + "." + attempt

  def kill(interruptThread: Boolean) {
    tasks.foreach(_.kill(interruptThread))
  }

  override def toString: String = "TaskSet " + id
}
8. taskScheduler.submitTasks
override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    val manager = new TaskSetManager(this, taskSet, maxTaskFailures)
    activeTaskSets(taskSet.id) = manager
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient memory")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT, STARVATION_TIMEOUT)
    }
    hasReceivedTask = true
  }
  backend.reviveOffers()
}
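The starvation timer here is a generic "keep warning until work starts" pattern: a repeating java.util.Timer task logs a warning until hasLaunchedTask flips, then cancels itself. A standalone sketch of the same pattern; the 15-second interval is illustrative, Spark reads STARVATION_TIMEOUT from its configuration:

import java.util.{Timer, TimerTask}
import java.util.concurrent.atomic.AtomicBoolean

val hasLaunchedTask = new AtomicBoolean(false)
val timeoutMs = 15000L  // illustrative default; Spark's value comes from STARVATION_TIMEOUT

val starvationTimer = new Timer("starvation-check", true)
starvationTimer.scheduleAtFixedRate(new TimerTask {
  override def run(): Unit = {
    if (!hasLaunchedTask.get()) {
      println("WARN: Initial job has not accepted any resources; check the cluster UI")
    } else {
      this.cancel()  // stop the repeating warning once a task has launched
    }
  }
}, timeoutMs, timeoutMs)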