Spark 1.2.0 Source Code Analysis: How Spark Streaming Processes Data

In JobScheduler's start() method there is one key call: jobGenerator.start(). This is where the jobs that actually process the data are generated. The code is as follows:

  /** Start generation of jobs */
  def start(): Unit = synchronized {
    if (eventActor != null) return // generator has already been started

    eventActor = ssc.env.actorSystem.actorOf(Props(new Actor {
      def receive = {
        case event: JobGeneratorEvent => processEvent(event) // create an Actor that handles JobGeneratorEvent messages
      }
    }), "JobGenerator")
    if (ssc.isCheckpointPresent) { // checkpoint data exists from a previous run
      restart()
    } else {
      startFirstTime() // executed on the first start
    }
  }
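
For reference, the JobGeneratorEvent messages handled by this actor are plain case classes that carry a batch time. A sketch of their definitions (reconstructed from memory, so treat the details as approximate):

  private[scheduler] sealed trait JobGeneratorEvent
  private[scheduler] case class GenerateJobs(time: Time) extends JobGeneratorEvent
  private[scheduler] case class ClearMetadata(time: Time) extends JobGeneratorEvent
  private[scheduler] case class DoCheckpoint(time: Time) extends JobGeneratorEvent
  private[scheduler] case class ClearCheckpointData(time: Time) extends JobGeneratorEvent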
Consider the first-start case, startFirstTime(), and trace into it:

  /** Starts the generator for the first time */
  private def startFirstTime() {
    val startTime = new Time(timer.getStartTime()) // getStartTime() returns the time at which the next batch interval begins
    graph.start(startTime - graph.batchDuration) // batchDuration is the batch interval passed when constructing the StreamingContext
    timer.start(startTime.milliseconds) // start the timer
    logInfo("Started JobGenerator at " + startTime)
  }
The timer is defined as follows:

  private val timer = new RecurringTimer(clock, ssc.graph.batchDuration.milliseconds,
    longTime => eventActor ! GenerateJobs(new Time(longTime)), "JobGenerator")
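
The getStartTime() used in startFirstTime() returns the first multiple of the batch interval that lies after the current clock time. A sketch of RecurringTimer.getStartTime() in this version (reconstructed from memory, so treat the exact expression as approximate):

  def getStartTime(): Long = {
    // round the current time up to the next multiple of `period` (the batch interval)
    (math.floor(clock.currentTime.toDouble / period) + 1).toLong * period
  }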
timer.start() is implemented as follows:

  def start(startTime: Long): Long = synchronized {
    nextTime = startTime
    thread.start() // start a dedicated thread
    logInfo("Started timer for " + name + " at time " + nextTime)
    nextTime
  }
Starting that thread actually runs the following:

  private val thread = new Thread("RecurringTimer - " + name) {
    setDaemon(true)
    override def run() { loop }
  }
  /**
   * Repeatedly call the callback every interval.
   */
  private def loop() {
    try {
      while (!stopped) {
        clock.waitTillTime(nextTime)
        callback(nextTime) // the callback passed in when constructing the RecurringTimer
        prevTime = nextTime
        nextTime += period
        logDebug("Callback for " + name + " called at time " + prevTime)
      }
    } catch {
      case e: InterruptedException =>
    }
  }
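
To make the timing behavior concrete, here is a self-contained sketch of the same pattern outside Spark (all names here are illustrative, not Spark's):

  import java.util.concurrent.atomic.AtomicBoolean

  // Self-contained sketch of the recurring-timer pattern (not Spark code).
  object RecurringTimerSketch {
    def main(args: Array[String]): Unit = {
      val period = 1000L                                       // the "batch interval" in milliseconds
      val callback: Long => Unit = t => println(s"tick at $t") // stands in for `eventActor ! GenerateJobs(...)`
      val stopped = new AtomicBoolean(false)

      // Align the first tick to the next multiple of `period`, like getStartTime() does.
      var nextTime = (System.currentTimeMillis() / period + 1) * period

      val thread = new Thread("recurring-timer-sketch") {
        override def run(): Unit = {
          while (!stopped.get()) {
            val sleepMs = nextTime - System.currentTimeMillis()
            if (sleepMs > 0) Thread.sleep(sleepMs)             // stands in for clock.waitTillTime(nextTime)
            callback(nextTime)
            nextTime += period
          }
        }
      }
      thread.setDaemon(true)
      thread.start()

      Thread.sleep(5 * period) // let it tick a few times, then stop
      stopped.set(true)
    }
  }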
The callback that ends up being invoked is eventActor ! GenerateJobs(new Time(longTime)), i.e. a GenerateJobs message is sent to eventActor once per batch interval. The message is handled here:

  /** Processes all events */
  private def processEvent(event: JobGeneratorEvent) {
    logDebug("Got event " + event)
    event match {
      case GenerateJobs(time) => generateJobs(time)
      case ClearMetadata(time) => clearMetadata(time)
      case DoCheckpoint(time) => doCheckpoint(time)
      case ClearCheckpointData(time) => clearCheckpointData(time)
    }
  }
For a GenerateJobs message, the generateJobs(time) method is called:

  /** Generate jobs and perform checkpoint for the given `time`.  */
  private def generateJobs(time: Time) {
    // Set the SparkEnv in this thread, so that job generation code can access the environment
    // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
    // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
    SparkEnv.set(ssc.env)
    Try {
      jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
      graph.generateJobs(time) // generate jobs using allocated blocks -- this is where the jobs are generated
    } match {
      case Success(jobs) =>
        val receivedBlockInfos =
          jobScheduler.receiverTracker.getBlocksOfBatch(time).mapValues { _.toArray }
        jobScheduler.submitJobSet(JobSet(time, jobs, receivedBlockInfos)) // submit the jobs as a JobSet
      case Failure(e) =>
        jobScheduler.reportError("Error generating jobs for time " + time, e)
    }
    eventActor ! DoCheckpoint(time)
  }

The key step to trace next is graph.generateJobs(time):

  def generateJobs(time: Time): Seq[Job] = {
    logDebug("Generating jobs for time " + time)
    val jobs = this.synchronized {
      outputStreams.flatMap(outputStream => outputStream.generateJob(time)) // outputStreams is populated when an output operation such as DStream.print() is called (see the sketch just below)
    }
    logDebug("Generated " + jobs.length + " jobs for time " + time)
    jobs
  }
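
The outputStreams collection is populated by output operations. A sketch of how DStream.print() does this in this version (the bodies are paraphrased, so treat the details as approximate; the full foreachFunc is shown at the end of this post):

  def print() {
    def foreachFunc = (rdd: RDD[T], time: Time) => {
      rdd.take(11).foreach(println) // simplified body
    }
    new ForEachDStream(this, context.sparkContext.clean(foreachFunc)).register()
  }

  private[streaming] def register(): DStream[T] = {
    ssc.graph.addOutputStream(this) // this is what adds the DStream to graph.outputStreams
    this
  }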
Tracing into generateJob(time), the default implementation in DStream is:

  /**
   * Generate a SparkStreaming job for the given time. This is an internal method that
   * should not be called directly. This default implementation creates a job
   * that materializes the corresponding RDD. Subclasses of DStream may override this
   * to generate their own jobs.
   */
  private[streaming] def generateJob(time: Time): Option[Job] = {
    getOrCompute(time) match {
      case Some(rdd) => {
        val jobFunc = () => {
          val emptyFunc = { (iterator: Iterator[T]) => {} } // this function does nothing with the data
          context.sparkContext.runJob(rdd, emptyFunc)
        }
        Some(new Job(time, jobFunc))
      }
      case None => None
    }
  }
Note that this default implementation only materializes the RDD with an empty function and does nothing with its contents. Back to the output streams: when we call DStream.print(), a ForEachDStream is created, and it overrides generateJob() as follows:

  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => {
          ssc.sparkContext.setCallSite(creationSite)
          foreachFunc(rdd, time) // the RDD is captured inside this closure
        }
        Some(new Job(time, jobFunc)) // return a Job that carries the RDD via jobFunc
      case None => None
    }
  }
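
For context, ForEachDStream simply stores its parent DStream and the user-supplied output function. A sketch of its class header (reconstructed from memory, so treat the details as approximate):

  private[streaming] class ForEachDStream[T: ClassTag](
      parent: DStream[T],
      foreachFunc: (RDD[T], Time) => Unit
    ) extends DStream[Unit](parent.ssc)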
Next, trace into the getOrCompute(time) method:

  /**
   * Get the RDD corresponding to the given time; either retrieve it from cache
   * or compute-and-cache it.
   */
  private[streaming] def getOrCompute(time: Time): Option[RDD[T]] = {
    // If RDD was already generated, then retrieve it from HashMap,
    // or else compute the RDD
    generatedRDDs.get(time).orElse {
      // Compute the RDD if time is valid (e.g. correct time in a sliding window)
      // of RDD generation, else generate nothing.
      if (isTimeValid(time)) {
        // Set the thread-local property for call sites to this DStream's creation site
        // such that RDDs generated by compute gets that as their creation site.
        // Note that this `getOrCompute` may get called from another DStream which may have
        // set its own call site. So we store its call site in a temporary variable,
        // set this DStream's creation site, generate RDDs and then restore the previous call site.
        val prevCallSite = ssc.sparkContext.getCallSite()
        ssc.sparkContext.setCallSite(creationSite)
        val rddOption = compute(time) // the real work: compute the RDD for this batch
        ssc.sparkContext.setCallSite(prevCallSite)

        rddOption.foreach { case newRDD =>
          // Register the generated RDD for caching and checkpointing
          if (storageLevel != StorageLevel.NONE) { // a storage level was specified
            newRDD.persist(storageLevel)
            logDebug(s"Persisting RDD ${newRDD.id} for time $time to $storageLevel")
          }
          if (checkpointDuration != null && (time - zeroTime).isMultipleOf(checkpointDuration)) {
            newRDD.checkpoint() // checkpointing is enabled for this DStream
            logInfo(s"Marking RDD ${newRDD.id} for time $time for checkpointing")
          }
          generatedRDDs.put(time, newRDD) // cache the RDD in a HashMap so it can be reused
        }
        rddOption
      } else {
        None
      }
    }
  }
Next comes its compute(time) method. DStream itself does not implement it. Since we constructed a socket stream, the concrete subclass is SocketInputDStream; it has no compute() method either, but its parent class ReceiverInputDStream implements it:

  /**
   * Generates RDDs with blocks received by the receiver of this stream.
   */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val blockRDD = {

      if (validTime < graph.startTime) { // the batch time is before the start time: return an empty RDD
        // If this is called for any time before the start time of the context,
        // then this returns an empty RDD. This may happen when recovering from a
        // driver failure without any write ahead log to recover pre-failure data.
        new BlockRDD[T](ssc.sc, Array.empty)
      } else {
        // Otherwise, ask the tracker for all the blocks that have been allocated to this stream
        // for this batch
        val blockInfos =
          ssc.scheduler.receiverTracker.getBlocksOfBatch(validTime).get(id).getOrElse(Seq.empty) // fetch the block info from the ReceiverTracker
        val blockStoreResults = blockInfos.map { _.blockStoreResult }
        val blockIds = blockStoreResults.map { _.blockId.asInstanceOf[BlockId] }.toArray

        // Check whether all the results are of the same type
        val resultTypes = blockStoreResults.map { _.getClass }.distinct
        if (resultTypes.size > 1) {
          logWarning("Multiple result types in block information, WAL information will be ignored.") // warn if the blocks are not all of the same type
        }

        // If all the results are of type WriteAheadLogBasedStoreResult, then create
        // WriteAheadLogBackedBlockRDD else create simple BlockRDD.
        if (resultTypes.size == 1 && resultTypes.head == classOf[WriteAheadLogBasedStoreResult]) {
          val logSegments = blockStoreResults.map {
            _.asInstanceOf[WriteAheadLogBasedStoreResult].segment
          }.toArray
          // Since storeInBlockManager = false, the storage level does not matter.
          new WriteAheadLogBackedBlockRDD[T](ssc.sparkContext,
            blockIds, logSegments, storeInBlockManager = true, StorageLevel.MEMORY_ONLY_SER) // an RDD backed by the write ahead log
        } else {
          new BlockRDD[T](ssc.sc, blockIds) // a plain BlockRDD
        }
      }
    }
    Some(blockRDD)
  }
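
For context, this walkthrough assumes a driver program roughly like the following (host and port are placeholders). socketTextStream() is what creates the SocketInputDStream, and print() creates and registers the ForEachDStream:

  import org.apache.spark.SparkConf
  import org.apache.spark.streaming.{Seconds, StreamingContext}

  object SocketPrintSketch {
    def main(args: Array[String]): Unit = {
      val conf = new SparkConf().setAppName("SocketPrintSketch").setMaster("local[2]")
      val ssc = new StreamingContext(conf, Seconds(1))    // batchDuration = 1 second
      val lines = ssc.socketTextStream("localhost", 9999) // creates a SocketInputDStream
      lines.print()                                       // creates and registers a ForEachDStream
      ssc.start()                                         // JobScheduler.start() -> JobGenerator.start()
      ssc.awaitTermination()
    }
  }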

Returning to JobGenerator's generateJobs() method: once the jobs have been generated, jobScheduler.submitJobSet(JobSet(time, jobs, receivedBlockInfos)) is called. Its source is:

  def submitJobSet(jobSet: JobSet) {
    if (jobSet.jobs.isEmpty) {
      logInfo("No jobs added for time " + jobSet.time)
    } else {
      jobSets.put(jobSet.time, jobSet)
      jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job))) // each job runs on a thread from the jobExecutor thread pool
      logInfo("Added jobs for time " + jobSet.time)
    }
  }
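
The jobExecutor used here is just a fixed-size thread pool on the driver, sized by spark.streaming.concurrentJobs (default 1). A sketch of its definition in JobScheduler (reconstructed from memory, so treat the details as approximate):

  import java.util.concurrent.Executors

  private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
  private val jobExecutor = Executors.newFixedThreadPool(numConcurrentJobs)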
JobHandler(job) is defined as:

  private class JobHandler(job: Job) extends Runnable {
    def run() {
      eventActor ! JobStarted(job)
      job.run() // run the job; its result is recorded inside the Job
      eventActor ! JobCompleted(job)
    }
  }
And job.run() is simply:

  def run() {
    result = Try(func()) // func() is the jobFunc generated when print() was called
  }
func() is the jobFunc built in ForEachDStream.generateJob() above; the foreachFunc it invokes is the one defined inside DStream.print():

    def foreachFunc = (rdd: RDD[T], time: Time) => {
      val first11 = rdd.take(11)
      println("-------------------------------------------")
      println("Time: " + time)
      println("-------------------------------------------")
      first11.take(10).foreach(println)
      if (first11.size > 10) println("...")
      println()
    }

 **************   The End   **************