Spark Streaming是将流式计算分解成一系列短小的批处理作业。这里的批处理引擎是Spark Engine,也就是把Spark Streaming的输入数据按照batch size(如1秒)分成一段一段的数据(Discretized Stream),每一段数据都转换成Spark中的RDD(Resilient Distributed Dataset),然后将Spark Streaming中对DStream的Transformation操作变为针对Spark中对RDD的Transformation操作,将RDD经过操作变成中间结果保存在内存中。整个流式计算根据业务的需求可以对中间的结果进行叠加,或者存储到外部设备。Spark Streaming属于Spark的核心api,它支持高吞吐量、支持容错的实时流数据处理。它可以接受来自Kafka, Flume, Twitter, ZeroMQ和TCP Socket的数据源,使用简单的api函数比如 map, reduce, join, window等操作,还可以直接使用内置的机器学习算法、图算法包来处理数据。
Spark Streaming基本操作:
import org.apache.log4j.Level
import org.apache.spark.SparkContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext._
import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
object SparkStreaming {
def main(args: Array[String]) {
val sc = new SparkContext("spark://centos.host1:7077", "Spark Streaming")
//创建StreamingContext,20秒一个批次
val ssc = new StreamingContext(sc, Seconds(20))
//获得一个DStream来负责TCP连接(监听端口:地址)
val serverIP = "localhost"
val serverPort = 9999
val lines = ssc.socketTextStream(serverIP, serverPort)
val rdd1 = lines.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
//打印到控制台
rdd1.print()
//获得一个InputDStream来负责监听文件目录
val dataDirectory = "/user/hadoop/data/temp/streaming/";
val inputDStream1 = ssc.fileStream[LongWritable, Text, TextInputFormat](dataDirectory)
val rdd2 = inputDStream1.flatMap(_._2.toString().split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
rdd2.print()
val inputDStream2 = ssc.textFileStream(dataDirectory)
val rdd3 = inputDStream2.flatMap(_.toString().split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
rdd3.print()
//特定的窗口操作,窗口操作涉及两个参数:一个是滑动窗口的宽度(Window Duration);另一个是窗口滑动的频率(Slide Duration)
val inputDStream3 = ssc.fileStream[LongWritable, Text, TextInputFormat](dataDirectory)
val rdd4 = inputDStream3.flatMap(_._2.toString().split(" ")).map(word => (word, 1)).reduceByKeyAndWindow(
(x:Int, y:Int) => (x + y), Seconds(40), Seconds(20))
rdd4.print()
//保存流的内容,文件默认会保存在用户的目录下
//保存流的内容为SequenceFile, 文件目录名 : "prefix-TIME_IN_MS.suffix" rdd-1411894750000.seq
rdd4.saveAsObjectFiles("/user/hadoop/data/temp/rdd", "seq")
//保存流的内容为TextFile, 文件目录名 : "prefix-TIME_IN_MS.suffix" rdd-1411894750000.txt
rdd4.saveAsTextFiles("/user/hadoop/data/temp/rdd", "txt")
//保存流的内容为HadoopFile, 文件目录名 : "prefix-TIME_IN_MS.suffix" rdd-1411894750000.hadoop
//这个API暂时没有正确使用出来
//rdd4.saveAsHadoopFiles("/user/hadoop/data/temp/rdd", "hadoop")
//rdd4.saveAsHadoopFiles("/user/hadoop/data/temp/rdd", "hadoop", Text.class, IntWritable.class, TextOutputFormat.class, conf)
//开始运行
ssc.start()
//计算完毕退出
ssc.awaitTermination()
sc.stop()
}
}