1.自定义 schema(RDD[Row] => Dataset[Row])
import org.apache.spark.sql.types._
// Builds a DataFrame from an RDD[Row] plus an explicitly constructed schema.
// Relies on an existing `spark` session and on `org.apache.spark.sql.types._` being in scope.
import org.apache.spark.sql.Row

val peopleRDD = spark.sparkContext.textFile("README.md")

// Space-separated column names of the target schema.
val schemaString = "name age"

// Every column is declared as a nullable string.
val fields = schemaString.split(" ")
  .map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)

// Split each line on commas and wrap the first two fields in a Row.
// NOTE(review): assumes every line has at least two comma-separated fields;
// a shorter line makes attributes(1) throw ArrayIndexOutOfBoundsException.
val rowRDD = peopleRDD
  .map(_.split(","))
  .map(attributes => Row(attributes(0), attributes(1).trim))

// Print the rows on the driver for inspection.
rowRDD.collect().foreach(println)

val df = spark.createDataFrame(rowRDD, schema)
2.借助 case class 隐式转换(RDD[Person] => Dataset[Row])
/** Demonstrates turning an RDD of case-class instances into a DataFrame via `spark.implicits._`. */
object DFTest {
  import org.apache.spark.sql.SparkSession

  /** Record type for one parsed input line: a name and an integer age. */
  case class Person(name: String, age: Int)

  /**
   * Reads "README.md" as text, parses each line as `name,age` into a [[Person]],
   * prints the parsed records, and shows them as a DataFrame.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DataFrame Application")
      .master("local")
      .getOrCreate()
    // Brings the RDD-to-DataFrame conversion (`toDF`) into scope.
    import spark.implicits._

    try {
      val peopleRDD = spark.sparkContext.textFile("README.md")
      // NOTE(review): assumes every line has at least two comma-separated fields
      // and that the second one is an integer.
      val personRDD = peopleRDD
        .map(_.split(","))
        // trim before toInt so that a field such as " 29" parses instead of throwing
        .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      personRDD.collect().foreach(println)
      personRDD.toDF().show()
    } finally {
      // Release the session's resources even if parsing or the job fails.
      spark.stop()
    }
  }
}
3.直接从数据源创建
// Load a CSV file straight into a DataFrame with the "header" reader option enabled.
val df = {
  val csvPath = "/home/lg/Documents/data/1987.csv"
  spark.read.option("header", value = true).csv(csvPath)
}
此外,还可以通过以下方法从其他数据源读取:
spark.read.jdbc
spark.read.json
spark.read.parquet
233