import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, BinaryClassificationEvaluator}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Create the SparkSession (local mode for development/single-machine runs).
val spark = SparkSession.builder()
  .appName("BilibiliAnalysis")
  .config("spark.master", "local")
  .getOrCreate()

// Read the CSV data, using the first row as the header.
// NOTE(review): the original path ends in a directory with no file name —
// confirm the actual CSV location (Spark also accepts a directory of CSVs).
val filePath = "file:///usr/local/hadoop/"
val df = spark.read.option("header", "true").csv(filePath)

// Cast every feature/ranking column to Int and replace nulls with 0.
val convertedDF = df.select(
  col("Views").cast("int"),
  col("Danmaku_Count").cast("int"),
  col("Comment_Count").cast("int"),
  col("Favorite_Count").cast("int"),
  col("Coin_Count").cast("int"),
  col("Share_Count").cast("int"),
  col("Like_Count").cast("int"),
  col("Partition_Ranking").cast("int")
).na.fill(0) // fill nulls with 0

// Binary label: 1 when the video ranks in the top 10 of its partition, else 0.
val labeledDF = convertedDF.withColumn("label", when(col("Partition_Ranking") <= 10, 1).otherwise(0))

// Assemble the feature columns into a single vector column "features".
val featureCols = Array("Views", "Danmaku_Count", "Comment_Count", "Favorite_Count", "Coin_Count", "Share_Count", "Like_Count")
val assembler = new VectorAssembler()
  .setInputCols(featureCols)
  .setOutputCol("features")
val assembledDF = assembler.transform(labeledDF)

// 80/20 train/test split; fixed seed keeps the split reproducible.
val Array(trainData, testData) = assembledDF.randomSplit(Array(0.8, 0.2), seed = 1234)

// Decision-tree classifier over the assembled features.
val dt = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")

// Hyper-parameter grid: tree depth and discretization bins.
val paramGrid = new ParamGridBuilder()
  .addGrid(dt.maxDepth, Array(5, 10, 15))
  .addGrid(dt.maxBins, Array(16, 32, 64))
  .build()

// 5-fold cross-validation optimizing multiclass accuracy.
val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")
val cv = new CrossValidator()
  .setEstimator(dt)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(5) // number of cross-validation folds

// Run the grid search; CrossValidator refits the best model on all of trainData.
val cvModel = cv.fit(trainData)

// Extract the best decision tree found by the search.
val bestModel = cvModel.bestModel.asInstanceOf[DecisionTreeClassificationModel]

// Predict once on the held-out test set (the original defined `predictions`
// three times, which does not compile in a single Scala file).
val predictions = cvModel.transform(testData)

// Accuracy on the test set.
val accuracy = evaluator.evaluate(predictions)
println("Test Accuracy: " + accuracy)

// AUC via BinaryClassificationEvaluator. Use the "rawPrediction" column:
// the original passed the 0/1 "prediction" column, which collapses the ROC
// curve to a single point and understates the AUC.
val binaryEvaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderROC")
val auc = binaryEvaluator.evaluate(predictions)
println("Binary Classification AUC: " + auc)

// Report the hyper-parameters the search selected (these were referenced but
// never defined in the original).
val bestMaxDepth = bestModel.getMaxDepth
val bestMaxBins = bestModel.getMaxBins
println(s"最佳模型参数:maxDepth = $bestMaxDepth, maxBins = $bestMaxBins")

// Release the SparkSession's resources.
spark.stop()