Spark 机器学习------逻辑回归

package Spark_MLlib

import javassist.bytecode.SignatureAttribute.ArrayType

import org.apache.spark.sql.SparkSession

import org.apache.spark.ml.{Pipeline, PipelineModel}

import org.apache.spark.ml.classification.LogisticRegression

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

import org.apache.spark.ml.linalg.Vector

import org.apache.spark.sql.Row

/**
  * Logistic-regression text-classification example built on the Spark ML Pipeline API.
  *
  * The pipeline tokenizes a text column, hashes the tokens into a term-frequency
  * feature vector, and fits a logistic-regression classifier on the result.
  *
  * API reference:
  * http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.package
  */
object 逻辑回归 {

  /** Local two-thread session shared by the whole example. */
  val spark: SparkSession =
    SparkSession.builder().master("local[2]").appName("逻辑回归").getOrCreate()

  import spark.implicits._

  /**
    * Trains the pipeline on a small labelled corpus, then prints the schema,
    * the predictions and the intermediate feature columns for three test rows.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    try {
      // Labelled training corpus: (id, text, label).
      // NOTE(review): the id literals were missing from the original listing;
      // 0..3 is assumed so they precede the test ids 4..6 - confirm.
      val training = spark
        .createDataFrame(Seq(
          (0, "soyo spark soyo1", 1.0),
          (1, "hadoop spark", 1.0),
          (2, "zhouhang xiaohai", 0.0),
          (3, "hbase spark hive soyo", 1.0)
        ))
        .toDF("id", "text", "label")

      // Transformers: split text into words, then hash words into a fixed-size vector.
      val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
      val hashingTF = new HashingTF()
        .setNumFeatures(1000) // the recorded output shows feature vectors of size 1000
        .setInputCol(tokenizer.getOutputCol)
        .setOutputCol("features")

      // Estimator.
      // NOTE(review): the iteration count was missing from the original listing;
      // 10 is assumed - confirm.
      val lr = new LogisticRegression()
        .setMaxIter(10)    // maximum number of iterations
        .setRegParam(0.01) // regularization parameter

      val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

      // Fitted pipeline model.
      val model = pipeline.fit(training)

      // Unlabelled test data: (id, text). Ids 4..6 match the recorded output.
      val test = spark
        .createDataFrame(Seq(
          (4, "spark i like"),
          (5, "hadoop spark book"),
          (6, "soyo9 soy 88")
        ))
        .toDF("id", "text")

      test.show()

      // Build the scored DataFrame once and reuse it for every report below.
      val predictions = model.transform(test)

      predictions.schema.foreach(println)

      predictions
        .select("id", "text", "probability", "prediction")
        .collect()
        .foreach { case Row(id: Int, text: String, prob: Vector, prediction: Double) =>
          println(s"($id,$text)----->prob=$prob,prediction=$prediction")
        }

      // Intermediate columns produced by the transformer stages.
      predictions
        .select("id", "text", "features", "rawPrediction")
        .collect()
        .foreach { case Row(id: Int, text: String, features: Vector, rawPrediction: Vector) =>
          println(s"id=$id,text=$text,features=$features,rawPrediction=$rawPrediction")
        }
    } finally {
      // Release the session even when training or scoring fails.
      spark.stop()
    }
  }

}

结果：

+---+-----------------+
| id|             text|
+---+-----------------+
|  4|     spark i like|
|  5|hadoop spark book|
|  6|     soyo9 soy 88|
+---+-----------------+

StructField(id,IntegerType,false)
StructField(text,StringType,true)
StructField(words,ArrayType(StringType,true),true)
StructField(features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)
StructField(rawPrediction,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)
StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)
StructField(prediction,DoubleType,true)
(4,spark i like)----->prob=[0.033501882964501836,0.9664981170354981],prediction=1.0                                准确率
(5,hadoop spark book)----->prob=[0.011175823696937707,0.9888241763030623],prediction=1.0                  准确率
(6,soyo9 soy 88)----->prob=[0.26222944363302514,0.7377705563669748],prediction=1.0                              准确率（误判了）但值较低
id=4,text=spark i like,features=(1000,[105,329,330],[1.0,1.0,1.0]),rawPrediction=[-3.3620777052692805,3.3620777052692805]
id=5,text=hadoop spark book,features=(1000,[105,181,393],[1.0,1.0,1.0]),rawPrediction=[-4.482763689867715,4.482763689867715]
id=6,text=soyo9 soy 88,features=(1000,[543,602,976],[1.0,1.0,1.0]),rawPrediction=[-1.0344130174468225,1.0344130174468225]

秒客网

Spark 机器学习------逻辑回归

相关文章