目录
总述
01-10天,基本语法
11-20天,线性数据结构
21-30天,树与二叉树
31-40天,图
41-50天,查找与排序
51-60天,kNN 与 NB
61-70天,决策树与集成学习
71-80天,BP 神经网络
81-90天,CNN 卷积神经网络
示例数据下载地址
github.com/fansmale/javasampledata
参见《Weka 中的数据表基本管理》, 以及 luv_x_c 对本贴的评论.
第 51 天: kNN 分类器
kNN 的原始论文为: T. Cover and P. Hart. Nearest neighbor pattern classification. IEEE Transactions on Information Theory, IT-13, pages 21–27, 1967.
这个代码 300 行, 分三天完成. 今天先把代码抄完并运行, 明后天有修改程序的工作. 要求熟练掌握.
kNN 的特点:
- 简单. 没有学习过程, 也被称为惰性学习 lazy learning. 类似于开卷考试, 在已有数据中去找答案.
- 本源. 找相似, 正是人类认识事物的常用方法, 隐藏于人类或者其他动物的基因里面. 当然, 人类也会上当, 例如有人把邻居的滴水观音误认为是芋头, 偷食后中毒.
- 效果好. 永远不要小视 kNN, 对于很多数据, 你很难设计算法超越它.
- 适应性强. 可用于分类, 回归. 可用于各种数据.
- 可扩展性强. 设计不同的度量, 可获得意想不到的效果.
- 一般需要对数据归一化. 归一化的示意代码见本列表之后.
- 复杂度高. 这也是 kNN 最重要的缺点. 对于每一个测试数据, 复杂度为 $O((m+k)n)$, 其中 $n$ 为训练数据个数, $m$ 为条件属性个数, $k$ 为邻居个数. 代码见 computeNearests().
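关于归一化, 下面给出一个 min-max 归一化的示意代码. 这只是一个草稿, 假设数据已读入 weka.core.Instances 对象, 且最后一个属性为决策属性, 不参与归一化.
/**
*********************
* Min-max normalization for all condition attributes. A sketch.
*********************
*/
public static void normalize(Instances paraDataset) {
for (int j = 0; j < paraDataset.numAttributes() - 1; j++) {
// Step 1. Obtain the minimal and maximal values of attribute j.
double tempMin = paraDataset.instance(0).value(j);
double tempMax = tempMin;
for (int i = 1; i < paraDataset.numInstances(); i++) {
double tempValue = paraDataset.instance(i).value(j);
if (tempValue < tempMin) {
tempMin = tempValue;
} else if (tempValue > tempMax) {
tempMax = tempValue;
} // Of if
} // Of for i
// Step 2. Rescale to [0, 1]. Avoid division by zero.
if (tempMax == tempMin) {
continue;
} // Of if
for (int i = 0; i < paraDataset.numInstances(); i++) {
paraDataset.instance(i).setValue(j, (paraDataset.instance(i).value(j) - tempMin) / (tempMax - tempMin));
} // Of for i
} // Of for j
}// Of normalize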
代码说明:
1. 两种距离度量.
2. 数据随机分割方式.
3. 间址的灵活使用: trainingSet 和 testingSet 都是整数数组, 表示下标.
4. arff 文件的读取. 需要 weka.core 包.
5. 求邻居.
6. 投票.
package machinelearning.knn;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.*;
/**
* kNN classification.
*
* @author Fan Min minfanphd@163.com.
*/
public class KnnClassification {
/**
* Manhattan distance.
*/
public static final int MANHATTAN = 0;
/**
* Euclidean distance.
*/
public static final int EUCLIDEAN = 1;
/**
* The distance measure.
*/
public int distanceMeasure = EUCLIDEAN;
/**
* A random instance.
*/
public static final Random random = new Random();
/**
* The number of neighbors.
*/
int numNeighbors = 7;
/**
* The whole dataset.
*/
Instances dataset;
/**
* The training set. Represented by the indices of the data.
*/
int[] trainingSet;
/**
* The testing set. Represented by the indices of the data.
*/
int[] testingSet;
/**
* The predictions.
*/
int[] predictions;
/**
*********************
* The first constructor.
*
* @param paraFilename
* The arff filename.
*********************
*/
public KnnClassification(String paraFilename) {
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
// The last attribute is the decision class.
dataset.setClassIndex(dataset.numAttributes() - 1);
fileReader.close();
} catch (Exception ee) {
("Error occurred while trying to read \'" + paraFilename
+ "\' in KnnClassification constructor.\r\n" + ee);
(0);
} // Of try
}// Of the first constructor
/**
*********************
* Get random indices for data randomization.
*
* @param paraLength
* The length of the sequence.
* @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
*********************
*/
public static int[] getRandomIndices(int paraLength) {
int[] resultIndices = new int[paraLength];
// Step 1. Initialize.
for (int i = 0; i < paraLength; i++) {
resultIndices[i] = i;
} // Of for i
// Step 2. Randomly swap.
int tempFirst, tempSecond, tempValue;
for (int i = 0; i < paraLength; i++) {
// Generate two random indices.
tempFirst = random.nextInt(paraLength);
tempSecond = random.nextInt(paraLength);
// Swap.
tempValue = resultIndices[tempFirst];
resultIndices[tempFirst] = resultIndices[tempSecond];
resultIndices[tempSecond] = tempValue;
} // Of for i
return resultIndices;
}// Of getRandomIndices
/**
*********************
* Split the data into training and testing parts.
*
* @param paraTrainingFraction
* The fraction of the training set.
*********************
*/
public void splitTrainingTesting(double paraTrainingFraction) {
int tempSize = dataset.numInstances();
int[] tempIndices = getRandomIndices(tempSize);
int tempTrainingSize = (int) (tempSize * paraTrainingFraction);
trainingSet = new int[tempTrainingSize];
testingSet = new int[tempSize - tempTrainingSize];
for (int i = 0; i < tempTrainingSize; i++) {
trainingSet[i] = tempIndices[i];
} // Of for i
for (int i = 0; i < tempSize - tempTrainingSize; i++) {
testingSet[i] = tempIndices[tempTrainingSize + i];
} // Of for i
}// Of splitTrainingTesting
/**
*********************
* Predict for the whole testing set. The results are stored in predictions.
* @see predictions
*********************
*/
public void predict() {
predictions = new int[testingSet.length];
for (int i = 0; i < predictions.length; i++) {
predictions[i] = predict(testingSet[i]);
} // Of for i
}// Of predict
/**
*********************
* Predict for the given instance.
*
* @param paraIndex
* The index of the given instance.
* @return The prediction.
*********************
*/
public int predict(int paraIndex) {
int[] tempNeighbors = computeNearests(paraIndex);
int resultPrediction = simpleVoting(tempNeighbors);
return resultPrediction;
}// Of predict
/**
*********************
* The distance between two instances.
*
* @param paraI
* The index of the first instance.
* @param paraJ
* The index of the second instance.
* @return The distance.
*********************
*/
public double distance(int paraI, int paraJ) {
double resultDistance = 0;
double tempDifference;
switch (distanceMeasure) {
case MANHATTAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
if (tempDifference < 0) {
resultDistance -= tempDifference;
} else {
resultDistance += tempDifference;
} // Of if
} // Of for i
break;
case EUCLIDEAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
resultDistance += tempDifference * tempDifference;
} // Of for i
break;
default:
("Unsupported distance measure: " + distanceMeasure);
}// Of switch
return resultDistance;
}// Of distance
/**
*********************
* Get the accuracy of the classifier.
*
* @return The accuracy.
*********************
*/
public double getAccuracy() {
// A double divided by an int gives a double.
double tempCorrect = 0;
for (int i = 0; i < predictions.length; i++) {
if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
return tempCorrect / testingSet.length;
}// Of getAccuracy
/**
************************************
* Compute the nearest k neighbors. Select one neighbor in each scan. In
* fact we can scan only once. You may implement it by yourself.
*
* @param paraCurrent
* current instance. We are comparing it with all others.
* @return the indices of the nearest instances.
************************************
*/
public int[] computeNearests(int paraCurrent) {
int[] resultNearests = new int[numNeighbors];
boolean[] tempSelected = new boolean[trainingSet.length];
double tempMinimalDistance;
int tempMinimalIndex = 0;
// Compute all distances to avoid redundant computation.
double[] tempDistances = new double[trainingSet.length];
for (int i = 0; i < trainingSet.length; i++) {
tempDistances[i] = distance(paraCurrent, trainingSet[i]);
}//Of for i
// Select the nearest paraK indices.
for (int i = 0; i < numNeighbors; i++) {
tempMinimalDistance = Double.MAX_VALUE;
for (int j = 0; j < trainingSet.length; j++) {
if (tempSelected[j]) {
continue;
} // Of if
if (tempDistances[j] < tempMinimalDistance) {
tempMinimalDistance = tempDistances[j];
tempMinimalIndex = j;
} // Of if
} // Of for j
resultNearests[i] = trainingSet[tempMinimalIndex];
tempSelected[tempMinimalIndex] = true;
} // Of for i
("The nearest of " + paraCurrent + " are: " + (resultNearests));
return resultNearests;
}// Of computeNearests
/**
************************************
* Voting using the instances.
*
* @param paraNeighbors
* The indices of the neighbors.
* @return The predicted label.
************************************
*/
public int simpleVoting(int[] paraNeighbors) {
int[] tempVotes = new int[dataset.numClasses()];
for (int i = 0; i < paraNeighbors.length; i++) {
tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
} // Of for i
int tempMaximalVotingIndex = 0;
int tempMaximalVoting = 0;
for (int i = 0; i < dataset.numClasses(); i++) {
if (tempVotes[i] > tempMaximalVoting) {
tempMaximalVoting = tempVotes[i];
tempMaximalVotingIndex = i;
} // Of if
} // Of for i
return tempMaximalVotingIndex;
}// Of simpleVoting
/**
*********************
* The entrance of the program.
*
* @param args
* Not used now.
*********************
*/
public static void main(String args[]) {
KnnClassification tempClassifier = new KnnClassification("D:/data/iris.arff");
tempClassifier.splitTrainingTesting(0.8);
tempClassifier.predict();
System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
}// Of main
}// Of class KnnClassification
在 github.com/FanSmale/sampledata/ 可下载 iris.arff. 万一访问不畅, 把下面的内容拷贝另存成 iris.arff 即可.
@RELATION iris
@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
@DATA
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.4,3.7,1.5,0.2,Iris-setosa
4.8,3.4,1.6,0.2,Iris-setosa
4.8,3.0,1.4,0.1,Iris-setosa
4.3,3.0,1.1,0.1,Iris-setosa
5.8,4.0,1.2,0.2,Iris-setosa
5.7,4.4,1.5,0.4,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
5.7,3.8,1.7,0.3,Iris-setosa
5.1,3.8,1.5,0.3,Iris-setosa
5.4,3.4,1.7,0.2,Iris-setosa
5.1,3.7,1.5,0.4,Iris-setosa
4.6,3.6,1.0,0.2,Iris-setosa
5.1,3.3,1.7,0.5,Iris-setosa
4.8,3.4,1.9,0.2,Iris-setosa
5.0,3.0,1.6,0.2,Iris-setosa
5.0,3.4,1.6,0.4,Iris-setosa
5.2,3.5,1.5,0.2,Iris-setosa
5.2,3.4,1.4,0.2,Iris-setosa
4.7,3.2,1.6,0.2,Iris-setosa
4.8,3.1,1.6,0.2,Iris-setosa
5.4,3.4,1.5,0.4,Iris-setosa
5.2,4.1,1.5,0.1,Iris-setosa
5.5,4.2,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
5.0,3.2,1.2,0.2,Iris-setosa
5.5,3.5,1.3,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
4.5,2.3,1.3,0.3,Iris-setosa
4.4,3.2,1.3,0.2,Iris-setosa
5.0,3.5,1.6,0.6,Iris-setosa
5.1,3.8,1.9,0.4,Iris-setosa
4.8,3.0,1.4,0.3,Iris-setosa
5.1,3.8,1.6,0.2,Iris-setosa
4.6,3.2,1.4,0.2,Iris-setosa
5.3,3.7,1.5,0.2,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
6.4,3.2,4.5,1.5,Iris-versicolor
6.9,3.1,4.9,1.5,Iris-versicolor
5.5,2.3,4.0,1.3,Iris-versicolor
6.5,2.8,4.6,1.5,Iris-versicolor
5.7,2.8,4.5,1.3,Iris-versicolor
6.3,3.3,4.7,1.6,Iris-versicolor
4.9,2.4,3.3,1.0,Iris-versicolor
6.6,2.9,4.6,1.3,Iris-versicolor
5.2,2.7,3.9,1.4,Iris-versicolor
5.0,2.0,3.5,1.0,Iris-versicolor
5.9,3.0,4.2,1.5,Iris-versicolor
6.0,2.2,4.0,1.0,Iris-versicolor
6.1,2.9,4.7,1.4,Iris-versicolor
5.6,2.9,3.6,1.3,Iris-versicolor
6.7,3.1,4.4,1.4,Iris-versicolor
5.6,3.0,4.5,1.5,Iris-versicolor
5.8,2.7,4.1,1.0,Iris-versicolor
6.2,2.2,4.5,1.5,Iris-versicolor
5.6,2.5,3.9,1.1,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
6.1,2.8,4.0,1.3,Iris-versicolor
6.3,2.5,4.9,1.5,Iris-versicolor
6.1,2.8,4.7,1.2,Iris-versicolor
6.4,2.9,4.3,1.3,Iris-versicolor
6.6,3.0,4.4,1.4,Iris-versicolor
6.8,2.8,4.8,1.4,Iris-versicolor
6.7,3.0,5.0,1.7,Iris-versicolor
6.0,2.9,4.5,1.5,Iris-versicolor
5.7,2.6,3.5,1.0,Iris-versicolor
5.5,2.4,3.8,1.1,Iris-versicolor
5.5,2.4,3.7,1.0,Iris-versicolor
5.8,2.7,3.9,1.2,Iris-versicolor
6.0,2.7,5.1,1.6,Iris-versicolor
5.4,3.0,4.5,1.5,Iris-versicolor
6.0,3.4,4.5,1.6,Iris-versicolor
6.7,3.1,4.7,1.5,Iris-versicolor
6.3,2.3,4.4,1.3,Iris-versicolor
5.6,3.0,4.1,1.3,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
7.1,3.0,5.9,2.1,Iris-virginica
6.3,2.9,5.6,1.8,Iris-virginica
6.5,3.0,5.8,2.2,Iris-virginica
7.6,3.0,6.6,2.1,Iris-virginica
4.9,2.5,4.5,1.7,Iris-virginica
7.3,2.9,6.3,1.8,Iris-virginica
6.7,2.5,5.8,1.8,Iris-virginica
7.2,3.6,6.1,2.5,Iris-virginica
6.5,3.2,5.1,2.0,Iris-virginica
6.4,2.7,5.3,1.9,Iris-virginica
6.8,3.0,5.5,2.1,Iris-virginica
5.7,2.5,5.0,2.0,Iris-virginica
5.8,2.8,5.1,2.4,Iris-virginica
6.4,3.2,5.3,2.3,Iris-virginica
6.5,3.0,5.5,1.8,Iris-virginica
7.7,3.8,6.7,2.2,Iris-virginica
7.7,2.6,6.9,2.3,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.4,2.8,5.6,2.1,Iris-virginica
7.2,3.0,5.8,1.6,Iris-virginica
7.4,2.8,6.1,1.9,Iris-virginica
7.9,3.8,6.4,2.0,Iris-virginica
6.4,2.8,5.6,2.2,Iris-virginica
6.3,2.8,5.1,1.5,Iris-virginica
6.1,2.6,5.6,1.4,Iris-virginica
7.7,3.0,6.1,2.3,Iris-virginica
6.3,3.4,5.6,2.4,Iris-virginica
6.4,3.1,5.5,1.8,Iris-virginica
6.0,3.0,4.8,1.8,Iris-virginica
6.9,3.1,5.4,2.1,Iris-virginica
6.7,3.1,5.6,2.4,Iris-virginica
6.9,3.1,5.1,2.3,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.8,3.2,5.9,2.3,Iris-virginica
6.7,3.3,5.7,2.5,Iris-virginica
6.7,3.0,5.2,2.3,Iris-virginica
6.3,2.5,5.0,1.9,Iris-virginica
6.5,3.0,5.2,2.0,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica
第 52 天: kNN 分类器 (续)
- 重新实现 computeNearests, 仅需要扫描一遍训练集, 即可获得 $k$ 个邻居. 提示: 将现有代码与插入排序思想相结合, 示意代码见本列表之后. 其时间复杂度为 $O(kn)$, 其中 $O(n)$ 用于扫描训练集, $O(k)$ 用于插入.
- 增加 setDistanceMeasure() 方法.
- 增加 setNumNeighbors() 方法.
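一遍扫描的一个示意实现如下. 这只是一个草稿, 复用第 51 天的成员变量 trainingSet, numNeighbors 与 distance() 方法: 用两个同步的有序数组维护当前最近的 k 个邻居, 新邻居按插入排序的方式插入.
/**
*********************
* Compute the nearest k neighbors with only one scan. A sketch.
*********************
*/
public int[] computeNearestsOneScan(int paraCurrent) {
int[] resultNearests = new int[numNeighbors];
double[] tempDistances = new double[numNeighbors];
Arrays.fill(tempDistances, Double.MAX_VALUE);
// Scan the training set only once, O(n).
for (int i = 0; i < trainingSet.length; i++) {
double tempDistance = distance(paraCurrent, trainingSet[i]);
if (tempDistance >= tempDistances[numNeighbors - 1]) {
continue; // Farther than the current kth nearest neighbor.
} // Of if
// One step of insertion sort, O(k): shift backwards to make room.
int j;
for (j = numNeighbors - 2; j >= 0 && tempDistances[j] > tempDistance; j--) {
tempDistances[j + 1] = tempDistances[j];
resultNearests[j + 1] = resultNearests[j];
} // Of for j
tempDistances[j + 1] = tempDistance;
resultNearests[j + 1] = trainingSet[i];
} // Of for i
return resultNearests;
}// Of computeNearestsOneScan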
第 53 天: kNN 分类器 (续)
- 增加 weightedVoting() 方法, 距离越短话语权越大. 支持两种以上的加权方式. 示意代码见本列表之后.
- 实现 leave-one-out 测试.
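weightedVoting() 的一个示意实现如下. 这只是一个草稿: 与 simpleVoting() 不同, 这里假设邻居的距离一并传入, 因此 computeNearests() 与 predict() 的接口需要相应修改. 权重取 1/(距离 + 小常数), 距离越短话语权越大; 也可以改用 exp(-距离) 等其它加权方式.
/**
*********************
* Weighted voting. A sketch: the distances are assumed to be passed in.
*********************
*/
public int weightedVoting(int[] paraNeighbors, double[] paraDistances) {
double[] tempVotes = new double[dataset.numClasses()];
for (int i = 0; i < paraNeighbors.length; i++) {
// Shorter distance, bigger weight. The small constant avoids division by zero.
double tempWeight = 1.0 / (paraDistances[i] + 1e-6);
tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()] += tempWeight;
} // Of for i
int resultIndex = 0;
for (int i = 1; i < tempVotes.length; i++) {
if (tempVotes[i] > tempVotes[resultIndex]) {
resultIndex = i;
} // Of if
} // Of for i
return resultIndex;
}// Of weightedVoting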
第 54 天: 基于 M-distance 的推荐
这里夹带一点私货, 即论文 Mei Zheng, Fan Min, Heng-Ru Zhang, Wen-Bin Chen, Fast recommendations with the M-distance, IEEE Access 4 (2016) 1464–1468 的源代码. 点击下载论文.
- 评分表以 (用户, 项目, 评分) 三元组的压缩方式给出. 见 github.com/FanSmale/sampledata/ 中的 movielens-943u1682m.txt.
前几行数据为:
0,0,5
0,1,3
0,2,4
0,3,3
0,4,3
0,5,5
0,6,4
…
1,0,4
1,9,2
1,12,4
其中, “0,2,4” 表示用户 0 对项目 2 的评分为 4. 用户 1 对项目 1、2 等没有评分, 表示没看过该电影. 在用户数、项目数很多时, 必须使用压缩存储.
- 一篇论文的代码就这么一点点. 当然, 这篇论文本身很简单. 所谓 M-distance, 就是根据平均分来计算两个用户 (或项目) 之间的距离.
炫一下数学表达式. 令项目 $j$ 的平均分为 $\overline{r_{\cdot j}}$,
采用 item-based recommendation, 则第 $j$ 个项目关于第 $i$ 个用户的邻居项目集合为
$$N_{ij} = \{1 \leq j' \leq m \mid j' \neq j,\ p_{ij'} \neq 0,\ |\overline{r_{\cdot j}} - \overline{r_{\cdot j'}}| < \epsilon\} \tag{1}$$
第 $i$ 个用户对第 $j$ 个项目的评分预测为:
$$p_{ij} = \frac{\sum_{j' \in N_{ij}} r_{ij'}}{|N_{ij}|} \tag{2}$$
- 邻居不用 $k$ 控制. 距离小于 radius (即 $\epsilon$) 的都是邻居. 使用 M-distance 时, 这种方式效果更好.
- 使用 leave-one-out 的测试方式, 只有很高效的算法才能使用这种方式. 其高效的原因见本列表之后的说明.
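顺便解释 leave-one-out 为何高效: 去掉当前评分后, 项目的平均分不必重新扫描数据, 可由原平均分直接算出. 设项目 $j$ 有 $d_j$ 个评分, 平均分为 $\overline{r_{\cdot j}}$, 去掉评分 $r_{ij}$ 后的平均分为
$$\overline{r_{\cdot j}}' = \frac{d_j \overline{r_{\cdot j}} - r_{ij}}{d_j - 1}.$$
这正是下面代码 leaveOneOutPrediction() 中 Step 1 的做法.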
package machinelearning.mbr;

import java.io.*;

/**
* Recommendation with M-distance.
*
* @author Fan Min minfanphd@163.com.
*/
public class MBR {
/**
* Default rating for 1-5 points.
*/
public static final double DEFAULT_RATING = 3.0;
/**
* The total number of users.
*/
private int numUsers;
/**
* The total number of items.
*/
private int numItems;
/**
* The total number of ratings (non-zero values)
*/
private int numRatings;
/**
* The predictions.
*/
private double[] predictions;
/**
* Compressed rating matrix. User-item-rating triples.
*/
private int[][] compressedRatingMatrix;
/**
* The degree of users (how many items the user has rated).
*/
private int[] userDegrees;
/**
* The average rating of the current user.
*/
private double[] userAverageRatings;
/**
* The degree of items (how many users have rated it).
*/
private int[] itemDegrees;
/**
* The average rating of the current item.
*/
private double[] itemAverageRatings;
/**
* The first user starts from 0. If the first user has x ratings, the second
* user will start from x.
*/
private int[] userStartingIndices;
/**
* Number of non-neighbor objects.
*/
private int numNonNeighbors;
/**
* The radius (epsilon) for determining the neighborhood.
*/
private double radius;
/**
*************************
* Construct the rating matrix.
*
* @param paraFilename
* the rating filename.
* @param paraNumUsers
* number of users
* @param paraNumItems
* number of items
* @param paraNumRatings
* number of ratings
*************************
*/
public MBR(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
// Step 1. Initialize these arrays
numItems = paraNumItems;
numUsers = paraNumUsers;
numRatings = paraNumRatings;
userDegrees = new int[numUsers];
userStartingIndices = new int[numUsers + 1];
userAverageRatings = new double[numUsers];
itemDegrees = new int[numItems];
compressedRatingMatrix = new int[numRatings][3];
itemAverageRatings = new double[numItems];
predictions = new double[numRatings];
("Reading " + paraFilename);
// Step 2. Read the data file.
File tempFile = new File(paraFilename);
if (!tempFile.exists()) {
System.out.println("File " + paraFilename + " does not exist.");
System.exit(0);
} // Of if
BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile));
String tempString;
String[] tempStrArray;
int tempIndex = 0;
userStartingIndices[0] = 0;
userStartingIndices[numUsers] = numRatings;
while ((tempString = tempBufReader.readLine()) != null) {
// Each line has three values
tempStrArray = tempString.split(",");
compressedRatingMatrix[tempIndex][0] = Integer.parseInt(tempStrArray[0]);
compressedRatingMatrix[tempIndex][1] = Integer.parseInt(tempStrArray[1]);
compressedRatingMatrix[tempIndex][2] = Integer.parseInt(tempStrArray[2]);
userDegrees[compressedRatingMatrix[tempIndex][0]]++;
itemDegrees[compressedRatingMatrix[tempIndex][1]]++;
if (tempIndex > 0) {
// Starting to read the data of a new user.
if (compressedRatingMatrix[tempIndex][0] != compressedRatingMatrix[tempIndex - 1][0]) {
userStartingIndices[compressedRatingMatrix[tempIndex][0]] = tempIndex;
} // Of if
} // Of if
tempIndex++;
} // Of while
tempBufReader.close();
double[] tempUserTotalScore = new double[numUsers];
double[] tempItemTotalScore = new double[numItems];
for (int i = 0; i < numRatings; i++) {
tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
} // Of for i
for (int i = 0; i < numUsers; i++) {
userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
} // Of for i
for (int i = 0; i < numItems; i++) {
itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
} // Of for i
}// Of the first constructor
/**
*************************
* Set the radius (delta).
*
* @param paraRadius
* The given radius.
*************************
*/
public void setRadius(double paraRadius) {
if (paraRadius > 0) {
radius = paraRadius;
} else {
radius = 0.1;
} // Of if
}// Of setRadius
/**
*************************
* Leave-one-out prediction. The predicted values are stored in predictions.
*
* @see predictions
*************************
*/
public void leaveOneOutPrediction() {
double tempItemAverageRating;
// Make each line of the code shorter.
int tempUser, tempItem, tempRating;
("\r\nLeaveOneOutPrediction for radius " + radius);
numNonNeighbors = 0;
for (int i = 0; i < numRatings; i++) {
tempUser = compressedRatingMatrix[i][0];
tempItem = compressedRatingMatrix[i][1];
tempRating = compressedRatingMatrix[i][2];
// Step 1. Recompute average rating of the current item.
tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
/ (itemDegrees[tempItem] - 1);
// Step 2. Recompute neighbors, at the same time obtain the ratings
// Of neighbors.
int tempNeighbors = 0;
double tempTotal = 0;
int tempComparedItem;
for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
tempComparedItem = compressedRatingMatrix[j][1];
if (tempItem == tempComparedItem) {
continue;// Ignore itself.
} // Of if
if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
tempTotal += compressedRatingMatrix[j][2];
tempNeighbors++;
} // Of if
} // Of for j
// Step 3. Predict as the average value of neighbors.
if (tempNeighbors > 0) {
predictions[i] = tempTotal / tempNeighbors;
} else {
predictions[i] = DEFAULT_RATING;
numNonNeighbors++;
} // Of if
} // Of for i
}// Of leaveOneOutPrediction
/**
*************************
* Compute the MAE based on the deviation of each leave-one-out.
*
* @author Fan Min
*************************
*/
public double computeMAE() throws Exception {
double tempTotalError = 0;
for (int i = 0; i < predictions.length; i++) {
tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
} // Of for i
return tempTotalError / predictions.length;
}// Of computeMAE
/**
*************************
* Compute the RSME based on the deviation of each leave-one-out.
*
* @author Fan Min
*************************
*/
public double computeRSME() throws Exception {
double tempTotalError = 0;
for (int i = 0; i < predictions.length; i++) {
tempTotalError += (predictions[i] - compressedRatingMatrix[i][2])
* (predictions[i] - compressedRatingMatrix[i][2]);
} // Of for i
double tempAverage = tempTotalError / predictions.length;
return Math.sqrt(tempAverage);
}// Of computeRSME
/**
*************************
* The entrance of the program.
*
* @param args
* Not used now.
*************************
*/
public static void main(String[] args) {
try {
MBR tempRecommender = new MBR("D:/data/movielens-943u1682m.txt", 943, 1682, 100000);
for (double tempRadius = 0.2; tempRadius < 0.6; tempRadius += 0.1) {
tempRecommender.setRadius(tempRadius);
tempRecommender.leaveOneOutPrediction();
double tempMAE = tempRecommender.computeMAE();
double tempRSME = tempRecommender.computeRSME();
System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
+ ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
} // Of for tempRadius
} catch (Exception ee) {
System.out.println(ee);
} // Of try
}// Of main
}// Of class MBR
第 55 天: 基于 M-distance 的推荐 (续)
昨天实现的是 item-based recommendation. 今天自己来实现一下 user-based recommendation. 只需要在原有代码的基础上增加相应方法即可.
提示: 数据是按照用户优先存放的, 因此 item-based recommendation 时,
j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++
就可将 tempUser 的所有评分信息读入. 然而, user-based recommendation 没有这样的便利. 为解决该问题, 可以有两种方案:
- 将压缩矩阵转置, 用户与项目关系互换. 这种方案要增加相应的代码, 但复杂度低, 推荐使用. 示意代码见本列表之后.
- 扫描时数据不再连续, 每次需要遍历整个数据集. 这种方案实现简单, 但复杂度高.
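第一种方案的示意代码如下. 这只是一个草稿, 基于 MBR 类已有的成员变量 compressedRatingMatrix, itemDegrees, numItems 与 numRatings: 借用计数排序的思想, 先由 itemDegrees 算出每个项目的起始下标, 再一遍扫描完成重排, 复杂度为 O(numRatings).
/**
*************************
* Transpose the compressed rating matrix to item-first order. A sketch.
*************************
*/
private int[][] transposeRatingMatrix() {
int[][] resultMatrix = new int[numRatings][3];
// Step 1. The starting index of each item, computed from itemDegrees.
int[] tempItemStartingIndices = new int[numItems + 1];
for (int i = 0; i < numItems; i++) {
tempItemStartingIndices[i + 1] = tempItemStartingIndices[i] + itemDegrees[i];
} // Of for i
// Step 2. Scan once and place each triple into its slot.
int[] tempCounters = new int[numItems];
for (int i = 0; i < numRatings; i++) {
int tempItem = compressedRatingMatrix[i][1];
int tempPosition = tempItemStartingIndices[tempItem] + tempCounters[tempItem];
// Swap the first two columns: now column 0 is the item, column 1 is the user.
resultMatrix[tempPosition][0] = tempItem;
resultMatrix[tempPosition][1] = compressedRatingMatrix[i][0];
resultMatrix[tempPosition][2] = compressedRatingMatrix[i][2];
tempCounters[tempItem]++;
} // Of for i
return resultMatrix;
}// Of transposeRatingMatrix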
第 56 天: kMeans 聚类
kMeans 是最常用的聚类算法.
- kMeans 聚类需要在中心点收敛时结束, 这里偷懒使用了 Arrays.equals().
- 数据集为 iris, 所以最后一个属性没使用. 对于没有决策属性的数据集, 需要进行相应修改.
- 数据没有归一化.
- getRandomIndices() 和 kNN 的完全相同, 直接拷贝过来. 本来应该写在公共的工具类里面, 但代码不多, 为保证独立性就放这里了.
- distance() 和 kNN 的相似, 注意不要用决策属性, 而且参数不同: 第 2 个参数为实数向量, 这是因为中心可能是虚拟的, 数据集中并没有相应的对象.
package machinelearning.kmeans;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.Instances;
/**
* kMeans clustering.
* @author Fan Min minfanphd@163.com.
*/
public class KMeans {
/**
* Manhattan distance.
*/
public static final int MANHATTAN = 0;
/**
* Euclidean distance.
*/
public static final int EUCLIDEAN = 1;
/**
* The distance measure.
*/
public int distanceMeasure = EUCLIDEAN;
/**
* A random instance.
*/
public static final Random random = new Random();
/**
* The data.
*/
Instances dataset;
/**
* The number of clusters.
*/
int numClusters = 2;
/**
* The clusters.
*/
int[][] clusters;
/**
*******************************
* The first constructor.
*
* @param paraFilename
* The data filename.
*******************************
*/
public KMeans(String paraFilename) {
dataset = null;
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
fileReader.close();
} catch (Exception ee) {
("Cannot read the file: " + paraFilename + "\r\n" + ee);
(0);
} // Of try
}// Of the first constructor
/**
*******************************
* A setter.
*******************************
*/
public void setNumClusters(int paraNumClusters) {
numClusters = paraNumClusters;
}// Of the setter
/**
*********************
* Get random indices for data randomization.
*
* @param paraLength
* The length of the sequence.
* @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
*********************
*/
public static int[] getRandomIndices(int paraLength) {
int[] resultIndices = new int[paraLength];
// Step 1. Initialize.
for (int i = 0; i < paraLength; i++) {
resultIndices[i] = i;
} // Of for i
// Step 2. Randomly swap.
int tempFirst, tempSecond, tempValue;
for (int i = 0; i < paraLength; i++) {
// Generate two random indices.
tempFirst = random.nextInt(paraLength);
tempSecond = random.nextInt(paraLength);
// Swap.
tempValue = resultIndices[tempFirst];
resultIndices[tempFirst] = resultIndices[tempSecond];
resultIndices[tempSecond] = tempValue;
} // Of for i
return resultIndices;
}// Of getRandomIndices
/**
*********************
* The distance between two instances.
*
* @param paraI
* The index of the first instance.
* @param paraArray
* The array representing a point in the space.
* @return The distance.
*********************
*/
public double distance(int paraI, double[] paraArray) {
double resultDistance = 0;
double tempDifference;
switch (distanceMeasure) {
case MANHATTAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
if (tempDifference < 0) {
resultDistance -= tempDifference;
} else {
resultDistance += tempDifference;
} // Of if
} // Of for i
break;
case EUCLIDEAN:
for (int i = 0; i < dataset.numAttributes() - 1; i++) {
tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
resultDistance += tempDifference * tempDifference;
} // Of for i
break;
default:
("Unsupported distance measure: " + distanceMeasure);
}// Of switch
return resultDistance;
}// Of distance
/**
*******************************
* Clustering.
*******************************
*/
public void clustering() {
int[] tempOldClusterArray = new int[dataset.numInstances()];
tempOldClusterArray[0] = -1;
int[] tempClusterArray = new int[dataset.numInstances()];
Arrays.fill(tempClusterArray, 0);
double[][] tempCenters = new double[numClusters][dataset.numAttributes() - 1];
// Step 1. Initialize centers.
int[] tempRandomOrders = getRandomIndices(dataset.numInstances());
for (int i = 0; i < numClusters; i++) {
for (int j = 0; j < tempCenters[0].length; j++) {
tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
} // Of for j
} // Of for i
int[] tempClusterLengths = null;
while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
System.out.println("New loop ...");
tempOldClusterArray = tempClusterArray;
tempClusterArray = new int[dataset.numInstances()];
// Step 2.1 Minimization. Assign cluster to each instance.
int tempNearestCenter;
double tempNearestDistance;
double tempDistance;
for (int i = 0; i < dataset.numInstances(); i++) {
tempNearestCenter = -1;
tempNearestDistance = Double.MAX_VALUE;
for (int j = 0; j < numClusters; j++) {
tempDistance = distance(i, tempCenters[j]);
if (tempNearestDistance > tempDistance) {
tempNearestDistance = tempDistance;
tempNearestCenter = j;
} // Of if
} // Of for j
tempClusterArray[i] = tempNearestCenter;
} // Of for i
// Step 2.2 Mean. Find new centers.
tempClusterLengths = new int[numClusters];
Arrays.fill(tempClusterLengths, 0);
double[][] tempNewCenters = new double[numClusters][dataset.numAttributes() - 1];
// Arrays.fill(tempNewCenters, 0);
for (int i = 0; i < dataset.numInstances(); i++) {
for (int j = 0; j < tempNewCenters[0].length; j++) {
tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j);
} // Of for j
tempClusterLengths[tempClusterArray[i]]++;
} // Of for i
// Step 2.3 Now average
for (int i = 0; i < tempNewCenters.length; i++) {
for (int j = 0; j < tempNewCenters[0].length; j++) {
tempNewCenters[i][j] /= tempClusterLengths[i];
} // Of for j
} // Of for i
("Now the new centers are: " + (tempNewCenters));
tempCenters = tempNewCenters;
} // Of while
// Step 3. Form clusters.
clusters = new int[numClusters][];
int[] tempCounters = new int[numClusters];
for (int i = 0; i < numClusters; i++) {
clusters[i] = new int[tempClusterLengths[i]];
} // Of for i
for (int i = 0; i < tempClusterArray.length; i++) {
clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
tempCounters[tempClusterArray[i]]++;
} // Of for i
("The clusters are: " + (clusters));
}// Of clustering
/**
*******************************
* Test clustering.
*******************************
*/
public static void testClustering() {
KMeans tempKMeans = new KMeans("D:/data/iris.arff");
tempKMeans.setNumClusters(3);
tempKMeans.clustering();
}// Of testClustering
/**
*************************
* A testing method.
*************************
*/
public static void main(String args[]) {
testClustering();
}// Of main
}// Of class KMeans
第 57 天: kMeans 聚类 (续)
获得虚拟中心后, 换成与其最近的点作为实际中心, 再聚类.
今天主要是想控制下节奏. 毕竟 kMeans 也值得两天的工作量.
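下面给出一个示意方法, 仅为草稿, 基于 KMeans 类已有的 dataset 与 distance(): 对每个虚拟中心, 在数据集中找与其最近的实际对象, 用该对象的条件属性值作为新中心, 之后再据此聚类.
/**
*******************************
* Replace each virtual center by its nearest actual instance. A sketch.
*******************************
*/
public double[][] computeRealCenters(double[][] paraVirtualCenters) {
double[][] resultCenters = new double[paraVirtualCenters.length][paraVirtualCenters[0].length];
for (int i = 0; i < paraVirtualCenters.length; i++) {
// Step 1. Find the nearest actual instance of the virtual center.
int tempNearestIndex = 0;
double tempNearestDistance = Double.MAX_VALUE;
for (int j = 0; j < dataset.numInstances(); j++) {
double tempDistance = distance(j, paraVirtualCenters[i]);
if (tempDistance < tempNearestDistance) {
tempNearestDistance = tempDistance;
tempNearestIndex = j;
} // Of if
} // Of for j
// Step 2. Use its condition attribute values as the new center.
for (int j = 0; j < resultCenters[0].length; j++) {
resultCenters[i][j] = dataset.instance(tempNearestIndex).value(j);
} // Of for j
} // Of for i
return resultCenters;
}// Of computeRealCenters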
第 58 天: 符号型数据的 NB 算法
Naive Bayes 是一种用后验概率公式推导出的算法. 它有一个独立性假设, 从数学上看起来不靠谱, 但从机器学习效果来看是不错的. 写程序之前, 先阅读《NB 算法 (包括符号型与数值型, 结合 Java 程序分析)》进行学习.
- 所有的程序都在今天列出, 但今天只研究符号型数据的分类. 为此, 可以只抄符号型数据相关的方法 (从 main() 顺藤摸瓜开始有选择性地抄), 明天再抄数值型数据处理算法. 421 行的代码仅仅是测试训练与测试集不同的情况, 没有必要抄.
- 必须自己举一个小的例子 (如 10 个对象, 3 个条件属性, 2 个类别) 来辅助理解. 本列表后给出一个类别分布 Laplacian 平滑的算例.
- 需要查阅相关基础知识.
- 需要理解三维数组每个维度的涵义: The conditional probabilities for all classes over all attributes on all values. 注意到三维数组不是规则的, 例如, 不同属性的属性值个数可能不同.
- 这里使用同样的数据进行训练和测试. 如果要划分训练集和测试集, 可参考 kNN 代码.
- tempPseudoProbability 初始化为 0 就错了. 对于类平衡数据集没影响, 但不平衡的话效果就不对了. 在这个问题上输了 50 块钱, 害!
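这里补一个类别分布 Laplacian 平滑的小算例 (自拟数据), 对应代码中的 (tempCounts[i] + 1) / (numInstances + numClasses): 设 10 个对象、2 个类别, 类 0 有 7 个对象, 类 1 有 3 个, 则
$$P^L(c_0) = \frac{7 + 1}{10 + 2} = \frac{2}{3} \approx 0.667, \qquad P^L(c_1) = \frac{3 + 1}{10 + 2} = \frac{1}{3} \approx 0.333.$$
平滑后两个概率仍然归一, 且没有任何一个类别的概率为 0.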
package machinelearning.bayes;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.*;
/**
* The Naive Bayes algorithm.
*
* @author Fan Min minfanphd@163.com.
*/
public class NaiveBayes {
/**
*************************
* An inner class to store parameters.
*************************
*/
private class GaussianParamters {
double mu;
double sigma;
public GaussianParamters(double paraMu, double paraSigma) {
mu = paraMu;
sigma = paraSigma;
}// Of the constructor
public String toString() {
return "(" + mu + ", " + sigma + ")";
}// Of toString
}// Of GaussianParamters
/**
* The data.
*/
Instances dataset;
/**
* The number of classes. For binary classification it is 2.
*/
int numClasses;
/**
* The number of instances.
*/
int numInstances;
/**
* The number of conditional attributes.
*/
int numConditions;
/**
* The prediction, including queried and predicted labels.
*/
int[] predicts;
/**
* Class distribution.
*/
double[] classDistribution;
/**
* Class distribution with Laplacian smooth.
*/
double[] classDistributionLaplacian;
/**
* To calculate the conditional probabilities for all classes over all
* attributes on all values.
*/
double[][][] conditionalCounts;
/**
* The conditional probabilities with Laplacian smooth.
*/
double[][][] conditionalProbabilitiesLaplacian;
/**
* The Gaussian parameters.
*/
GaussianParamters[][] gaussianParameters;
/**
* Data type.
*/
int dataType;
/**
* Nominal.
*/
public static final int NOMINAL = 0;
/**
* Numerical.
*/
public static final int NUMERICAL = 1;
/**
********************
* The constructor.
*
* @param paraFilename
* The given file.
********************
*/
public NaiveBayes(String paraFilename) {
dataset = null;
try {
FileReader fileReader = new FileReader(paraFilename);
dataset = new Instances(fileReader);
fileReader.close();
} catch (Exception ee) {
("Cannot read the file: " + paraFilename + "\r\n" + ee);
(0);
} // Of try
dataset.setClassIndex(dataset.numAttributes() - 1);
numConditions = dataset.numAttributes() - 1;
numInstances = dataset.numInstances();
numClasses = dataset.attribute(numConditions).numValues();
}// Of the constructor
/**
********************
* The constructor.
*
* @param paraFilename
* The given file.
********************
*/
public NaiveBayes(Instances paraInstances) {
dataset = paraInstances;
dataset.setClassIndex(dataset.numAttributes() - 1);
numConditions = dataset.numAttributes() - 1;
numInstances = dataset.numInstances();
numClasses = dataset.attribute(numConditions).numValues();
}// Of the constructor
/**
********************
* Set the data type.
********************
*/
public void setDataType(int paraDataType) {
dataType = paraDataType;
}// Of setDataType
/**
********************
* Calculate the class distribution with Laplacian smooth.
********************
*/
public void calculateClassDistribution() {
classDistribution = new double[numClasses];
classDistributionLaplacian = new double[numClasses];
double[] tempCounts = new double[numClasses];
for (int i = 0; i < numInstances; i++) {
int tempClassValue = (int) dataset.instance(i).classValue();
tempCounts[tempClassValue]++;
} // Of for i
for (int i = 0; i < numClasses; i++) {
classDistribution[i] = tempCounts[i] / numInstances;
classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
} // Of for i
("Class distribution: " + (classDistribution));
("Class distribution Laplacian: " + (classDistributionLaplacian));
}// Of calculateClassDistribution
/**
********************
* Calculate the conditional probabilities with Laplacian smooth. ONLY scan
* the dataset once. There was a simpler version; I removed it because its
* time complexity was higher.
********************
*/
public void calculateConditionalProbabilities() {
conditionalCounts = new double[numClasses][numConditions][];
conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];
// Allocate space
for (int i = 0; i < numClasses; i++) {
for (int j = 0; j < numConditions; j++) {
int tempNumValues = dataset.attribute(j).numValues();
conditionalCounts[i][j] = new double[tempNumValues];
conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
} // Of for j
} // Of for i
// Count the numbers
int[] tempClassCounts = new int[numClasses];
for (int i = 0; i < numInstances; i++) {
int tempClass = (int) dataset.instance(i).classValue();
tempClassCounts[tempClass]++;
for (int j = 0; j < numConditions; j++) {
int tempValue = (int) dataset.instance(i).value(j);
conditionalCounts[tempClass][j][tempValue]++;
} // Of for j
} // Of for i
// Now for the real probability with Laplacian
for (int i = 0; i < numClasses; i++) {
for (int j = 0; j < numConditions; j++) {
int tempNumValues = dataset.attribute(j).numValues();
for (int k = 0; k < tempNumValues; k++) {
conditionalProbabilitiesLaplacian[i][j][k] = (conditionalCounts[i][j][k] + 1)
/ (tempClassCounts[i] + tempNumValues);
// I wrote a bug here. This is an alternative approach,
// however its performance is better in the mushroom dataset.
// conditionalProbabilitiesLaplacian[i][j][k] =
// (numInstances * conditionalCounts[i][j][k] + 1)
// / (numInstances * tempClassCounts[i] + tempNumValues);
} // Of for k
} // Of for j
} // Of for i
("Conditional probabilities: " + (conditionalCounts));
}// Of calculateConditionalProbabilities
/**
********************
* Calculate the Gaussian parameters (mu and sigma) for each class and numerical attribute.
********************
*/
public void calculateGausssianParameters() {
gaussianParameters = new GaussianParamters[numClasses][numConditions];
double[] tempValuesArray = new double[numInstances];
int tempNumValues = 0;
double tempSum = 0;
for (int i = 0; i < numClasses; i++) {
for (int j = 0; j < numConditions; j++) {
tempSum = 0;
// Obtain values for this class.
tempNumValues = 0;
for (int k = 0; k < numInstances; k++) {
if ((int) dataset.instance(k).classValue() != i) {
continue;
} // Of if
tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
tempSum += tempValuesArray[tempNumValues];
tempNumValues++;
} // Of for k
// Obtain parameters.
double tempMu = tempSum / tempNumValues;
double tempSigma = 0;
for (int k = 0; k < tempNumValues; k++) {
tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
} // Of for k
tempSigma /= tempNumValues;
tempSigma = Math.sqrt(tempSigma);
gaussianParameters[i][j] = new GaussianParamters(tempMu, tempSigma);
} // Of for j
} // Of for i
System.out.println(Arrays.deepToString(gaussianParameters));
}// Of calculateGausssianParameters
/**
********************
* Classify all instances, the results are stored in predicts[].
********************
*/
public void classify() {
predicts = new int[numInstances];
for (int i = 0; i < numInstances; i++) {
predicts[i] = classify(dataset.instance(i));
} // Of for i
}// Of classify
/**
********************
* Classify an instance.
********************
*/
public int classify(Instance paraInstance) {
if (dataType == NOMINAL) {
return classifyNominal(paraInstance);
} else if (dataType == NUMERICAL) {
return classifyNumerical(paraInstance);
} // Of if
return -1;
}// Of classify
/**
********************
* Classify an instance with nominal data.
********************
*/
public int classifyNominal(Instance paraInstance) {
// Find the biggest one
double tempBiggest = -10000;
int resultBestIndex = 0;
for (int i = 0; i < numClasses; i++) {
double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
for (int j = 0; j < numConditions; j++) {
int tempAttributeValue = (int) paraInstance.value(j);
tempPseudoProbability += Math.log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
} // Of for j
if (tempBiggest < tempPseudoProbability) {
tempBiggest = tempPseudoProbability;
resultBestIndex = i;
} // Of if
} // Of for i
return resultBestIndex;
}// Of classifyNominal
/**
********************
* Classify an instance with numerical data.
********************
*/
public int classifyNumerical(Instance paraInstance) {
// Find the biggest one
double tempBiggest = -10000;
int resultBestIndex = 0;
for (int i = 0; i < numClasses; i++) {
double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
for (int j = 0; j < numConditions; j++) {
double tempAttributeValue = paraInstance.value(j);
double tempSigma = gaussianParameters[i][j].sigma;
double tempMu = gaussianParameters[i][j].mu;
tempPseudoProbability += -Math.log(tempSigma)
- (tempAttributeValue - tempMu) * (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
} // Of for j
if (tempBiggest < tempPseudoProbability) {
tempBiggest = tempPseudoProbability;
resultBestIndex = i;
} // Of if
} // Of for i
return resultBestIndex;
}// Of classifyNumerical
/**
********************
* Compute accuracy.
********************
*/
public double computeAccuracy() {
double tempCorrect = 0;
for (int i = 0; i < numInstances; i++) {
if (predicts[i] == (int) dataset.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
double resultAccuracy = tempCorrect / numInstances;
return resultAccuracy;
}// Of computeAccuracy
/**
*************************
* Test nominal data.
*************************
*/
public static void testNominal() {
("Hello, Naive Bayes. I only want to test the nominal data.");
String tempFilename = "D:/data/";
NaiveBayes tempLearner = new NaiveBayes(tempFilename);
tempLearner.setDataType(NOMINAL);
tempLearner.calculateClassDistribution();
tempLearner.calculateConditionalProbabilities();
tempLearner.classify();
System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
}// Of testNominal
/**
*************************
* Test numerical data.
*************************
*/
public static void testNumerical() {
("Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
// String tempFilename = "D:/data/";
String tempFilename = "D:/data/";
NaiveBayes tempLearner = new NaiveBayes(tempFilename);
tempLearner.setDataType(NUMERICAL);
tempLearner.calculateClassDistribution();
tempLearner.calculateGausssianParameters();
tempLearner.classify();
System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
}// Of testNumerical
/**
*************************
* Test this class.
*
* @param args
* Not used now.
*************************
*/
public static void main(String[] args) {
testNominal();
testNumerical();
// testNominal(0.8);
}// Of main
/**
*********************
* Get random indices for data randomization.
*
* @param paraLength
* The length of the sequence.
* @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
*********************
*/
public static int[] getRandomIndices(int paraLength) {
Random random = new Random();
int[] resultIndices = new int[paraLength];
// Step 1. Initialize.
for (int i = 0; i < paraLength; i++) {
resultIndices[i] = i;
} // Of for i
// Step 2. Randomly swap.
int tempFirst, tempSecond, tempValue;
for (int i = 0; i < paraLength; i++) {
// Generate two random indices.
tempFirst = random.nextInt(paraLength);
tempSecond = random.nextInt(paraLength);
// Swap.
tempValue = resultIndices[tempFirst];
resultIndices[tempFirst] = resultIndices[tempSecond];
resultIndices[tempSecond] = tempValue;
} // Of for i
return resultIndices;
}// Of getRandomIndices
/**
*********************
* Split the data into training and testing parts.
*
* @param paraTrainingFraction
* The fraction of the training set.
*********************
*/
public static Instances[] splitTrainingTesting(Instances paraDataset, double paraTrainingFraction) {
int tempSize = paraDataset.numInstances();
int[] tempIndices = getRandomIndices(tempSize);
int tempTrainingSize = (int) (tempSize * paraTrainingFraction);
// Empty datasets.
Instances tempTrainingSet = new Instances(paraDataset);
tempTrainingSet.delete();
Instances tempTestingSet = new Instances(tempTrainingSet);
for (int i = 0; i < tempTrainingSize; i++) {
tempTrainingSet.add(paraDataset.instance(tempIndices[i]));
} // Of for i
for (int i = 0; i < tempSize - tempTrainingSize; i++) {
tempTestingSet.add(paraDataset.instance(tempIndices[tempTrainingSize + i]));
} // Of for i
tempTrainingSet.setClassIndex(tempTrainingSet.numAttributes() - 1);
tempTestingSet.setClassIndex(tempTestingSet.numAttributes() - 1);
Instances[] resultInstancesArray = new Instances[2];
resultInstancesArray[0] = tempTrainingSet;
resultInstancesArray[1] = tempTestingSet;
return resultInstancesArray;
}// Of splitTrainingTesting
/**
********************
* Classify all instances in the given testing set, and return the accuracy.
********************
*/
public double classify(Instances paraTestingSet) {
double tempCorrect = 0;
int[] tempPredicts = new int[paraTestingSet.numInstances()];
for (int i = 0; i < tempPredicts.length; i++) {
tempPredicts[i] = classify(paraTestingSet.instance(i));
if (tempPredicts[i] == (int) paraTestingSet.instance(i).classValue()) {
tempCorrect++;
} // Of if
} // Of for i
("" + tempCorrect + " correct over " + + " instances.");
double resultAccuracy = tempCorrect / ;
return resultAccuracy;
}// Of classify
/**
*************************
* Test nominal data.
*************************
*/
public static void testNominal(double paraTrainingFraction) {
("Hello, Naive Bayes. I only want to test the nominal data.");
String tempFilename = "D:/data/";
// String tempFilename = "D:/data/";
Instances tempDataset = null;
try {
FileReader fileReader = new FileReader(tempFilename);
tempDataset = new Instances(fileReader);
fileReader.close();
} catch (Exception ee) {
("Cannot read the file: " + tempFilename + "\r\n" + ee);
(0);
} // Of try
Instances[] tempDatasets = splitTrainingTesting(tempDataset, paraTrainingFraction);
NaiveBayes tempLearner = new NaiveBayes(tempDatasets[0]);
tempLearner.setDataType(NOMINAL);
tempLearner.calculateClassDistribution();
tempLearner.calculateConditionalProbabilities();
double tempAccuracy = tempLearner.classify(tempDatasets[1]);
System.out.println("The accuracy is: " + tempAccuracy);
}// Of testNominal
}// Of class NaiveBayes
第 59 天: 数值型数据的 NB 算法
- 今天把数值型数据处理的代码加上去.
- 假设所有属性的属性值都服从高斯分布. 也可以做其它假设.
- 将概率密度当成概率值, 直接使用 Bayes 公式. 推导见本列表之后.
- 可以看到, 数值型数据的处理并不会比符号型的复杂.
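补充 classifyNumerical() 中表达式的来历. 假设类 $c$ 下属性值 $x$ 服从 $N(\mu, \sigma^2)$, 概率密度为
$$p(x \mid c) = \frac{1}{\sqrt{2\pi}\,\sigma} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right).$$
取对数并略去与类无关的常数 $-\log\sqrt{2\pi}$, 得
$$\log p(x \mid c) = -\log \sigma - \frac{(x - \mu)^2}{2\sigma^2} + \text{常数},$$
即代码中的 -Math.log(tempSigma) - (tempAttributeValue - tempMu) * (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma).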
第 60 天: 小结
描述这 10 天的学习体会, 不少于 10 条.