Feature Engineering Notes 01: Single-Machine Feature Engineering with sklearn
0. Loading the data
```python
from sklearn.datasets import load_iris

# Load the iris dataset
iris = load_iris()
# Feature matrix
print(iris.data[:5], len(iris.data))
# Target vector
print(iris.target[:5], len(iris.target))
```

```
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]] 150
[0 0 0 0 0] 150
```
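For orientation, the `Bunch` object returned by `load_iris` also records what the four columns and three classes are; a quick look, using only attributes documented for `load_iris`:

```python
# Inspect what the four feature columns and three target classes actually are
print(iris.feature_names)  # sepal/petal length and width, in cm
print(iris.target_names)   # setosa, versicolor, virginica
```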
1. Data preprocessing
1.1 Scaling to dimensionless form
1.1.1 Standardization
```python
from sklearn.preprocessing import StandardScaler

# Standardization; returns the standardized values
iris_standar = StandardScaler().fit_transform(iris.data)
print(iris_standar[:5])
```

```
[[-0.90068117  1.03205722 -1.3412724  -1.31297673]
 [-1.14301691 -0.1249576  -1.3412724  -1.31297673]
 [-1.38535265  0.33784833 -1.39813811 -1.31297673]
 [-1.50652052  0.10644536 -1.2844067  -1.31297673]
 [-1.02184904  1.26346019 -1.3412724  -1.31297673]]
```
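Standardization maps each column to x' = (x − mean) / std. A minimal sanity check with plain numpy (StandardScaler uses the population standard deviation, ddof=0):

```python
import numpy as np

# Reproduce StandardScaler by hand: subtract the column mean,
# divide by the column standard deviation (ddof=0)
manual = (iris.data - iris.data.mean(axis=0)) / iris.data.std(axis=0)
print(np.allclose(manual, iris_standar))  # expected: True
```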
1.1.2 Interval (min-max) scaling
```python
from sklearn.preprocessing import MinMaxScaler

# Interval scaling; returns values scaled into [0, 1]
iris_minmax = MinMaxScaler().fit_transform(iris.data)
print(iris_minmax[:5])
```

```
[[ 0.22222222  0.625       0.06779661  0.04166667]
 [ 0.16666667  0.41666667  0.06779661  0.04166667]
 [ 0.11111111  0.5         0.05084746  0.04166667]
 [ 0.08333333  0.45833333  0.08474576  0.04166667]
 [ 0.19444444  0.66666667  0.06779661  0.04166667]]
```
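The scaling rule is x' = (x − min) / (max − min) per column; a short numpy check under the default `feature_range=(0, 1)`:

```python
import numpy as np

# Reproduce MinMaxScaler by hand using the per-column minimum and maximum
col_min = iris.data.min(axis=0)
col_max = iris.data.max(axis=0)
print(np.allclose((iris.data - col_min) / (col_max - col_min), iris_minmax))  # expected: True
```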
1.2 Binarizing quantitative features
```python
from sklearn.preprocessing import Binarizer

# Binarization with the cutoff set to 3; returns the binarized features
iris_binarizer = Binarizer(threshold=3).fit_transform(iris.data)
print(iris_binarizer[:5])
```

```
[[ 1.  1.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  1.  0.  0.]
 [ 1.  1.  0.  0.]
 [ 1.  1.  0.  0.]]
```
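`Binarizer` applies one global cutoff to every column (values strictly greater than the threshold become 1, the rest 0), so it is equivalent to a plain boolean comparison:

```python
import numpy as np

# Binarizer(threshold=3) is just a boolean comparison cast to float
print(np.array_equal((iris.data > 3).astype(float), iris_binarizer))  # expected: True
```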
1.3 Dummy (one-hot) encoding of qualitative features
```python
from sklearn.preprocessing import OneHotEncoder

# One-hot (dummy) encode the iris target vector; returns the encoded values
iris_onehotencoder = OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))
print(iris.target[-5:])
print(iris.target.reshape((-1, 1))[-5:])
print(iris_onehotencoder[-5:])
```

```
[2 2 2 2 2]
[[2]
 [2]
 [2]
 [2]
 [2]]
  (0, 2)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 2)	1.0
  (4, 2)	1.0
```
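The `(row, col) value` lines above are the print form of a scipy sparse matrix, which is what `OneHotEncoder` returns by default. To see the familiar dense dummy columns, call `toarray()`:

```python
# Densify the sparse encoding: one 0/1 column per class
print(iris_onehotencoder[-5:].toarray())
# expected: five rows of [0. 0. 1.], since the last five samples are class 2
```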
1.4 Missing-value imputation
```python
from numpy import vstack, array, nan
# Imputer was removed from sklearn.preprocessing in newer releases;
# SimpleImputer in sklearn.impute is the current equivalent
from sklearn.impute import SimpleImputer

# Impute missing values; returns the data with missing entries filled in
# missing_values sets how missing entries are represented (default NaN)
# strategy sets the fill rule (default 'mean')
iris_imputer = SimpleImputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))
print(iris_imputer[:5], len(iris_imputer))
```

```
[[ 5.84333333  3.054       3.75866667  1.19866667]
 [ 5.1         3.5         1.4         0.2       ]
 [ 4.9         3.          1.4         0.2       ]
 [ 4.7         3.2         1.3         0.2       ]
 [ 4.6         3.1         1.5         0.2       ]] 151
```
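The all-NaN first row is filled with the per-column means of the 150 real rows. A short sketch of a more outlier-robust built-in strategy ('median', one of SimpleImputer's documented options alongside 'most_frequent' and 'constant'):

```python
from numpy import vstack, array, nan
from sklearn.impute import SimpleImputer

# Same imputation, but filling with the column median instead of the mean
imp = SimpleImputer(strategy="median")
print(imp.fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))[0])
```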
1.5 Data transformation
```python
from sklearn.preprocessing import PolynomialFeatures

# Polynomial transformation
# degree is the polynomial degree, default 2
iris_pol = PolynomialFeatures().fit_transform(iris.data)
print(iris_pol[:5])
```

```
[[  1.     5.1    3.5    1.4    0.2   26.01  17.85   7.14   1.02  12.25
    4.9    0.7    1.96   0.28   0.04]
 [  1.     4.9    3.     1.4    0.2   24.01  14.7    6.86   0.98   9.
    4.2    0.6    1.96   0.28   0.04]
 [  1.     4.7    3.2    1.3    0.2   22.09  15.04   6.11   0.94  10.24
    4.16   0.64   1.69   0.26   0.04]
 [  1.     4.6    3.1    1.5    0.2   21.16  14.26   6.9    0.92   9.61
    4.65   0.62   2.25   0.3    0.04]
 [  1.     5.     3.6    1.4    0.2   25.    18.     7.     1.    12.96
    5.04   0.72   1.96   0.28   0.04]]
```
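With 4 inputs and degree 2 the output has 15 columns: 1 bias term, 4 linear terms, and 10 quadratic terms (4 squares plus 6 pairwise products). A sketch to make the column layout explicit (note the API is version-dependent: newer sklearn exposes `get_feature_names_out`, older releases used `get_feature_names`):

```python
from sklearn.preprocessing import PolynomialFeatures

# Map each of the 15 output columns back to a readable formula
poly = PolynomialFeatures().fit(iris.data)
print(poly.get_feature_names_out(["sl", "sw", "pl", "pw"]))
```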
```python
from numpy import log1p
from sklearn.preprocessing import FunctionTransformer

# Data transformation with a custom function, here the logarithm
# The first argument is a univariate function
iris_ftf = FunctionTransformer(log1p).fit_transform(iris.data)
print(iris_ftf[:5])
```

```
[[ 1.80828877  1.5040774   0.87546874  0.18232156]
 [ 1.77495235  1.38629436  0.87546874  0.18232156]
 [ 1.74046617  1.43508453  0.83290912  0.18232156]
 [ 1.7227666   1.41098697  0.91629073  0.18232156]
 [ 1.79175947  1.5260563   0.87546874  0.18232156]]
```
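`FunctionTransformer` also accepts an inverse, which is handy when the transform has to be undone later; a minimal sketch pairing `log1p` with its inverse `expm1`:

```python
from numpy import log1p, expm1, allclose
from sklearn.preprocessing import FunctionTransformer

# Round-trip through the transform and its inverse recovers the input
ft = FunctionTransformer(func=log1p, inverse_func=expm1)
print(allclose(ft.inverse_transform(ft.fit_transform(iris.data)), iris.data))  # expected: True
```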
2. Feature selection
2.1 Filter
2.1.1 Variance threshold method
```python
from sklearn.feature_selection import VarianceThreshold

# Variance threshold selection; returns the data restricted to the selected features
# threshold is the variance cutoff
iris_vt = VarianceThreshold(threshold=3).fit_transform(iris.data)
print(iris_vt[:5], len(iris_vt))
```

```
[[ 1.4]
 [ 1.4]
 [ 1.3]
 [ 1.5]
 [ 1.4]] 150
```
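Only one of the four columns survives the cutoff. To see why, look at the per-column variances directly (VarianceThreshold uses the population variance, ddof=0):

```python
import numpy as np

# Print the variance of each of the four features;
# only petal length (the third column) exceeds 3, so it alone is kept
print(np.var(iris.data, axis=0))
```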
2.1.2 Correlation coefficient method (adapted here following the second reference post)
```python
from numpy import array
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr

# Select the k best features and return the data restricted to them.
# The first argument scores the features: it takes the feature matrix and the
# target vector and returns (scores, p-values), one pair per feature; here the
# score is the Pearson correlation coefficient.
# The lambda/map one-liner from the original post breaks under Python 3,
# so a small helper function is used instead.
def pearsonr_score(X, y):
    scores, pvalues = zip(*[pearsonr(x, y) for x in X.T])
    return array(scores), array(pvalues)

# k is the number of features to keep
iris_pear = SelectKBest(pearsonr_score, k=2).fit_transform(iris.data, iris.target)
print(iris_pear[:5], len(iris_pear))
```

```
[[ 1.4  0.2]
 [ 1.4  0.2]
 [ 1.3  0.2]
 [ 1.5  0.2]
 [ 1.4  0.2]] 150
```
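The two petal measurements are kept. Printing the raw scores makes the ranking transparent (sepal width even correlates negatively with the class index):

```python
# Per-feature Pearson correlations with the target, in column order
scores, pvalues = pearsonr_score(iris.data, iris.target)
print(scores)
```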
2.1.3 Chi-squared test
```python
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Select the k best features; returns the data restricted to them
iris_chi2 = SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)
print(iris_chi2[:5], len(iris_chi2))
```

```
[[ 1.4  0.2]
 [ 1.4  0.2]
 [ 1.3  0.2]
 [ 1.5  0.2]
 [ 1.4  0.2]] 150
```
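`chi2` can also be called directly to inspect the statistics that drive the selection:

```python
# chi2 returns (test statistics, p-values), one per feature;
# the two petal columns score far above the sepal columns
scores, pvalues = chi2(iris.data, iris.target)
print(scores)
print(pvalues)
```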
2.1.4 Mutual information method
```python
from numpy import array
from sklearn.feature_selection import SelectKBest
from minepy import MINE

# MINE's interface is not functional-style, so wrap it in a mic() helper
# that returns a (score, p-value) tuple, with the p-value fixed at 0.5
def mic(x, y):
    m = MINE()
    m.compute_score(x, y)
    return (m.mic(), 0.5)

# Score every column with mic(); SelectKBest expects (scores, p-values)
# (the lambda/map one-liner from the original post is not Python 3 compatible)
def mic_score(X, y):
    scores, pvalues = zip(*[mic(x, y) for x in X.T])
    return array(scores), array(pvalues)

# Select the k best features; returns the data restricted to them
SelectKBest(mic_score, k=2).fit_transform(iris.data, iris.target)
```

On this machine the cell still fails because minepy is not installed (`pip install minepy` provides it):

```
ImportError: No module named 'minepy'
```
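If minepy is unavailable, scikit-learn ships its own mutual information scorer; a sketch using `mutual_info_classif` (in `sklearn.feature_selection` since 0.18), which estimates plain mutual information rather than MIC:

```python
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Mutual-information feature selection with no third-party dependency
iris_mi = SelectKBest(mutual_info_classif, k=2).fit_transform(iris.data, iris.target)
print(iris_mi[:5])
```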
2.2 Wrapper
2.2.1 Recursive feature elimination
```python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination; returns the data restricted to the selected features
# estimator is the base model
# n_features_to_select is the number of features to keep
iris_pfe = RFE(estimator=LogisticRegression(), n_features_to_select=2).fit_transform(iris.data, iris.target)
print(iris_pfe[:5])
```

```
[[ 3.5  0.2]
 [ 3.   0.2]
 [ 3.2  0.2]
 [ 3.1  0.2]
 [ 3.6  0.2]]
```
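The fitted selector records which columns survived and in what order the rest were eliminated; a short sketch using RFE's documented `support_` and `ranking_` attributes:

```python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# support_ is a boolean mask over the original columns;
# ranking_ assigns 1 to kept features and higher numbers to earlier eliminations
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=2).fit(iris.data, iris.target)
print(rfe.support_)
print(rfe.ranking_)
```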
2.3 Embedded
2.3.1 Penalty-based feature selection
```python
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Feature selection with L1-penalized logistic regression as the base model
# (solver="liblinear" is stated explicitly: newer sklearn defaults to a solver
# that does not support the L1 penalty)
iris_sfm = SelectFromModel(
    LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
).fit_transform(iris.data, iris.target)
print(iris_sfm[:5])
```

```
[[ 5.1  3.5  1.4]
 [ 4.9  3.   1.4]
 [ 4.7  3.2  1.3]
 [ 4.6  3.1  1.5]
 [ 5.   3.6  1.4]]
```
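To see which columns the selector kept rather than just the transformed data, fit it separately and ask for the mask via `SelectFromModel`'s documented `get_support()`:

```python
# Boolean mask over the four original features; True means "kept"
sfm = SelectFromModel(LogisticRegression(penalty="l1", C=0.1, solver="liblinear"))
sfm.fit(iris.data, iris.target)
print(sfm.get_support())
```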
The idea behind the combined penalty: out of a group of equally relevant, highly correlated features, L1 keeps only one and zeroes the rest, so the L2 weights (which stay close for such a group) are used to find the zeroed-out teammates and share the L1 weight back across the group.

```python
from sklearn.linear_model import LogisticRegression

class LR(LogisticRegression):
    def __init__(self, threshold=0.01, dual=False, tol=1e-4, C=1.0,
                 fit_intercept=True, intercept_scaling=1, class_weight=None,
                 random_state=None, solver='liblinear', max_iter=100,
                 multi_class='ovr', verbose=0, warm_start=False, n_jobs=1):
        # Threshold under which two weights count as "close"
        self.threshold = threshold
        LogisticRegression.__init__(self, penalty='l1', dual=dual, tol=tol, C=C,
                                    fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
                                    class_weight=class_weight, random_state=random_state,
                                    solver=solver, max_iter=max_iter, multi_class=multi_class,
                                    verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)
        # Create an L2 logistic regression with the same parameters
        self.l2 = LogisticRegression(penalty='l2', dual=dual, tol=tol, C=C,
                                     fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
                                     class_weight=class_weight, random_state=random_state,
                                     solver=solver, max_iter=max_iter, multi_class=multi_class,
                                     verbose=verbose, warm_start=warm_start, n_jobs=n_jobs)

    def fit(self, X, y, sample_weight=None):
        # Fit the L1 logistic regression
        super(LR, self).fit(X, y, sample_weight=sample_weight)
        self.coef_old_ = self.coef_.copy()
        # Fit the L2 logistic regression
        self.l2.fit(X, y, sample_weight=sample_weight)

        cntOfRow, cntOfCol = self.coef_.shape
        # Each row of the coefficient matrix corresponds to one target class
        for i in range(cntOfRow):
            for j in range(cntOfCol):
                coef = self.coef_[i][j]
                # The L1 weight of feature j is nonzero
                if coef != 0:
                    idx = [j]
                    # The corresponding weight in the L2 model
                    coef1 = self.l2.coef_[i][j]
                    for k in range(cntOfCol):
                        coef2 = self.l2.coef_[i][k]
                        # The L2 weights differ by less than the threshold,
                        # while the L1 weight of feature k is zero
                        if abs(coef1 - coef2) < self.threshold and j != k and self.coef_[i][k] == 0:
                            idx.append(k)
                    # Spread the weight evenly over this group of features
                    mean = coef / len(idx)
                    self.coef_[i][idx] = mean
        return self
```

```python
from sklearn.feature_selection import SelectFromModel

# Feature selection with logistic regression carrying both L1 and L2 penalties
# as the base model; threshold is the cutoff on the weight difference
iris_sfm2 = SelectFromModel(LR(threshold=0.5, C=0.1)).fit_transform(iris.data, iris.target)
print(iris_sfm2[:5])
```

```
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]
```
2.3.2 Tree-model-based feature selection
```python
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier

# Feature selection with GBDT as the base model
iris_sfm3 = SelectFromModel(GradientBoostingClassifier()).fit_transform(iris.data, iris.target)
print(iris_sfm3[:5])
```

```
[[ 1.4  0.2]
 [ 1.4  0.2]
 [ 1.3  0.2]
 [ 1.5  0.2]
 [ 1.4  0.2]]
```
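Here `SelectFromModel` keys off the fitted model's `feature_importances_`; printing them shows why the two petal columns win:

```python
from sklearn.ensemble import GradientBoostingClassifier

# Per-feature importances learned by the GBDT, in column order
gbdt = GradientBoostingClassifier().fit(iris.data, iris.target)
print(gbdt.feature_importances_)
```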
3. Dimensionality reduction
3.1 Principal component analysis (PCA)
```python
from sklearn.decomposition import PCA

# Principal component analysis; returns the reduced data
# n_components is the number of principal components to keep
iris_pca = PCA(n_components=2).fit_transform(iris.data)
print(iris_pca[:5])
```

```
[[-2.68420713  0.32660731]
 [-2.71539062 -0.16955685]
 [-2.88981954 -0.13734561]
 [-2.7464372  -0.31112432]
 [-2.72859298  0.33392456]]
```
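A useful follow-up is to check how much information the two kept components retain, via PCA's documented `explained_variance_ratio_`:

```python
from sklearn.decomposition import PCA

# Fraction of total variance carried by each kept component;
# for iris the first component dominates
pca = PCA(n_components=2).fit(iris.data)
print(pca.explained_variance_ratio_)
```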
3.2 Linear discriminant analysis (LDA)
The `sklearn.lda` module used in older tutorials no longer exists (`from sklearn.lda import LDA` raises `ImportError: No module named 'sklearn.lda'`); LDA now lives in `sklearn.discriminant_analysis`:

```python
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis; returns the reduced data
# n_components is the number of dimensions to keep after reduction
iris_lda = LDA(n_components=2).fit_transform(iris.data, iris.target)
print(iris_lda[:5])
```
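Unlike PCA, LDA is supervised: it uses the class labels to pick directions that separate the three species rather than directions of maximal variance. A small follow-up sketch (`explained_variance_ratio_` is documented on `LinearDiscriminantAnalysis` as well):

```python
# Fraction of between-class variance carried by each discriminant direction
lda = LDA(n_components=2).fit(iris.data, iris.target)
print(lda.explained_variance_ratio_)
```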
References