挑选出最佳的部分特征
'bachelor', 'gender', 'age', 'salary'这4个特征,从观察上来看,'bachelor'是没有什么变异性的
X=df[['bachelor','gender', 'age', 'salary']] from sklearn.feature_selection import VarianceThreshold vt=VarianceThreshold() X_val=vt.fit_transform(X) X_val转换后的X就是已经移除了’bachelor’
vt.get_support() >>>array([False, True, True, True]) X.columns[vt.get_support()] >>>Index(['gender', 'age', 'salary'], dtype='object')考虑到目标数组y的实际值,来挑选最合适的特征
from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 X=df[['bachelor','gender', 'age', 'salary']] y=df['purchased'].values sb=SelectKBest(chi2,k=2)#回归:f_regression 分类:chi2 、f_classif sb.fit(X,y) print(sb.scores_) >>>[ 0. 0. 4.48447205 2766.66666667] X.columns[sb.get_support()] >>>Index(['age', 'salary'], dtype='object') X_new=sb.fit_transform(X,y) print(X_new)所有的特征由多到少排列组合,依次剔除特征,逐步选择最优的特征
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# Recursive feature elimination driven by a linear SVM, run on the
# variance-filtered matrix X_val and narrowed down to a single feature.
svc = SVC(kernel='linear')
rfe = RFE(svc, n_features_to_select=1)
rfe.fit(X_val, y)

# Feature importance ranking.
# X_val only holds the columns kept by the variance filter, so the names
# come from X.columns[vt.get_support()]. Columns and ranks are paired
# positionally instead of using a rank value as an index, which stays
# correct when several features share rank 1 (n_features_to_select > 1).
kept_columns = X.columns[vt.get_support()]
for column, rank in zip(kept_columns, rfe.ranking_):
    print(column, rank)
# Next section: refer to this case (6) (参考该案例(6))
from sklearn.ensemble import RandomForestClassifier rfc=RandomForestClassifier(n_estimators=10,random_state=123) rfc.fit(X_val,y) for feature in zip(X.columns[vt.get_support()],rfc.feature_importances_): print(feature) %matplotlib inline import matplotlib.pyplot as plt plt.bar(range(0,3),rfc.feature_importances_) plt.xticks(range(0,3),X.columns[vt.get_support()]) plt.title('Feature Importance')将数据群由高维度的空间投影到低维度的空间
主成分保留了原始变量的大部分信息,个数减少,互不相关,每个主成分都是原始变量的线性组合 准备数据
from sklearn.datasets import load_iris iris = load_iris() X = iris.data y = iris.target X.shape >>>(150, 4)主成分分析
from sklearn.decomposition import PCA pca=PCA(n_components=2)#4个特征压缩到2个 pca.fit(X)原数据转化成二维的特征矩阵
X_reduced=pca.transform(X) X_reduced.shape >>> (150, 2)建立决策边界
from itertools import product
import numpy as np
import matplotlib.pyplot as plt


def plot_estimator(estimator, X, y):
    """Draw the decision regions of a fitted estimator over 2-D data.

    Args:
        estimator: Object exposing ``predict``; it is called on a two-column
            array of grid points, so it must already be fit on two features.
        X: 2-D array; column 0 is drawn on the x axis and column 1 on the
            y axis.
        y: Per-sample values used to colour the scatter points.
    """
    # Pad the data range by one unit on every side so that no sample sits
    # on the border of the plot.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    # Evaluate the estimator on a regular grid with a spacing of 0.1.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Filled contours show the predicted regions; the samples go on top.
    plt.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.RdYlBu)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.brg)
    plt.xlabel('Component1')
    plt.ylabel('Component2')
    plt.show()


# The only earlier use of svc is as the estimator handed to RFE on the
# feature-selection data, so it has to be fit on the two PCA components
# before plot_estimator calls predict on two-column grid points.
svc.fit(X_reduced, y)
plot_estimator(svc, X_reduced, y)
# Next section: correlation coefficients (相关系数)
pca.components_#相关系数 >>>array([[ 0.36158968, -0.08226889, 0.85657211, 0.35884393], [ 0.65653988, 0.72971237, -0.1757674 , -0.07470647]]) #主成分组成 for component in pca.components_: print('+'.join('%.3f * %s'%(value,name) for value,name in zip(component,iris.feature_names))) >>> 0.362 * sepal length (cm)+-0.082 * sepal width (cm)+0.857 * petal length (cm)+0.359 * petal width (cm) 0.657 * sepal length (cm)+0.730 * sepal width (cm)+-0.176 * petal length (cm)+-0.075 * petal width (cm)解释度
pca.explained_variance_#两个主成分的解释度 >>>array([4.22484077, 0.24224357]) plt.bar(range(0,2),pca.explained_variance_) plt.xticks(range(0,2),['component1','component2']) pca.explained_variance_ratio_#解释度占比 >>>array([0.92461621, 0.05301557])对特定数据集拆解,保留重要信息组成新数据集合 准备数据
from sklearn.datasets import load_iris iris = load_iris() X = iris.data y = iris.target X.shape >>>(150, 4)1.SVD矩阵分解和还原 分解:X被分解成U S V
from scipy.linalg import svd U,S,V=svd(X,full_matrices=False) U.shape,S.shape,V.shape >>>((150, 4), (4,), (4, 4))还原:U S V还原成A_new=X
A_new=U.dot(np.diag(S)).dot(V)2.SVD降维
from sklearn.decomposition import TruncatedSVD svd=TruncatedSVD(2) X_new=svd.fit_transform(X)建立决策边界 相关系数 解释度 与PCA类似。PCA作用于协方差矩阵,SVD用于一般矩阵。
1.查看原图
import numpy as np from PIL import Image img=Image.open('E:/Jupyter workspace/python_for_data_science/Data/iceice.png') img2.图像数据转换成矩阵
imgary=np.array(img) imgary=imgary/255#标准化0-1 imgary.shape#(4列分别代表:r红 g绿 b蓝 a透明度) >>>(400, 640, 4)#垂直高度像素,水平宽度像素,rgba)3.拆分rgb
img_red=imgary[:,:,0] img_green=imgary[:,:,1] img_blue=imgary[:,:,2]4.SVD分解
from numpy.linalg import svd

# Decompose each colour channel separately. The reduced form
# (full_matrices=False) is sufficient because the following step keeps only
# the leading k columns of U, entries of S and rows of V; it avoids
# building the full square V factor that would be sliced away anyway.
U_r, S_r, V_r = svd(img_red, full_matrices=False)
U_g, S_g, V_g = svd(img_green, full_matrices=False)
U_b, S_b, V_b = svd(img_blue, full_matrices=False)
# Next step: 5. keep only part of the components (5.选取部分特征值)
k=50 U_r_k=U_r[:,0:k] S_r_k=S_r[0:k] V_r_k=V_r[:k,:] U_g_k=U_g[:,0:k] S_g_k=S_g[0:k] V_g_k=V_g[:k,:] U_b_k=U_b[:,0:k] S_b_k=S_b[0:k] V_b_k=V_b[:k,:]6.还原矩阵
image_red_approx=U_r_k.dot(np.diag(S_r_k)).dot(V_r_k) image_green_approx=U_g_k.dot(np.diag(S_g_k)).dot(V_g_k) image_blue_approx=U_b_k.dot(np.diag(S_b_k)).dot(V_b_k)7.重构新图片矩阵
img_reconstructed=np.stack((image_red_approx,image_green_approx,image_blue_approx),axis=2) img_reconstructed.shape >>>(400, 640, 3)8.处理异常值
# Clamp the reconstructed values into the [0, 1] range, in place: anything
# above 1 becomes 1 and anything below 0 becomes 0.
np.clip(img_reconstructed, 0, 1, out=img_reconstructed)
# Next step: 9. draw the image (9.绘制图片)
%matplotlib inline import matplotlib.pyplot as plt plt.imshow(img_reconstructed