随机森林RF特征重要性排序及打分

it2024-03-29 54

源代码：

import numpy as np  # storage and arithmetic for large arrays/matrices
import scipy  # scientific routines (interpolation, integration, optimisation, ...) built on NumPy
import pandas as pd  # structured-data analysis and cleaning, built on NumPy
from sklearn.ensemble import RandomForestClassifier  # random forest classifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV  # train_test_split splits data into training and validation sets
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt

# Load the data set. A raw string is used so the backslashes of the Windows
# path are taken literally (the previous literal depended on invalid escape
# sequences such as "\P" and "\R" being passed through unchanged).
# encoding='gbk' is needed because the column names contain Chinese text.
df = pd.read_csv(r'E:\Pycharm\PycharmLearning\RF\RFtestyu1012.csv', encoding='gbk')

# head() returns the first five rows by default; pass a number to get more or fewer.
print(df.head())
# Number of distinct values in the "label" column
# (df.label.value_counts() would instead show the row count per value).
print(df.label.nunique())

# Cast the labels to strings so they are handled as class names, then use the
# column as the target vector.
df.label = df.label.astype(str)
y = df.label
print(y.head())

# Every column except "label" is a feature; axis=1 drops a column, not a row.
x = df.drop('label', axis=1)
print(x.head())

# One seed shared by the split and the forest so that repeated runs print the
# same accuracy and the same feature ranking.
seed = 5
# 70 % of the samples for training, 30 % for validation.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=seed)

# Fit the forest on the training split. random_state is passed explicitly:
# without it every run grows different trees and the importances change.
rfc = RandomForestClassifier(random_state=seed)
rfc = rfc.fit(xtrain, ytrain)
result = rfc.score(xtest, ytest)  # score() reports accuracy on the validation split
print(result)

print('所有的树:%s' % rfc.estimators_)
print(rfc.classes_)
print(rfc.n_classes_)
print('判定结果:%s' % rfc.predict(xtest))
print('判定结果:%s' % rfc.predict_proba(xtest))
print('各feature的重要性：%s' % rfc.feature_importances_)  # larger value = more important feature

importances = rfc.feature_importances_
# Spread of each feature's importance across the individual trees (used as error bars).
std = np.std([tree.feature_importances_ for tree in rfc.estimators_], axis=0)
# Feature positions sorted from most to least important.
indices = np.argsort(importances)[::-1]

# Print the feature ranking (at most the top 20 features).
print("Feature ranking:")
top_n = min(20, xtrain.shape[1])
for rank, idx in enumerate(indices[:top_n], start=1):
    print("%2d) %-*s %f" % (rank, 30, xtrain.columns[idx], importances[idx]))

# Plot the feature importances of the forest, most important first.
# The x tick labels are the original column positions of the features.
n_features = xtrain.shape[1]
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), indices)
plt.xlim([-1, n_features])
plt.show()

# Cross-validation.
# cross_val_score(estimator, X, y, scoring=None, cv=None, n_jobs=None, ...)
#   estimator: the classifier object; X: features; y: labels
#   scoring:   metric to evaluate (e.g. accuracy)
#   cv:        number of folds (the default depends on the scikit-learn version)
#   n_jobs:    number of CPUs to use in parallel (-1 means all)
clf2 = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=3, random_state=0)
scores = cross_val_score(clf2, xtrain, ytrain)
print(scores.mean())

改进：

import math

import xlrd
from sklearn.metrics import mean_squared_error  # used to evaluate the model

# Raw string so the backslashes of the Windows path are taken literally (the
# previous literal depended on invalid escape sequences such as "\P" and "\R").
WORKBOOK_PATH = r'E:\Pycharm\PycharmLearning\RMSE_Test\RMSE_20190415.xlsx'

# Sheet column that each band (B1..B6) is compared against.
# Bands 1-3 use columns 17-19, band 4 uses column 22, bands 5-6 use 20-21.
REFERENCE_COLUMNS = (17, 18, 19, 22, 20, 21)

# (first column of the group, suffix used in the printed label)
# The first group occupies columns 5-10, the second group columns 11-16.
COLUMN_GROUPS = ((5, ''), (11, '_2'))

# The second positional parameter of open_workbook is `logfile`, not a file
# mode, so the former 'r' argument has been dropped.
# NOTE(review): xlrd 2.0+ only reads .xls files; opening this .xlsx needs
# xlrd < 2.0 or another reader -- confirm the installed version.
workbook = xlrd.open_workbook(WORKBOOK_PATH)
sheet1 = workbook.sheet_by_index(0)


def _column_data(col_index):
    """Return the values of one sheet column without its first (header) cell."""
    return sheet1.col_values(col_index)[1:]


# Print the RMSE of every band for both column groups, group by group.
for first_col, suffix in COLUMN_GROUPS:
    for band, reference_col in enumerate(REFERENCE_COLUMNS):
        rmse = math.sqrt(
            mean_squared_error(_column_data(first_col + band), _column_data(reference_col))
        )
        print('The GF_B' + str(band + 1) + suffix + ' rmse is', rmse)

print(u'结束！')

最新回复(0)