Data preprocessing (copied from the reference kernel)
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from scipy.stats import norm, skew
# Load the data and keep the Id columns for the final submission
train = pd.read_csv(r'C:\Users\hp\Desktop\train.csv')
test = pd.read_csv(r'C:\Users\hp\Desktop\test.csv')
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

# Drop the extreme outliers: very large living area but unusually low price
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# Log-transform the target; the competition metric is RMSE on log(SalePrice)
train["SalePrice"] = np.log1p(train["SalePrice"])
# Stack train and test so every transformation is applied to both consistently
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))
# Percentage of missing values per feature, top 30
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
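The plotting imports at the top are otherwise unused in this listing; a bar chart of the missing ratios is the natural use for them. A sketch of such a plot, my addition:

# Bar chart of the missing-value ratios computed above
f, ax = plt.subplots(figsize=(15, 8))
plt.xticks(rotation='90')
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)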
# For these features NA means "feature absent", so fill with the string "None"
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")

# LotFrontage: impute with the median frontage of the house's neighborhood
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
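groupby(...).transform returns a result aligned to the original index, which is what lets the per-neighborhood median be filled in row by row. A toy illustration with made-up data, not from the dataset:

# Toy example: fill missing values with the per-group median
demo = pd.DataFrame({'hood': ['A', 'A', 'B', 'B'],
                     'front': [60.0, None, 80.0, 90.0]})
filled = demo.groupby('hood')['front'].transform(lambda x: x.fillna(x.median()))
print(filled.tolist())  # [60.0, 60.0, 80.0, 90.0]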
# Garage: categorical NAs mean "no garage"; numeric NAs mean zero
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

# Basement: same logic as the garage features
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

# Masonry veneer: NA means no veneer
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

# Features with only a handful of NAs: fill with the most frequent value
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

# Utilities is almost constant across the data, so it carries no signal; drop it
all_data = all_data.drop(['Utilities'], axis=1)

# The data description says NA in Functional means "typical"
all_data["Functional"] = all_data["Functional"].fillna("Typ")
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
# Re-check: no feature should have missing values left
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio': all_data_na})
# These numeric codes are really categories, so convert them to strings
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)
# Integer-encode the (mostly ordinal) categorical features
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')
for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))
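Note that LabelEncoder assigns integers in alphabetical order, not quality order, so for a grade like 'Ex'/'Gd'/'TA' the resulting integers do not follow the actual scale. If you want the encoding to respect the scale, an explicit mapping is an alternative; the mapping below is my own sketch, not part of the original listing:

# Hypothetical alternative: encode quality grades on their natural scale
# instead of LabelEncoder's alphabetical order
quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
# e.g. all_data['ExterQual'] = all_data['ExterQual'].map(quality_scale)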
# Total living area: basement plus first and second floors
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
# Measure the skew of each numeric feature
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
# Mask on the Skew column: indexing a DataFrame with a boolean DataFrame
# keeps every row (filling NaN) instead of filtering, so compare the column
skewness = skewness[abs(skewness['Skew']) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
# Apply the Box-Cox(1+x) transform with a fixed lambda to the skewed features
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
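boxcox1p(x, lam) computes ((1 + x)**lam - 1) / lam, and the lam = 0 case reduces to log1p, so lam = 0.15 is a slightly gentler version of the log transform already used on the target. A quick check, my addition:

# boxcox1p(x, lam) = ((1 + x)**lam - 1) / lam; at lam = 0 it equals log1p(x)
x = np.array([0.0, 1.0, 10.0, 100.0])
assert np.allclose(boxcox1p(x, 0.15), ((1 + x) ** 0.15 - 1) / 0.15)
assert np.allclose(boxcox1p(x, 0), np.log1p(x))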
# One-hot encode the remaining categoricals, then split back into train/test
all_data = pd.get_dummies(all_data)
train = all_data[:ntrain]
test = all_data[ntrain:]
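A cheap sanity check that the split restored the original row counts; my addition:

# The two halves must line up with the row counts saved before the concat
assert train.shape[0] == ntrain and test.shape[0] == ntest
assert train.shape[0] == y_train.shape[0]
print(train.shape, test.shape)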
Base models
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              VotingRegressor, StackingRegressor)
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
# Kernel ridge regression: search over the polynomial kernel's degree and coef0
KRR = KernelRidge(alpha=0.6, kernel='polynomial')
param_grid = dict(degree=[1, 1.5, 2, 2.5, 3], coef0=[2, 2.5, 3, 3.5, 4, 4.5])
KRR_best = GridSearchCV(KRR, param_grid, cv=5)
KRR_best.fit(train, y_train)
print("Best parameters:", KRR_best.best_params_)
print("Best CV score:", KRR_best.best_score_)
print("Best estimator:", KRR_best.best_estimator_)
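Beyond the single best combination, GridSearchCV keeps the whole grid in cv_results_, which shows how close the runners-up were. A quick inspection, my addition:

# Rank all parameter combinations by mean cross-validated score
results = pd.DataFrame(KRR_best.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False).head())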
# Gradient boosting, stage 1: tune learning rate and number of trees
GBoost = GradientBoostingRegressor(max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)
param_grid = dict(learning_rate=[0.001, 0.01, 0.05],
                  n_estimators=[3000, 4000, 5000])
GBoost_best = GridSearchCV(GBoost, param_grid, cv=5)
GBoost_best.fit(train, y_train)
print("Best parameters:", GBoost_best.best_params_)
print("Best CV score:", GBoost_best.best_score_)
print("Best estimator:", GBoost_best.best_estimator_)
# Gradient boosting, stage 2: with the learning rate fixed, tune the tree shape
GBoost = GradientBoostingRegressor(learning_rate=0.01, n_estimators=4000,
                                   max_features='sqrt', loss='huber', random_state=5)
param_grid = dict(max_depth=[3, 4, 5], min_samples_leaf=[10, 15, 20],
                  min_samples_split=[5, 10, 15])
GBoost_best = GridSearchCV(GBoost, param_grid, cv=5)
GBoost_best.fit(train, y_train)
print("Best parameters:", GBoost_best.best_params_)
print("Best CV score:", GBoost_best.best_score_)
print("Best estimator:", GBoost_best.best_estimator_)
# XGBoost: tune learning rate and number of trees around the reference values
XGB = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, max_depth=3,
                       min_child_weight=1.7817, reg_alpha=0.4640,
                       reg_lambda=0.8571, subsample=0.5213,
                       random_state=7, nthread=-1)
param_grid = dict(learning_rate=[0.001, 0.05, 0.1],
                  n_estimators=[2200, 3000, 3500])
XGB_best = GridSearchCV(XGB, param_grid, cv=5)
XGB_best.fit(train, y_train)
print("Best parameters:", XGB_best.best_params_)
print("Best CV score:", XGB_best.best_score_)
print("Best estimator:", XGB_best.best_estimator_)
# LightGBM: tune learning rate and number of trees
LGB = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                        max_bin=55, bagging_fraction=0.8,
                        bagging_freq=5, feature_fraction=0.2319,
                        feature_fraction_seed=9, bagging_seed=9,
                        min_data_in_leaf=6, min_sum_hessian_in_leaf=11)
param_grid = dict(learning_rate=[0.001, 0.05, 0.1],
                  n_estimators=[600, 720, 840])
LGB_best = GridSearchCV(LGB, param_grid, cv=5)
LGB_best.fit(train, y_train)
print("Best parameters:", LGB_best.best_params_)
print("Best CV score:", LGB_best.best_score_)
print("Best estimator:", LGB_best.best_estimator_)
# Cross-validated RMSE on the log target, i.e. the competition's RMSLE.
# Pass the KFold object itself as cv; calling get_n_splits() would hand
# cross_val_score a plain integer and silently discard the shuffle.
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
# Re-create each model with its tuned parameters and score it with rmsle_cv
KRR_best = KernelRidge(alpha=0.6, coef0=4, degree=2, kernel='polynomial')
score = rmsle_cv(KRR_best)
print("\nKRR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
GBoost_best = GradientBoostingRegressor(alpha=0.9, learning_rate=0.01, loss='huber',
                                        max_depth=3, max_features='sqrt',
                                        min_samples_leaf=10, min_samples_split=5,
                                        n_estimators=4000, random_state=5)
score = rmsle_cv(GBoost_best)
print("\nGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
XGB_best = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
                            max_depth=3, min_child_weight=1.7817, n_estimators=3500,
                            nthread=-1, random_state=7, reg_alpha=0.464,
                            reg_lambda=0.8571, subsample=0.5213)
score = rmsle_cv(XGB_best)
print("\nXGB score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
LGB_best = lgb.LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
                             feature_fraction=0.2319, feature_fraction_seed=9,
                             learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
                             min_sum_hessian_in_leaf=11, n_estimators=720,
                             num_leaves=5, objective='regression')
score = rmsle_cv(LGB_best)
print("\nLGB score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Ensemble models
Stacking models
# Stacking: KRR, LightGBM and XGBoost as base learners,
# gradient boosting as the meta-learner
estimators = [('krr', KernelRidge(alpha=0.6, coef0=4, degree=2, kernel='polynomial')),
              ('lgb', lgb.LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
                                        feature_fraction=0.2319, feature_fraction_seed=9,
                                        learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
                                        min_sum_hessian_in_leaf=11, n_estimators=720,
                                        num_leaves=5, objective='regression')),
              ('xgb', xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
                                       max_depth=3, min_child_weight=1.7817, n_estimators=3500,
                                       nthread=-1, random_state=7, reg_alpha=0.464,
                                       reg_lambda=0.8571, subsample=0.5213))]
sta = StackingRegressor(estimators=estimators,
                        final_estimator=GradientBoostingRegressor(
                            alpha=0.9, learning_rate=0.01, loss='huber',
                            max_depth=3, max_features='sqrt',
                            min_samples_leaf=10, min_samples_split=5,
                            n_estimators=4000, random_state=5))
sta.fit(train, y_train)
score = rmsle_cv(sta)
print("\nsta score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
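StackingRegressor trains the meta-learner on out-of-fold predictions and defaults to 5-fold CV internally. If you want those internal folds to match the shuffled folds used by rmsle_cv, its cv parameter accepts a splitter; a sketch of that variant, my suggestion rather than part of the original:

# Hypothetical variant: align the internal stacking CV with rmsle_cv's folds
sta_shuffled = StackingRegressor(estimators=estimators,
                                 final_estimator=GradientBoostingRegressor(
                                     learning_rate=0.01, loss='huber', n_estimators=4000,
                                     max_depth=3, max_features='sqrt', random_state=5),
                                 cv=KFold(n_splits=5, shuffle=True, random_state=42))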
Voting models
# Voting: simple average of KRR, LightGBM, gradient boosting and XGBoost
vot = VotingRegressor(estimators=[
    ('krr', KernelRidge(alpha=0.6, coef0=4, degree=2, kernel='polynomial')),
    ('lgb', lgb.LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
                              feature_fraction=0.2319, feature_fraction_seed=9,
                              learning_rate=0.05, max_bin=55, min_data_in_leaf=6,
                              min_sum_hessian_in_leaf=11, n_estimators=720,
                              num_leaves=5, objective='regression')),
    ('GBoost', GradientBoostingRegressor(alpha=0.9, learning_rate=0.01, loss='huber',
                                         max_depth=3, max_features='sqrt',
                                         min_samples_leaf=10, min_samples_split=5,
                                         n_estimators=4000, random_state=5)),
    ('xgb', xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05,
                             max_depth=3, min_child_weight=1.7817, n_estimators=3500,
                             nthread=-1, random_state=7, reg_alpha=0.464,
                             reg_lambda=0.8571, subsample=0.5213))])
vot.fit(train, y_train)
score = rmsle_cv(vot)
print("\nvot score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Weighted blending
# RMSE on the log target; with the log1p-transformed y this is the RMSLE
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
# Fit each tuned model on the full training set; expm1 maps the
# log-scale test predictions back to sale prices
GBoost_best.fit(train, y_train)
GBoost_train_pred = GBoost_best.predict(train)
GBoost_pred = np.expm1(GBoost_best.predict(test.values))
print(rmsle(y_train, GBoost_train_pred))

KRR_best.fit(train, y_train)
KRR_train_pred = KRR_best.predict(train)
KRR_pred = np.expm1(KRR_best.predict(test.values))
print(rmsle(y_train, KRR_train_pred))

LGB_best.fit(train, y_train)
LGB_train_pred = LGB_best.predict(train)
LGB_pred = np.expm1(LGB_best.predict(test.values))
print(rmsle(y_train, LGB_train_pred))
vot.fit(train.values, y_train)
vot_train_pred = vot.predict(train.values)
vot_pred = np.expm1(vot.predict(test.values))
print(rmsle(y_train, vot_train_pred))
# Weighted blend: 60% voting ensemble, 20% gradient boosting, 20% LightGBM
print(rmsle(y_train, vot_train_pred * 0.6 + GBoost_train_pred * 0.2 + LGB_train_pred * 0.2))
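Applying the same weights to the expm1-scaled test predictions gives the final output. A minimal sketch of the submission step, assuming the standard Kaggle Id/SalePrice format; the filename is hypothetical:

# Blend the test-set predictions with the same weights and write a
# submission file (filename is my placeholder, not from the original)
ensemble = vot_pred * 0.6 + GBoost_pred * 0.2 + LGB_pred * 0.2
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': ensemble})
sub.to_csv('submission.csv', index=False)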