Kaggle Exercise: Categorical Variables 提交 submission 时报错:Input contains NaN
描述: 在做 Kaggle Learn 上"分类变量"(Categorical Variables)练习的第 5 步时,对测试集进行预测的阶段抛出了 ValueError:输入包含 NaN、无穷大或对于 dtype('float32') 而言太大的值。
#### DATASETS LOAD #### import pandas as pd from sklearn.model_selection import train_test_split # Read the data X = pd.read_csv('../input/train.csv', index_col='Id') X_test = pd.read_csv('../input/test.csv', index_col='Id') # Remove rows with missing target, separate target from predictors X.dropna(axis=0, subset=['SalePrice'], inplace=True) y = X.SalePrice X.drop(['SalePrice'], axis=1, inplace=True) # To keep things simple, we'll drop columns with missing values cols_with_missing = [col for col in X.columns if X[col].isnull().any()] X.drop(cols_with_missing, axis=1, inplace=True) X_test.drop(cols_with_missing, axis=1, inplace=True) # Break off validation set from training data X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0) #### IMPUTATION OF MISSING VALUES FOR X_TEST #### from sklearn.impute import SimpleImputer # All categorical columns object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"] # Columns that will be one-hot encoded low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10] # Fill in the lines below: imputation my_imputer = SimpleImputer(strategy='most_frequent') imputed_X_test = pd.DataFrame(my_imputer.fit_transform(X_test)) # Fill in the lines below: imputation removed column names; put them back imputed_X_test.columns = X_test.columns #### ONEHOT ENCODING FOR DATA ##### from sklearn.preprocessing import OneHotEncoder # Apply one-hot encoder to each column with categorical data OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False) OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols])) OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols])) OH_cols_test = pd.DataFrame(OH_encoder.transform(imputed_X_test[low_cardinality_cols])) # One-hot encoding removed index; put it back OH_cols_train.index = X_train.index OH_cols_valid.index = X_valid.index OH_cols_test.index = 
X_test.index # Remove categorical columns (will replace with one-hot encoding) num_X_train = X_train.drop(object_cols, axis=1) num_X_valid = X_valid.drop(object_cols, axis=1) num_X_test = X_test.drop(object_cols, axis=1) # Add one-hot encoded columns to numerical features OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1) OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1) OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1) ##### BUILD MODEL AND CREATE SUBMISSION #### from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_absolute_error # normalize datatypes columns #for colName in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']: # OH_X_train[colName] = OH_X_train[colName].astype('float64') # OH_X_valid[colName] = OH_X_train[colName].astype('float64') # Build model model = RandomForestRegressor(n_estimators=100, random_state=0) model.fit(OH_X_train, y_train) preds_test = model.predict(OH_X_test) # Save test predictions to file #output = pd.DataFrame({'Id': OH_X_test.index, # 'SalePrice': preds_test}) #output.to_csv('submission.csv', index=False) --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-2-2d85be0f6b26> in <module> 74 model = RandomForestRegressor(n_estimators=100, random_state=0) 75 model.fit(OH_X_train, y_train) ---> 76 preds_test = model.predict(OH_X_test) 77 78 # Save test predictions to file /opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict(self, X) 691 check_is_fitted(self, 'estimators_') 692 # Check data --> 693 X = self._validate_X_predict(X) 694 695 # Assign chunk of trees to jobs /opt/conda/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X) 357 "call `fit` before exploiting the model.") 358 --> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True) 360 361 
@property /opt/conda/lib/python3.6/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input) 389 """Validate X whenever one tries to predict, apply, predict_proba""" 390 if check_input: --> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr") 392 if issparse(X) and (X.indices.dtype != np.intc or 393 X.indptr.dtype != np.intc): /opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 540 if force_all_finite: 541 _assert_all_finite(array, --> 542 allow_nan=force_all_finite == 'allow-nan') 543 544 if ensure_min_samples > 0: /opt/conda/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan) 54 not allow_nan and not np.isfinite(X).all()): 55 type_err = 'infinity' if allow_nan else 'NaN, infinity' ---> 56 raise ValueError(msg_err.format(type_err, X.dtype)) 57 # for object dtype data, we only check for NaNs (GH-13254) 58 elif X.dtype == np.dtype('object') and not allow_nan: ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
原因分析: 如错误消息所述,问题是由 OH_X_test 中的 NaN 值引起的。在上面的代码中,num_X_test 取自未经插补的 X_test,因此数值列中的缺失值被原样带进了 OH_X_test。此外,imputed_X_test 使用的是默认的 0..n-1 行索引,而 X_test 以 Id 为索引;pd.concat 在 axis=1 时按索引对齐,如果直接改用 imputed_X_test 而不先恢复它的索引,两个数据帧的索引对不上,concat 语句同样会引入 NaN。
因此,我在下面的代码中做了 3 处修改,见代码中带 FIX 标记的注释。
# Corrected notebook cell: fit the model, preprocess the test set so that it
# contains no NaN, then predict and write the submission file.
# Relies on names created by earlier cells: pd, RandomForestRegressor,
# mean_absolute_error, OH_encoder, object_cols, low_cardinality_cols,
# OH_X_train, OH_X_valid, y_train, y_valid and X_test.
from sklearn.impute import SimpleImputer

# (Optional) Your code here

# Define and fit the model on the one-hot encoded training features.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(OH_X_train, y_train)

# Validation predictions and their mean absolute error.
preds_valid = model.predict(OH_X_valid)
print("MAE (Your approach):", mean_absolute_error(y_valid, preds_valid), sep="\n")

#### IMPUTATION OF MISSING VALUES FOR X_TEST ####
# NOTE(review): the imputer is fitted on X_test itself; fitting it on the
# training features and only transforming X_test is the more common
# convention -- confirm this is intended.
my_imputer = SimpleImputer(strategy='most_frequent')
imputed_X_test = pd.DataFrame(
    my_imputer.fit_transform(X_test),
    columns=X_test.columns,  # imputation removed column names; put them back
    index=X_test.index,      ### FIX 1: restore X_test's row labels as well
)

#### ONEHOT ENCODING FOR TEST DATA ####
# Encode with the encoder that was already fitted on the training data.
# One-hot encoding removed the index, so it is put back at construction time.
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(imputed_X_test[low_cardinality_cols]),
    index=imputed_X_test.index,  ### FIX 2: take labels from the imputed frame
)

# Remove categorical columns (they are replaced by the one-hot columns).
### FIX 3: start from the imputed frame, not from the raw X_test.
num_X_test = imputed_X_test.drop(columns=object_cols)

# Add the one-hot encoded columns to the remaining features.
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# Test predictions.
preds_test = model.predict(OH_X_test)

# Save test predictions to file.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)