Kaggle Titanic 生存预测比赛超完整笔记(下)

本文章向大家介绍Kaggle Titanic 生存预测比赛超完整笔记(下),主要内容包括6. 模型融合及测试、7. 验证:学习曲线、8. 超参数调试、参考引用:、基本概念、基础应用、原理机制和需要注意的事项等,并结合实例形式分析了其使用技巧,希望通过本文能帮助到大家理解应用这部分内容。

一直想在Kaggle上参加一次比赛,奈何被各种事情所拖累。为了熟悉一下比赛的流程和对数据建模有个较为直观的认识,断断续续用一段时间做了Kaggle上的入门比赛:Titanic: Machine Learning from Disaster。



6. 模型融合及测试


(1) 利用不同的模型来对特征进行筛选,选出较为重要的特征:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


def _rank_features(tag, sample_header, estimator, param_grid, train_X, train_Y, top_n_features):
    """Grid-search one estimator and rank the input columns by its importances.

    Parameters
    ----------
    tag : str
        Short model label inserted into the progress messages (e.g. 'RF').
    sample_header : str
        Line printed just before the ten highest-ranked feature names.
    estimator : object
        Unfitted classifier whose fitted form exposes ``feature_importances_``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    train_X, train_Y
        Training features and target; feature names come from ``list(train_X)``.
    top_n_features : int
        Number of highest-ranked feature names to keep.

    Returns
    -------
    tuple
        ``(top_features, importance_table)``: the ``'feature'`` column of the
        first ``top_n_features`` rows, and the full table sorted by
        descending importance.
    """
    grid = model_selection.GridSearchCV(estimator, param_grid, n_jobs=25, cv=10, verbose=1)
    grid.fit(train_X, train_Y)
    print('Top N Features Best ' + tag + ' Params:' + str(grid.best_params_))
    print('Top N Features Best ' + tag + ' Score:' + str(grid.best_score_))
    print('Top N Features ' + tag + ' Train Score:' + str(grid.score(train_X, train_Y)))

    importance_table = pd.DataFrame({
        'feature': list(train_X),
        'importance': grid.best_estimator_.feature_importances_,
    }).sort_values('importance', ascending=False)
    top_features = importance_table.head(top_n_features)['feature']

    # The header announces a sample, so actually show the ten leading names.
    print(sample_header)
    print(str(top_features[:10]))
    return top_features, importance_table


def get_top_n_features(titanic_train_data_X, titanic_train_data_Y, top_n_features):
    """Select important features by pooling the rankings of five tree-based models.

    Random forest, AdaBoost, extra trees, gradient boosting and a single
    decision tree are each tuned with a small grid search, and the columns of
    ``titanic_train_data_X`` are ranked by the tuned model's
    ``feature_importances_``.

    NOTE(review): relies on ``pd`` (pandas) and ``model_selection``
    (``sklearn.model_selection``) already being in scope; neither is imported
    in this part of the notes.

    Parameters
    ----------
    titanic_train_data_X : DataFrame
        Training features.
    titanic_train_data_Y : array-like
        Training target.
    top_n_features : int
        How many top-ranked features to take from each model.

    Returns
    -------
    features_top_n : Series
        De-duplicated feature names that made the top ``top_n_features`` of at
        least one model.
    features_importance : DataFrame
        The five per-model importance tables stacked in the order RF, Ada, ET,
        GB, DT, with a fresh 0..n-1 index.
    """
    # (tag, sample header, estimator, parameter grid). The headers are kept
    # verbatim, including their inconsistent wording.
    model_specs = [
        ('RF', 'Sample 10 Features from RF Classifier',
         RandomForestClassifier(random_state=0),
         {'n_estimators': [500], 'min_samples_split': [2, 3], 'max_depth': [20]}),
        ('Ada', 'Sample 10 Feature from Ada Classifier:',
         AdaBoostClassifier(random_state=0),
         {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}),
        ('ET', 'Sample 10 Features from ET Classifier:',
         ExtraTreesClassifier(random_state=0),
         {'n_estimators': [500], 'min_samples_split': [3, 4], 'max_depth': [20]}),
        ('GB', 'Sample 10 Feature from GB Classifier:',
         GradientBoostingClassifier(random_state=0),
         {'n_estimators': [500], 'learning_rate': [0.01, 0.1], 'max_depth': [20]}),
        ('DT', 'Sample 10 Features from DT Classifier:',
         DecisionTreeClassifier(random_state=0),
         {'min_samples_split': [2, 4], 'max_depth': [20]}),
    ]

    top_per_model = []
    importance_per_model = []
    for tag, sample_header, estimator, param_grid in model_specs:
        top_features, importance_table = _rank_features(
            tag, sample_header, estimator, param_grid,
            titanic_train_data_X, titanic_train_data_Y, top_n_features)
        top_per_model.append(top_features)
        importance_per_model.append(importance_table)

    # Merge the five models. Duplicates are dropped because the result is used
    # to select columns, and a repeated name would select that column twice.
    features_top_n = pd.concat(top_per_model, ignore_index=True).drop_duplicates()
    features_importance = pd.concat(importance_per_model, ignore_index=True)
    return features_top_n, features_importance

(2) 依据我们筛选出的特征构建训练集和测试集


# Take the top 30 features per model from get_top_n_features and restrict both
# the training and the test frame to the returned feature names.
feature_to_pick = 30
feature_top_n, feature_importance = get_top_n_features(titanic_train_data_X, titanic_train_data_Y, feature_to_pick)
titanic_train_data_X = pd.DataFrame(titanic_train_data_X[feature_top_n])
titanic_test_data_X = pd.DataFrame(titanic_test_data_X[feature_top_n])
Fitting 10 folds for each of 2 candidates, totalling 20 fits

 Top N Features Best RF Params:{'n_estimators': 500, 'min_samples_split': 3, 'max_depth': 20}
 Top N Features Best RF Score:0.822671156004
 Top N Features RF Train Score:0.979797979798
 Sample 10 Features from RF Classifier
 15      Name_length
 0               Age
 2              Fare
 7             Sex_0
 9           Title_0
 8             Sex_1
 27      Family_Size
 3            Pclass
 31    Ticket_Letter
 11          Title_2
 Name: feature, dtype: object
 Fitting 10 folds for each of 2 candidates, totalling 20 fits

 Top N Features Best Ada Params:{'n_estimators': 500, 'learning_rate': 0.01}
 Top N Features Best Ada Score:0.81593714927
 Top N Features Ada Train Score:0.820426487093
 Sample 10 Feature from Ada Classifier:
 9                    Title_0
 2                       Fare
 27               Family_Size
 7                      Sex_0
 3                     Pclass
 28    Family_Size_Category_0
 1                      Cabin
 8                      Sex_1
 15               Name_length
 0                        Age
 Name: feature, dtype: object
 Fitting 10 folds for each of 2 candidates, totalling 20 fits

 Top N Features Best ET Params:{'n_estimators': 500, 'min_samples_split': 4, 'max_depth': 20}
 Top N Features Best ET Score:0.828282828283
 Top N Features ET Train Score:0.971941638608
 Sample 10 Features from ET Classifier:
 9           Title_0
 8             Sex_1
 7             Sex_0
 15      Name_length
 0               Age
 2              Fare
 1             Cabin
 31    Ticket_Letter
 11          Title_2
 10          Title_1
 Name: feature, dtype: object
 Fitting 10 folds for each of 2 candidates, totalling 20 fits

 Top N Features Best GB Params:{'n_estimators': 500, 'learning_rate': 0.1, 'max_depth': 20}
 Top N Features Best GB Score:0.789001122334
 Top N Features GB Train Score:0.996632996633
 Sample 10 Feature from GB Classifier:
 0               Age
 2              Fare
 15      Name_length
 31    Ticket_Letter
 9           Title_0
 27      Family_Size
 23         Pclass_2
 3            Pclass
 18           Fare_2
 14          Title_5
 Name: feature, dtype: object
 Fitting 10 folds for each of 2 candidates, totalling 20 fits

 Top N Features Best DT Params:{'min_samples_split': 4, 'max_depth': 20}
 Top N Features Best DT Score:0.784511784512
 Top N Features DT Train Score:0.959595959596
 Sample 10 Features from DT Classifier:
 9           Title_0
 0               Age
 2              Fare
 15      Name_length
 27      Family_Size
 14          Title_5
 26         Pclass_5
 3            Pclass
 31    Ticket_Letter
 23         Pclass_2
 Name: feature, dtype: object


# feature_importance stacks one sorted importance table per model; take the
# ten leading rows of the random-forest table and of the AdaBoost table.
rf_feature_imp = feature_importance[:10]
# NOTE(review): the offset 32 presumably equals the number of input features,
# so that row 32 is where the AdaBoost table starts; confirm against the data.
Ada_feature_imp = feature_importance[32:32+10].reset_index(drop=True)

# make importances relative to max importance
rf_feature_importance = 100.0 * (rf_feature_imp['importance'] / rf_feature_imp['importance'].max())
Ada_feature_importance = 100.0 * (Ada_feature_imp['importance'] / Ada_feature_imp['importance'].max())

# Get the indexes of all features with a non-zero relative importance
# (np.where on the values applies no other threshold).
rf_important_idx = np.where(rf_feature_importance)[0]
Ada_important_idx = np.where(Ada_feature_importance)[0]

# Adapted from Gradient Boosting regression
pos = np.arange(rf_important_idx.shape[0]) + .5

plt.figure(1, figsize = (18, 8))

# Left panel: random forest. Values and labels are both reversed so the most
# important feature ends up at the top of the horizontal bar chart.
plt.subplot(121)
plt.barh(pos, rf_feature_importance[rf_important_idx][::-1])
plt.yticks(pos, rf_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('RandomForest Feature Importance')

# Right panel: AdaBoost.
plt.subplot(122)
plt.barh(pos, Ada_feature_importance[Ada_important_idx][::-1])
plt.yticks(pos, Ada_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('AdaBoost Feature Importance')

plt.show()

(3) 模型融合(Model Ensemble)



(3-1): Bagging

Bagging 将多个模型,也就是多个基学习器的预测结果进行简单的加权平均或者投票。它的好处是可以并行地训练基学习器。Random Forest就用到了Bagging的思想。

(3-2): Boosting

Boosting 的思想有点像知错能改,每个基学习器是在上一个基学习器学习的基础上,对上一个基学习器的错误进行弥补。我们将会用到的 AdaBoost,Gradient Boost 就用到了这种思想。

(3-3): Stacking

Stacking是用新的次学习器去学习如何组合上一层的基学习器。如果把 Bagging 看作是多个基分类器的线性组合,那么Stacking就是多个基分类器的非线性组合。Stacking可以将学习器一层一层地堆砌起来,形成一个网状的结构。


(3-4): Blending

Blending 和 Stacking 很相似,但同时它可以防止信息泄露的问题。


这里我们使用了两层的模型融合,Level 1使用了:RandomForest、AdaBoost、ExtraTrees、GBDT、DecisionTree、KNN、SVM ,一共7个模型,Level 2使用了XGBoost使用第一层预测的结果作为特征对最终的结果进行预测。

Level 1:

Stacking框架是堆叠使用基础分类器的预测作为对二级模型的训练的输入。 然而,我们不能简单地在全部训练数据上训练基本模型,产生预测,输出用于第二层的训练。如果我们在Train Data上训练,然后在Train Data上预测,就会造成标签泄露(Label Leak)。为了避免标签泄露,我们需要对每个基学习器使用K-fold,将K个模型对Valid Set的预测结果拼起来,作为下一层学习器的输入。


from sklearn.model_selection import KFold

# Some useful parameters which will come in handy later on
ntrain = titanic_train_data_X.shape[0]
ntest = titanic_test_data_X.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 7 # set folds for out-of-fold prediction
# random_state is not passed: the folds are not shuffled, so a seed has no
# effect, and current scikit-learn raises ValueError when random_state is
# combined with shuffle=False.
kf = KFold(n_splits = NFOLDS, shuffle=False)


def get_out_fold(clf, x_train, y_train, x_test):
    """Produce out-of-fold predictions of ``clf`` for stacking.

    For every split of the module-level ``kf``, ``clf`` is refitted on the
    training part and used to predict both the held-out part and the whole of
    ``x_test``. Each training row is therefore predicted by a fit that did
    not see it.

    Parameters
    ----------
    clf : object
        Estimator with ``fit`` and ``predict``. The same instance is refitted
        once per fold.
    x_train, y_train : numpy arrays
        Training features and labels; they are indexed with integer index
        arrays, so positional (array) indexing must be supported.
    x_test : numpy array
        Test features.

    Returns
    -------
    tuple of numpy arrays
        ``(oof_train, oof_test)`` with shapes ``(len(x_train), 1)`` and
        ``(len(x_test), 1)``. ``oof_test`` is the mean of the per-fold test
        predictions.
    """
    # Size the buffers from the arguments rather than the module globals so
    # the function also works for arrays other than the Titanic ones.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    oof_train = np.zeros((n_train,))
    fold_test_preds = np.empty((kf.get_n_splits(), n_test))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        clf.fit(x_train[train_index], y_train[train_index])
        oof_train[test_index] = clf.predict(x_train[test_index])
        fold_test_preds[i, :] = clf.predict(x_test)

    oof_test = fold_test_preds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

构建不同的基学习器,这里我们使用了RandomForest、AdaBoost、ExtraTrees、GBDT、DecisionTree、KNN、SVM 七个基学习器:(这里的模型可以使用如上面的GridSearch方法对模型的超参数进行搜索选择)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Level-1 base learners.
# warm_start is left at its default (False) for the forest: get_out_fold
# refits the same estimator object once per fold, and with warm_start=True and
# an unchanged n_estimators scikit-learn keeps the trees of the first fit.
# Later folds would then be predicted by a forest that was trained on them,
# which leaks labels into the out-of-fold predictions.
rf = RandomForestClassifier(n_estimators=500, max_features='sqrt', max_depth=6,
                            min_samples_split=3, min_samples_leaf=2, n_jobs=-1, verbose=0)

ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)

et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=8, min_samples_leaf=2, verbose=0)

gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.008, min_samples_split=3, min_samples_leaf=2, max_depth=5, verbose=0)

dt = DecisionTreeClassifier(max_depth=8)

knn = KNeighborsClassifier(n_neighbors = 2)

svm = SVC(kernel='linear', C=0.025)


# Create Numpy arrays of train, test and target (Survived) dataframes to feed into our models
x_train = titanic_train_data_X.values # Creates an array of the train data
x_test = titanic_test_data_X.values # Creates an array of the test data
y_train = titanic_train_data_Y.values

# Create our OOF train and test predictions. These base results will be used as new features
rf_oof_train, rf_oof_test = get_out_fold(rf, x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_out_fold(ada, x_train, y_train, x_test) # AdaBoost
et_oof_train, et_oof_test = get_out_fold(et, x_train, y_train, x_test) # Extra Trees
gb_oof_train, gb_oof_test = get_out_fold(gb, x_train, y_train, x_test) # Gradient Boost
dt_oof_train, dt_oof_test = get_out_fold(dt, x_train, y_train, x_test) # Decision Tree
knn_oof_train, knn_oof_test = get_out_fold(knn, x_train, y_train, x_test) # KNeighbors
svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test) # Support Vector

print("Training is complete")

(4) 预测并生成提交文件

Level 2:


# Level-2 inputs: place the seven base learners' out-of-fold predictions side
# by side, one column per learner. Note that x_train / x_test are rebound here
# and no longer hold the original feature arrays.
x_train = np.concatenate((rf_oof_train, ada_oof_train, et_oof_train, gb_oof_train, dt_oof_train, knn_oof_train, svm_oof_train), axis=1)
x_test = np.concatenate((rf_oof_test, ada_oof_test, et_oof_test, gb_oof_test, dt_oof_test, knn_oof_test, svm_oof_test), axis=1)

from xgboost import XGBClassifier

# Fit the second-level model on the stacked predictions and predict the test set.
gbm = XGBClassifier( n_estimators= 2000, max_depth= 4, min_child_weight= 2, gamma=0.9, subsample=0.8,
                     colsample_bytree=0.8, objective= 'binary:logistic', nthread= -1, scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

# Write the submission file.
# NOTE(review): PassengerId is not defined in this part of the notes; it is
# presumably the test set's id column kept from an earlier step.
StackingSubmission = pd.DataFrame({'PassengerId': PassengerId, 'Survived': predictions})
StackingSubmission.to_csv('StackingSubmission.csv',index=False,sep=',')

7. 验证:学习曲线




上面红线代表test error(Cross-validation error),蓝线代表train error。这里我们也可以把错误率替换为准确率,那么相应曲线的走向就应该是上下颠倒的,(score = 1 - error)。


  1. 左上角是最优情况,随着样本的增加,train error虽然有一定的增加,但是 test error却有很明显的降低;
  2. 右上角是最差情况,train error很大,模型并没有从特征中学习到什么,导致test error非常大,模型几乎无法预测数据,需要去寻找数据本身和训练阶段的原因;
  3. 左下角是high variance的情况,train error虽然较低,但是模型产生了过拟合,缺乏泛化能力,导致test error很高;
  4. 右下角是high bias的情况,train error很高,这时需要去调整模型的参数,减小train error。



# learning_curve lives in sklearn.model_selection; the old
# sklearn.learning_curve module no longer exists in current scikit-learn.
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), verbose=0):
    """Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds; None uses
        scikit-learn's default splitting. Specific cross-validation objects
        can be passed, see the sklearn.model_selection module for the list
        of possible objects.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).

    train_sizes : array-like, optional
        Training-set sizes at which the curve is evaluated; forwarded to
        ``learning_curve``.

    verbose : integer, optional
        Verbosity level forwarded to ``learning_curve`` (default 0).

    Returns
    -------
    The ``plt`` module, so the caller can call ``show()`` or ``savefig()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        verbose=verbose)
    # Scores have one row per training size and one column per CV split.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands show one standard deviation around each mean curve, in the
    # same colour as the curve they belong to.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    # Without a legend the labels given to plot() are never displayed.
    plt.legend(loc="best")
    return plt


# NOTE(review): if this runs after the Level-2 step, x_train holds the stacked
# out-of-fold predictions rather than the original features; confirm which
# data the curve is meant to describe.
X = x_train
Y = y_train

# Hyper-parameters of each model. Only rf_parameters is used by the call below.

# RandomForest
rf_parameters = {'n_jobs': -1, 'n_estimators': 500, 'warm_start': True, 'max_depth': 6, 'min_samples_leaf': 2,
              'max_features' : 'sqrt','verbose': 0}

# AdaBoost
ada_parameters = {'n_estimators':500, 'learning_rate':0.1}

# ExtraTrees
et_parameters = {'n_jobs': -1, 'n_estimators':500, 'max_depth': 8, 'min_samples_leaf': 2, 'verbose': 0}

# GradientBoosting
gb_parameters = {'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 2, 'verbose': 0}

# DecisionTree
dt_parameters = {'max_depth':8}

# KNeighbors
knn_parameters = {'n_neighbors':2}

# SVM
svm_parameters = {'kernel':'linear', 'C':0.025}

# XGB
gbm_parameters = {'n_estimators': 2000, 'max_depth': 4, 'min_child_weight': 2, 'gamma':0.9, 'subsample':0.8,
               'colsample_bytree':0.8, 'objective': 'binary:logistic', 'nthread':-1, 'scale_pos_weight':1}

title = "Learning Curves"
plot_learning_curve(RandomForestClassifier(**rf_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()


8. 超参数调试


  • xgboost stacking:0.78468;
  • voting bagging :0.79904;


  • 特征工程:寻找更好的特征、删去影响较大的冗余特征;
  • 模型超参数调试:改进欠拟合或者过拟合的状态;
  • 改进模型框架:对于stacking框架的各层模型进行更好的选择;



  • 分分钟带你杀入Kaggle Top 1%(http://t.cn/RoXGHto)
  • 机器学习实战之Kaggle_Titanic预测(http://t.cn/RYd3Xlu)
  • 泰坦尼克号乘客数据分析(http://t.cn/RYd3HCK)
  • Kaggle泰坦尼克号生存模型——250个特征量的融合模型,排名8%(http://t.cn/RYd31l9)
  • Introduction to Ensembling/Stacking in Python(http://t.cn/ROI2HyK)
  • A Journey through Titanic(http://t.cn/RYd1mij)
