[sklearn数据科学浅尝]kaggle泰坦尼克号幸存预测问题（入全球前10%）

问题描述

比赛地址 kaggle泰坦尼克号比赛说明

泰坦尼克号的沉没是历史上最著名的沉船之一。1912年4月15日，在她的首航中，泰坦尼克号在与冰山相撞后沉没，在2224名乘客和机组人员中造成1502人死亡。这场耸人听闻的悲剧震惊了国际社会，并促进了更严格的船舶安全规定产生。

造成海难失事的原因之一是乘客和机组人员没有足够的救生艇。尽管幸存下沉有一些运气因素，但有些人比其他人更容易生存，比如女人，孩子和上流社会。

在这个挑战中，我们要求您完成对哪些人可能存活的分析。特别是，我们要求您运用机器学习工具来预测哪些乘客在悲剧中幸存下来。

文件代码

import os
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection
excl = lambda x: os.popen(x).readlines()
%matplotlib inline
warnings.filterwarnings('ignore')

train = pd.read_csv('./titanic_datas/train.csv')
train.head()

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

test = pd.read_csv('./titanic_datas/test.csv')
test.head()

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	892	3	Kelly, Mr. James	male	34.5	0	0	330911	7.8292	NaN	Q
1	893	3	Wilkes, Mrs. James (Ellen Needs)	female	47.0	1	0	363272	7.0000	NaN	S
2	894	2	Myles, Mr. Thomas Francis	male	62.0	0	0	240276	9.6875	NaN	Q
3	895	3	Wirz, Mr. Albert	male	27.0	0	0	315154	8.6625	NaN	S
4	896	3	Hirvonen, Mrs. Alexander (Helga E Lindqvist)	female	22.0	1	1	3101298	12.2875	NaN	S

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB

train.describe()

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

test.describe()

	PassengerId	Pclass	Age	SibSp	Parch	Fare
count	418.000000	418.000000	418.000000	418.000000	418.000000	418.000000
mean	1100.500000	2.265550	30.154603	0.447368	0.392344	35.619000
std	120.810458	0.841838	12.636666	0.896760	0.981429	55.840751
min	892.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	996.250000	1.000000	23.000000	0.000000	0.000000	7.895800
50%	1100.500000	3.000000	29.699118	0.000000	0.000000	14.454200
75%	1204.750000	3.000000	35.750000	1.000000	0.000000	31.500000
max	1309.000000	3.000000	76.000000	8.000000	9.000000	512.329200

fare_mean = train["Fare"].mean()
test.loc[pd.isnull(test.Fare),'Fare'] = fare_mean 
embarked_mode = train['Embarked'].mode()
train.loc[pd.isnull(train.Embarked),['Embarked']] = embarked_mode[0]
age_mean = train['Age'].mean()
train.loc[pd.isnull(train.Age),['Age']] = age_mean
test.loc[pd.isnull(test.Age),['Age']] = age_mean

label = train['Survived']
train.drop('Survived',axis=1,inplace=True)

X_train,X_test,Y_train,Y_test = train_test_split(train,label,test_size = 0.3,random_state = 1)

X_train['Survived'] = Y_train
X_test['Survived'] = Y_test

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Sex', 'Survived', data=X_train, ax=axis1) 
sns.barplot('Sex', 'Survived', data=X_test, ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a23a577f0>

train['Sex'] = train['Sex'].apply(lambda x: 1 if x == 'male' else 0)
test['Sex'] = test['Sex'].apply(lambda x: 1 if x == 'male' else 0)

train = pd.get_dummies(data= train,columns=['Sex'])
test = pd.get_dummies(data= test,columns=['Sex'])

def Name_Title_Code(x):
    if x == 'Mr.':
        return 1
    if (x == 'Mrs.') or (x=='Ms.') or (x=='Lady.') or (x == 'Mlle.') or (x =='Mme'):
        return 2
    if x == 'Miss':
        return 3
    if x == 'Rev.':
        return 4
    return 5

X_train['Name_Title'] = X_train['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
X_test['Name_Title'] = X_test['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])

X_train.groupby('Name_Title')['Survived'].count()

Name_Title
Capt.        1
Col.         2
Don.         1
Dr.          4
Lady.        1
Major.       1
Master.     27
Miss.      126
Mlle.        1
Mme.         1
Mr.        365
Mrs.        87
Rev.         5
the          1
Name: Survived, dtype: int64

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Name_Title', 'Survived', data=X_train.sort_values('Name_Title'), ax=axis1) 
sns.barplot('Name_Title', 'Survived', data=X_test.sort_values('Name_Title'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a23e730f0>

def Name_Title_Code(x):
    if x == 'Mr.':
        return 1
    if (x == 'Mrs.') or (x=='Ms.') or (x=='Lady.') or (x == 'Mlle.') or (x =='Mme'):
        return 2
    if x == 'Miss':
        return 3
    if x == 'Rev.':
        return 4
    return 5

train['Name_Title'] = train['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
test['Name_Title'] = test['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])

train['Name_Title'] = train['Name_Title'].apply(Name_Title_Code)
test['Name_Title'] = test['Name_Title'].apply(Name_Title_Code)

train = pd.get_dummies(columns = ['Name_Title'], data = train)
test = pd.get_dummies(columns = ['Name_Title'], data = test)

train.head()

	PassengerId	Pclass	Name	Age	SibSp	Ticket	Fare	Cabin	Embarked	Sex_0	Sex_1	Name_Title_1	Name_Title_2	Name_Title_5
0	1	3	Braund, Mr. Owen Harris	22.0	1	A/5 21171	7.2500	NaN	S	0	1	1	0	0
1	2	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	38.0	1	PC 17599	71.2833	C85	C	1	0	0	1	0
2	3	3	Heikkinen, Miss. Laina	26.0	0	STON/O2. 3101282	7.9250	NaN	S	1	0	0	0	1
3	4	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	35.0	1	113803	53.1000	C123	S	1	0	0	1	0
4	5	3	Allen, Mr. William Henry	35.0	0	373450	8.0500	NaN	S	0	1	1	0	0

X_train['Name_len'] = X_train['Name'].apply(lambda x: len(x))
X_test['Name_len'] = X_test['Name'].apply(lambda x: len(x))

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(20,10))
sns.barplot('Name_len', 'Survived', data=X_train.sort_values(['Name_len']), ax=axis1) 
sns.barplot('Name_len', 'Survived', data=X_test.sort_values(['Name_len']), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a24bdc4e0>

train['Name_len'] = train['Name'].apply(lambda x: len(x))
test['Name_len'] = test['Name'].apply(lambda x: len(x))

def Ticket_First_Let(x):
    return x[0]

X_train['Ticket_First_Letter'] = X_train['Ticket'].apply(Ticket_First_Let)
X_test['Ticket_First_Letter'] = X_test['Ticket'].apply(Ticket_First_Let)

X_train.groupby('Ticket_First_Letter')['Survived'].count()

Ticket_First_Letter
1     87
2    129
3    225
4     10
5      2
6      6
7      6
8      1
9      1
A     20
C     32
F      3
L      3
P     49
S     40
W      9
Name: Survived, dtype: int64

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Ticket_First_Letter', 'Survived', data=X_train.sort_values('Ticket_First_Letter'), ax=axis1) 
sns.barplot('Ticket_First_Letter', 'Survived', data=X_test.sort_values('Ticket_First_Letter'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a24dc7358>

def Ticket_First_Letter_Code(x):
    if (x == '1'):
        return 1
    if x == '3':
        return 2
    if x == '4':
        return 3
    if x == 'C':
        return 4
    if x == 'S':
        return 5
    if x == 'P':
        return 6
    if x == '6':
        return 7
    if x == '7':
        return 8
    if x == 'A':
        return 9
    if x == 'W':
        return 10
    return 11

train['Ticket_First_Letter'] = train['Ticket'].apply(Ticket_First_Let)
test['Ticket_First_Letter'] = test['Ticket'].apply(Ticket_First_Let)

train['Ticket_First_Letter'].unique()

array(['A', 'P', 'S', '1', '3', '2', 'C', '7', 'W', '4', 'F', 'L', '9',
       '6', '5', '8'], dtype=object)

test['Ticket_First_Letter'].unique()

array(['3', '2', '7', 'A', '6', 'W', 'S', 'P', 'C', '1', 'F', '4', '9',
       'L'], dtype=object)

train['Ticket_First_Letter'] = train['Ticket_First_Letter'].apply(Ticket_First_Letter_Code)
test['Ticket_First_Letter'] = test['Ticket_First_Letter'].apply(Ticket_First_Letter_Code)

X_train['Cabin'] = X_train['Cabin'].fillna('Missing')
X_test['Cabin'] = X_test['Cabin'].fillna('Missing')

def Cabin_First_Letter(x):
    if x == 'Missing':
        return 'XX'
    return x[0]

X_train['Cabin_First_Letter'] = X_train['Cabin'].apply(Cabin_First_Letter)
X_test['Cabin_First_Letter'] = X_test['Cabin'].apply(Cabin_First_Letter)

X_train.groupby('Cabin_First_Letter')['Survived'].count()

Cabin_First_Letter
A      12
B      28
C      41
D      21
E      22
F       8
G       3
XX    488
Name: Survived, dtype: int64

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Cabin_First_Letter', 'Survived', data=X_train.sort_values('Cabin_First_Letter'), ax=axis1) 
sns.barplot('Cabin_First_Letter', 'Survived', data=X_test.sort_values('Cabin_First_Letter'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a24ba4208>

def Cabin_First_Letter_Code(x):
    if x == 'XX':
        return 1
    if x == 'B':
        return 2
    if x == 'C':
        return 3
    if x == 'D':
        return 4     
    return 5

train['Cabin'] = train['Cabin'].fillna('Missing')
test['Cabin'] = test['Cabin'].fillna('Missing')

train['Cabin_First_Letter'] = train['Cabin'].apply(Cabin_First_Letter)
test['Cabin_First_Letter'] = test['Cabin'].apply(Cabin_First_Letter)

train['Cabin_First_Letter'] = train['Cabin_First_Letter'].apply(Cabin_First_Letter_Code)
test['Cabin_First_Letter'] = test['Cabin_First_Letter'].apply(Cabin_First_Letter_Code)

train = pd.get_dummies(columns = ['Cabin_First_Letter'], data = train) 
test = pd.get_dummies(columns = ['Cabin_First_Letter'], data = test)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Embarked', 'Survived', data=X_train.sort_values('Embarked'), ax=axis1) 
sns.barplot('Embarked', 'Survived', data=X_test.sort_values('Embarked'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a259bc6d8>

train = pd.get_dummies(train,columns = ['Embarked'])
test = pd.get_dummies(test,columns = ['Embarked'])

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('SibSp', 'Survived', data=X_train.sort_values('SibSp'), ax=axis1) 
sns.barplot('SibSp', 'Survived', data=X_test.sort_values('SibSp'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a25b7ea90>

X_train['Fam_Size'] = X_train['SibSp']  + X_train['Parch'] 
X_test['Fam_Size'] = X_test['SibSp']  + X_test['Parch']

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Fam_Size', 'Survived', data=X_train.sort_values('Parch'), ax=axis1) 
sns.barplot('Fam_Size', 'Survived', data=X_test.sort_values('Parch'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a25c67fd0>

def Family_feature(train, test):
    for i in [train, test]:
        i['Fam_Size'] = np.where((i['SibSp']+i['Parch']) == 0 , 'Solo',
                           np.where((i['SibSp']+i['Parch']) <= 3,'Nuclear', 'Big'))
        del i['SibSp']
        del i['Parch']
    return train, test 
train, test  = Family_feature(train, test)

train = pd.get_dummies(train,columns = ['Fam_Size']) 
test =  pd.get_dummies(test,columns = ['Fam_Size'])

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Pclass', 'Survived', data=X_train.sort_values('Pclass'), ax=axis1) 
sns.barplot('Pclass', 'Survived', data=X_test.sort_values('Pclass'), ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a25e84c50>

train['Pclass_1']  = np.int32(train['Pclass'] == 1)  
train['Pclass_2']  = np.int32(train['Pclass'] == 2)  
train['Pclass_3']  = np.int32(train['Pclass'] == 3)  

test['Pclass_1']  = np.int32(test['Pclass'] == 1)  
test['Pclass_2']  = np.int32(test['Pclass'] == 2)  
test['Pclass_3']  = np.int32(test['Pclass'] == 3)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5)) 
sns.distplot(X_train[X_train.Survived==1]['Age'].dropna().values, bins=range(0, 81, 6),color='red', ax=axis1) 
sns.distplot(X_train[X_train.Survived==0]['Age'].dropna().values, bins=range(0, 81, 6),color = 'blue', ax=axis1) 

sns.distplot(X_test[X_test.Survived==1]['Age'].dropna().values, bins=range(0, 81, 6),color='red', ax=axis2) 
sns.distplot(X_test[X_test.Survived==0]['Age'].dropna().values, bins=range(0, 81, 6),color = 'blue', ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a2614dd30>

train['Small_Age'] = np.int32(train['Age'] <= 5)  
train['Old_Age'] = np.int32(train['Age'] >= 65)  
train['Middle_Age'] = np.int32((train['Age'] >= 15) & (train['Age'] <= 25))  
 
test['Small_Age'] = np.int32(test['Age'] <= 5)  
test['Old_Age'] = np.int32(test['Age'] >= 65)  
test['Middle_Age'] = np.int32((test['Age'] >= 15) & (test['Age'] <= 25))

X_train['Fare'] = X_train['Fare'] + 1
X_test['Fare'] = X_test['Fare'] + 1

X_train['Fare'] = X_train['Fare'].apply(np.log)
X_test['Fare'] = X_test['Fare'].apply(np.log)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5)) 
sns.distplot(X_train[X_train.Survived==1]['Fare'].dropna().values, bins=range(0, 10, 1),color='red', ax=axis1) 
sns.distplot(X_train[X_train.Survived==0]['Fare'].dropna().values, bins=range(0, 10, 1),color = 'blue', ax=axis1) 

sns.distplot(X_test[X_test.Survived==1]['Fare'].dropna().values, bins=range(0, 10, 1),color='red', ax=axis2) 
sns.distplot(X_test[X_test.Survived==0]['Fare'].dropna().values, bins=range(0, 10, 1),color = 'blue', ax=axis2)

<matplotlib.axes._subplots.AxesSubplot at 0x1a2627ac50>

train['Fare'] = train['Fare'] + 1
test['Fare'] = test['Fare'] + 1

train['Fare'] = train['Fare'].apply(np.log)
test['Fare'] = test['Fare'].apply(np.log)

train['Fare_0_2'] = np.int32(train['Fare'] <= 2)
train['Fare_2_3'] = np.int32((train['Fare'] > 2) & (train['Fare'] <= 3) )
train['Fare_3_4'] = np.int32((train['Fare'] > 3) & (train['Fare'] <= 4) )
train['Fare_4_5'] = np.int32((train['Fare'] > 4) & (train['Fare'] <= 5)) 
train['Fare_5_'] = np.int32(train['Fare'] > 5)


test['Fare_0_2'] = np.int32(test['Fare'] <= 2)
test['Fare_2_3'] = np.int32((test['Fare'] > 2) & (test['Fare'] <= 3) )
test['Fare_3_4'] = np.int32((test['Fare'] > 3) & (test['Fare'] <= 4) )
test['Fare_4_5'] = np.int32((test['Fare'] > 4) & (test['Fare'] <= 5)) 
test['Fare_5_'] = np.int32(test['Fare'] > 5)

train.drop(['Ticket','PassengerId','Name','Age','Cabin','Pclass'],axis = 1, inplace=True)
test.drop( ['PassengerId','Ticket','Name','Age','Cabin','Pclass'],axis =1, inplace=True)

X_train_ = train.loc[X_train.index]
X_test_ = train.loc[X_test.index]

Y_train_ = label.loc[X_train.index]
Y_test_ = label.loc[X_test.index]

X_test_ = X_test_[X_train_.columns]

pd.set_option('display.max_columns',50)
train.head()

	Fare	Sex_0	Sex_1	Name_Title_1	Name_Title_2	Name_Title_5	Name_len	Ticket_First_Letter	Cabin_First_Letter_1	Cabin_First_Letter_3	Embarked_C	Embarked_S	Fam_Size_Nuclear	Fam_Size_Solo	Pclass_1	Pclass_3	Middle_Age	Fare_2_3	Fare_3_4	Fare_4_5
0	2.110213	0	1	1	0	0	23	9	1	0	0	1	1	0	0	1	1	1	0	0
1	4.280593	1	0	0	1	0	51	6	0	1	1	0	1	0	1	0	0	0	0	1
2	2.188856	1	0	0	0	1	22	5	1	0	0	1	0	1	0	1	0	1	0	0
3	3.990834	1	0	0	1	0	44	1	0	1	0	1	1	0	1	0	0	0	1	0
4	2.202765	0	1	1	0	0	24	2	1	0	0	1	0	1	0	1	0	1	0	0

test = test[train.columns]

rf_ = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
#                              max_depth=5,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',  
                             random_state=10,
                             n_jobs=-1) 
rf_.fit(X_train_,Y_train_) 
rf_.score(X_test_,Y_test_)

0.7910447761194029

rf_.fit(train,label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=16,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

pd.concat((pd.DataFrame(train.columns, columns = ['variable']), 
           pd.DataFrame(rf_.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

	variable	importance
1	Sex_0	0.136334
3	Name_Title_1	0.125036
2	Sex_1	0.118254
0	Fare	0.096483
7	Name_len	0.089186
6	Name_Title_5	0.055360
22	Pclass_3	0.050127
8	Ticket_First_Letter	0.045200
9	Cabin_First_Letter_1	0.034312
17	Fam_Size_Big	0.033951
4	Name_Title_2	0.033745
20	Pclass_1	0.022517
18	Fam_Size_Nuclear	0.021219
21	Pclass_2	0.015824
23	Small_Age	0.014996
27	Fare_2_3	0.013717
16	Embarked_S	0.012581
19	Fam_Size_Solo	0.011034
29	Fare_4_5	0.010546
14	Embarked_C	0.008645

excl("ls titanic_datas")

['gender_submission.csvn', 'test.csvn', 'train.csvn']

submit = pd.read_csv('./titanic_datas/gender_submission.csv')
submit.set_index('PassengerId',inplace=True)

res_rf = rf_.predict(test)
submit['Survived'] = res_rf
submit['Survived'] = submit['Survived'].apply(int)
submit.to_csv('./titanic_datas/submit.csv')

excl("ls titanic_datas")

['gender_submission.csvn', 'submit.csvn', 'test.csvn', 'train.csvn']

提交结果

Your Best Entry 
Your submission scored 0.81339