from sklearn import ensemble
from sklearn import preprocessing
import pandas as pd
# Load the training data from the working directory.
train = pd.read_csv('train.csv')

# Data preprocessing
# Replace missing ages with the mean of the observed ages.
train['Age'] = train['Age'].fillna(train['Age'].mean())

# Encode Sex as integer labels (LabelEncoder assigns codes in sorted order).
le_sex = preprocessing.LabelEncoder()
train['Sex'] = le_sex.fit_transform(train['Sex'])
print(train['Sex'].head())

# Encode Embarked as integer labels.
# NOTE(review): presumably Embarked contains missing values -- confirm.
# How NaN mixed with strings is ordered (or rejected) by LabelEncoder depends
# on the Python / scikit-learn version, so fill with an explicit placeholder
# first. The empty string sorts before every other label, so it gets code 0.
le_embarked = preprocessing.LabelEncoder()
train['Embarked'] = le_embarked.fit_transform(train['Embarked'].fillna(''))
print(train['Embarked'].head())
0 1 1 0 2 0 3 0 4 1 Name: Sex, dtype: int64 0 3 1 1 2 3 3 3 4 3 Name: Embarked, dtype: int64
# Target: the Survived column.
y = train.Survived

# Features: the numeric columns plus the label-encoded Sex and Embarked.
X = train[[
    'Age',
    'Sex',
    'Pclass',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]]

# Fit a random forest with the library's default hyper-parameters.
rf = ensemble.RandomForestClassifier()
rf.fit(X=X, y=y)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False, random_state=None, verbose=0)
# Predict on the same rows the model was fitted on.
py = rf.predict(X)

# Mean accuracy on (X, y). These are the data used for fitting, so this is a
# training-set score, not an estimate of performance on unseen data.
print(rf.score(X, y))
0.967452300786
# Cross-tabulate predictions (rows) against the actual labels (columns).
pd.crosstab(index=py, columns=y)
Survived | 0 | 1 |
---|---|---|
row_0 | ||
0 | 539 | 19 |
1 | 10 | 323 |
from sklearn.metrics import confusion_matrix

# confusion_matrix takes (y_true, y_pred) and puts the true labels on the
# rows. Transpose so rows are the predicted labels and columns the actual ones.
confusion_matrix(y, py).T
array([[539, 19], [ 10, 323]])
from sklearn.metrics import accuracy_score

# Fraction of rows where the prediction equals the actual label.
accuracy_score(y_true=y, y_pred=py)
0.96745230078563416