ch06-05-embedded.py (forked from ghmagazine/kagglebook)
import numpy as np
import pandas as pd
# ---------------------------------
# Random forest feature importance
# ---------------------------------
# train_x is the training data, train_y is the target values
# Random forest cannot handle missing values, so load data in which missing values have already been imputed
train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
# ---------------------------------
from sklearn.ensemble import RandomForestClassifier
# Random forest
clf = RandomForestClassifier(n_estimators=10, random_state=71)
clf.fit(train_x, train_y)
fi = clf.feature_importances_
# Print the top features in descending order of importance
idx = np.argsort(fi)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], fi[idx][:5]
print('random forest importance')
print(top_cols, top_importances)
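# ---------------------------------
# A minimal sketch, not part of the original script: the lines above show
# only the top five features, but a pandas Series over the same
# feature_importances_ array makes it easy to inspect every feature at once.
importances = pd.Series(fi, index=train_x.columns).sort_values(ascending=False)
print(importances.head(5))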
# ---------------------------------
# xgboost feature importance
# ---------------------------------
# train_x is the training data, train_y is the target values
train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
# ---------------------------------
import xgboost as xgb
# xgboost
dtrain = xgb.DMatrix(train_x, label=train_y)
params = {'objective': 'binary:logistic', 'verbosity': 0, 'seed': 71}
num_round = 50
model = xgb.train(params, dtrain, num_round)
# Print the top features in descending order of importance
fscore = model.get_score(importance_type='total_gain')
fscore = sorted(fscore.items(), key=lambda tpl: tpl[1], reverse=True)
print('xgboost importance')
print(fscore[:5])
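# ---------------------------------
# A minimal sketch, not part of the original script: get_score also accepts
# other importance types ('weight' counts how often a feature is used to
# split, 'gain' averages the gain per split), and the resulting rankings
# can differ from the 'total_gain' ranking above.
for imp_type in ['gain', 'weight']:
    scores = model.get_score(importance_type=imp_type)
    top5 = sorted(scores.items(), key=lambda tpl: tpl[1], reverse=True)[:5]
    print(f'xgboost importance ({imp_type})')
    print(top5)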