forked from ghmagazine/kagglebook
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathch06-04-filter.py
81 lines (66 loc) · 2.64 KB
/
ch06-04-filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# ---------------------------------
# Prepare the data etc.
# ----------------------------------
import numpy as np
import pandas as pd
# train_x is the training data, train_y is the target values, and test_x is the test data
# stored in pandas DataFrames and Series (numpy arrays also used)
train = pd.read_csv('../input/sample-data/train_preprocessed_onehot.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
test_x = pd.read_csv('../input/sample-data/test_preprocessed_onehot.csv')
# ---------------------------------
# Use argsort() to do index sort
# ---------------------------------
# Arrays can be ordered using index sort into ascending and descending order with argsort()
ary = np.array([10, 20, 30, 0])
idx = ary.argsort()
print(idx) # Ascending order - [3 0 1 2]
print(idx[::-1]) # Descending order - [2 1 0 3]
print(ary[idx[::-1][:3]]) # Output best three - [30, 20, 10]
# ---------------------------------
# Correlation coefficient
# ---------------------------------
import scipy.stats as st
# Correlation coefficient
corrs = []
for c in train_x.columns:
corr = np.corrcoef(train_x[c], train_y)[0, 1]
corrs.append(corr)
corrs = np.array(corrs)
# Spearman's rank correlation coefficient
corrs_sp = []
for c in train_x.columns:
corr_sp = st.spearmanr(train_x[c], train_y).correlation
corrs_sp.append(corr_sp)
corrs_sp = np.array(corrs_sp)
# Output in order to top importance (maximum of top 5)
# Using np.argsort(), you can get the indices of the ordered values
idx = np.argsort(np.abs(corrs))[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)
idx2 = np.argsort(np.abs(corrs_sp))[::-1]
top_cols2, top_importances2 = train_x.columns.values[idx][:5], corrs_sp[idx][:5]
print(top_cols2, top_importances2)
# ---------------------------------
# Chi-square statistic
# ---------------------------------
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
# Chi-square statistic
x = MinMaxScaler().fit_transform(train_x)
c2, _ = chi2(x, train_y)
# Output in order to top importance (maximum of top 5)
idx = np.argsort(c2)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)
# ---------------------------------
# Mutual information
# ---------------------------------
from sklearn.feature_selection import mutual_info_classif
# Mutual information
mi = mutual_info_classif(train_x, train_y)
# Output in order to top importance (maximum of top 5)
idx = np.argsort(mi)[::-1]
top_cols, top_importances = train_x.columns.values[idx][:5], corrs[idx][:5]
print(top_cols, top_importances)