import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras.backend import clear_session

from model import make_model
from preprocessing import *

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
def calculate_results(y_test, pred_test):
    """Print AUROC, AUPRC and a per-class classification report."""
    print("AUROC: ", roc_auc_score(y_test, pred_test))
    print("AUPRC: ", average_precision_score(y_test, pred_test))
    print(classification_report(y_test, pred_test.round()))
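# Hypothetical usage (the function is not called in this script): evaluate one
# fold's attrition head, given `y_test` and `pred_test` as produced below:
#   calculate_results(y_test[:, 0], pred_test[0].ravel())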

if __name__ == '__main__':
    # tensorflow.compat.v1.disable_v2_behavior()
    first_time = [True, True, True, True, True]  # one flag per CV fold (KFold default: 5 splits)
    observation_point_list = [30, 60, 90, 120, 180, 270]
    prediction_point_shift = 1.5
    for observation_point in observation_point_list:
        df = pd.read_csv("data/qlik_aug_final.csv", low_memory=False)
        prediction_point = observation_point * prediction_point_shift
        # df = exclude_inappropriate_patients(df, 0, observation_point - 60, prediction_point, 1)
        feature_df = filter_by_time_from_first_visit(df, 0, observation_point)
        label_df = filter_by_time_from_first_visit(df, 0, prediction_point)
        x_demo, x_temp, y = transform2seq(feature_df, label_df, df, prediction_point,
                                          list(range(0, observation_point + 1, 15)))

        # Normalization: scale demographic features to [0, 1]
        scaler = MinMaxScaler()
        x_demo = scaler.fit_transform(x_demo)
        # Min-max scale each temporal feature over all samples and time steps
        for feat in range(x_temp.shape[2]):
            feat_min = x_temp[:, :, feat].min()
            feat_max = x_temp[:, :, feat].max()
            feature_range = feat_max - feat_min
            if feature_range != 0:
                x_temp[:, :, feat] = (x_temp[:, :, feat] - feat_min) / feature_range
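        # Note: both scalers above are fit on the full dataset before the CV
        # split below, so train and test folds share normalization statistics.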
        # Count and print samples by their label; the 0/1 count ratios feed
        # the class weights below
        unique, counts = np.unique(y[:, 0], return_counts=True)
        dict_attrition = dict(zip(unique, counts))
        attrition_sample_ratio = dict_attrition[0] / dict_attrition[1]
        print(dict_attrition)
        unique, counts = np.unique(y[:, 1], return_counts=True)
        dict_outcome = dict(zip(unique, counts))
        outcome_sample_ratio = dict_outcome[0] / dict_outcome[1]
        print(dict_outcome)
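        # Outcome labels 2 and 3 get zero weight, which masks them out of the
        # loss; consistently, the evaluation further down only scores outcome
        # labels < 2. The positive outcome class is up-weighted by the
        # negative/positive sample ratio.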
        class_weights = {'outcome': {0: 1, 1: outcome_sample_ratio, 2: 0, 3: 0},
                         'attrition': {0: 1, 1: 1}}
        # Split into train/test folds
        k = 0
        for train_index, test_index in KFold(shuffle=True, random_state=73).split(x_demo):
            k = k + 1
            if k < 30:
                x_demo_train = x_demo[train_index, :]
                x_demo_test = x_demo[test_index, :]
                x_temp_train = x_temp[train_index, :, :]
                x_temp_test = x_temp[test_index, :, :]
                y_train = y[train_index]
                y_test = y[test_index]
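                # Two-stage training: first fit with tr_flag=True (presumably
                # the transfer/trainable switch in make_model), warm-starting
                # from this fold's checkpoint after the first observation
                # point, then rebuild with tr_flag=False and fine-tune from
                # the saved weights.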
                # Pretraining
                model = make_model(x_demo.shape[1], x_temp.shape[2], x_temp.shape[1], tr_flag=True)
                if not first_time[k - 1]:
                    model.load_weights('./weights/model_weights_fold' + str(k) + '.h5')
                first_time[k - 1] = False
                history = model.fit(x=[x_demo_train, x_temp_train], y=[y_train[:, 0], y_train[:, 1]],
                                    validation_data=([x_demo_test, x_temp_test], [y_test[:, 0], y_test[:, 1]]),
                                    epochs=50, verbose=0, batch_size=64,
                                    class_weight=class_weights)
                model.save_weights('./weights/model_weights_fold' + str(k) + '.h5')
                # Fine-tuning
                model = make_model(x_demo.shape[1], x_temp.shape[2], x_temp.shape[1], tr_flag=False)
                model.load_weights('./weights/model_weights_fold' + str(k) + '.h5')
                history = model.fit(x=[x_demo_train, x_temp_train], y=[y_train[:, 0], y_train[:, 1]],
                                    validation_data=([x_demo_test, x_temp_test], [y_test[:, 0], y_test[:, 1]]),
                                    epochs=10, verbose=0, batch_size=128,
                                    class_weight=class_weights)
                model.save_weights('./weights/model_weights_fold' + str(observation_point) + "----" + str(k) + 'finalllle.h5')
                # Evaluate on the held-out fold
                pred_test = model.predict([x_demo_test, x_temp_test])
                pred_test_binary_0 = pred_test[0] > 0.5
                pred_test_binary_1 = pred_test[1] > 0.5
                y_test_binary = y_test.astype(bool)
                # Assemble one CSV row: "<observation point> - <fold>", then
                # AUROC, AUPRC, precision, recall, F1 for attrition, followed
                # by the same five metrics for outcome
                out_str = str(observation_point) + " - " + str(k) + ","
                p, r, f, s = precision_recall_fscore_support(y_test_binary[:, 0], pred_test_binary_0, average='binary')
                out_str += str(roc_auc_score(y_test[:, 0], pred_test[0])) + ","  # AUROC
                out_str += str(average_precision_score(y_test[:, 0], pred_test[0])) + ","  # AUPRC
                out_str += str(p) + ","  # Precision
                out_str += str(r) + ","  # Recall
                out_str += str(f) + ","  # F1
                # Outcome metrics: keep only samples whose outcome label is
                # 0 or 1 (labels 2 and 3 were masked out of the loss)
                indexes = np.where(y_test[:, 1] < 2)[0]
                new_label = y_test[:, 1][indexes]
                new_label_binary = new_label.astype(bool)
                new_pred = pred_test[1][indexes]
                new_pred_binary = pred_test_binary_1[indexes]
                p, r, f, s = precision_recall_fscore_support(new_label_binary, new_pred_binary, average='binary')
                out_str += str(roc_auc_score(new_label, new_pred)) + ","  # AUROC
                out_str += str(average_precision_score(new_label, new_pred)) + ","  # AUPRC
                out_str += str(p) + ","  # Precision
                out_str += str(r) + ","  # Recall
                out_str += str(f)  # F1
                print(out_str)
                clear_session()