-
Notifications
You must be signed in to change notification settings - Fork 84
/
clean_b.py
91 lines (72 loc) · 2.62 KB
/
clean_b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from tool import *
# Directory the raw train/test CSV files are read from.
data_path = './input/'

# current_service id -> class label.
d = {
    89950166: 1,
    89950167: 2,
    89950168: 5,
    90063345: 0,
    90109916: 4,
    90155946: 8,
    99999825: 10,
    99999826: 7,
    99999827: 6,
    99999828: 3,
    99999830: 9,
}

# Class label -> current_service id: the inverse of ``d``, keyed in ascending
# label order.
rd = {label: service for service, label in sorted(d.items(), key=lambda kv: kv[1])}
def astype(x, t):
    """Convert ``x`` with the callable ``t``, falling back to NaN on failure.

    Args:
        x: Value to convert.
        t: Callable applied to ``x`` (``deal`` passes ``int`` and ``float``).

    Returns:
        ``t(x)``, or ``np.nan`` when the call raises an ``Exception``.
    """
    try:
        return t(x)
    except Exception:
        # Best-effort conversion: any ordinary failure becomes a missing value.
        # ``except Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate instead of being turned
        # into NaN.
        return np.nan
def have_0(x):
    """Return 0 if the text after the first '.' in ``x`` ends with '0', else 1.

    Only the segment between the first and second '.' is inspected
    (``x.split('.')[1]``).

    Args:
        x: Raw value, normally a string such as ``'12.30'``.

    Returns:
        0 when the last character of that segment is ``'0'``; 1 otherwise,
        including when ``x`` has no ``split`` method (e.g. a float NaN),
        contains no '.', or has nothing after the '.'.
    """
    try:
        last_char = x.split('.')[1][-1]
    except Exception:
        # Anything that cannot be split/indexed counts as "no trailing zero".
        # ``except Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        return 1
    return 0 if last_char == '0' else 1
# Columns that read_csv must load as raw strings (dtype 'str') rather than
# letting pandas infer a numeric type.
str_dict = dict.fromkeys(
    [
        '1_total_fee',
        '2_total_fee',
        '3_total_fee',
        '4_total_fee',
        'last_month_traffic',
        'local_caller_time',
        'local_trafffic_month',
        'month_traffic',
        'pay_num',
        'service1_caller_time',
        'service2_caller_time',
    ],
    'str',
)
# Load both splits with the same dtype overrides (train first, then test).
train, test = (
    pd.read_csv(data_path + csv_name, dtype=str_dict)
    for csv_name in ('train.csv', 'test.csv')
)
# Map each current_service id to its class label via ``d``; ids that are not
# keys of ``d`` become NaN.
train['label'] = train['current_service'].map(d)
# Columns that get a ``have_0_<column>`` flag in ``deal``; the order here is
# the order in which those flag columns are appended.
have_0_c = [
    # fee columns
    '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
    # traffic columns
    'month_traffic', 'last_month_traffic', 'local_trafffic_month',
    # caller-time columns
    'local_caller_time', 'service1_caller_time', 'service2_caller_time',
    'pay_num',
]
def deal(data):
    """Clean the feature columns of ``data`` in place and return it.

    Steps, in order:
      1. For every column in ``have_0_c`` add a ``have_0_<column>`` flag
         computed by ``have_0`` on the raw value, then try to cast the
         column to float.
      2. Convert ``2_total_fee`` / ``3_total_fee`` (float) and ``age`` /
         ``gender`` (int) element-wise with ``astype``; values that fail
         become NaN.
      3. Replace ``age == 0`` and negative ``*_total_fee`` values with NaN.
      4. Round the listed numeric columns to 4 decimals.

    Args:
        data: DataFrame holding the columns referenced above.

    Returns:
        The same DataFrame object, modified in place.
    """
    for c in have_0_c:
        # The flag must be computed before the numeric cast, while the
        # column still holds its raw values.
        data['have_0_{}'.format(c)] = data[c].apply(have_0)
        try:
            data[c] = data[c].astype(float)
        except (ValueError, TypeError):
            # Some value in this column does not parse as a float; leave the
            # column unchanged here. NOTE(review): presumably this is why
            # 2_total_fee / 3_total_fee get an element-wise conversion
            # below -- confirm against the raw data.
            pass

    # Element-wise conversion: unparseable entries become NaN instead of
    # failing the whole column.
    data['2_total_fee'] = data['2_total_fee'].apply(lambda x: astype(x, float))
    data['3_total_fee'] = data['3_total_fee'].apply(lambda x: astype(x, float))
    data['age'] = data['age'].apply(lambda x: astype(x, int))
    data['gender'] = data['gender'].apply(lambda x: astype(x, int))

    # Treat zero age and negative fees as missing.
    data.loc[data['age'] == 0, 'age'] = np.nan
    data.loc[data['1_total_fee'] < 0, '1_total_fee'] = np.nan
    data.loc[data['2_total_fee'] < 0, '2_total_fee'] = np.nan
    data.loc[data['3_total_fee'] < 0, '3_total_fee'] = np.nan
    data.loc[data['4_total_fee'] < 0, '4_total_fee'] = np.nan

    for c in [
        '1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
        'month_traffic', 'last_month_traffic', 'local_trafffic_month',
        'local_caller_time', 'service1_caller_time', 'service2_caller_time',
        'many_over_bill', 'contract_type', 'contract_time', 'pay_num',
    ]:
        data[c] = data[c].round(4)
    return data
# Clean both splits with the same pipeline.
train = deal(train)
test = deal(test)

# Drop training rows whose current_service is 999999.
train = train.loc[train['current_service'] != 999999]

# From here on ``data_path`` is the output directory for the cleaned files.
data_path = 'data/b/'
train.to_csv(data_path + 'train_new.csv', index=False)
test.to_csv(data_path + 'test_new.csv', index=False)
# Disabled completion message: print('preprocessing done')