@@ -34,6 +34,7 @@ def visit_page(url):
3434 """
3535 访问页面
3636 """
37+ header ['Host' ] = 'sh.ganji.com'
3738 response = s .get (url , headers = header )
3839 return response .text
3940
@@ -73,6 +74,9 @@ def get_info(file_html):
7374 key = item_list [0 ]
7475 value = item_list [1 ]
7576 item_dict [key ] = value
77+ # 获取公司联系方式
78+ contact_dict = get_contact (item_dict [u'公司编号' ])
79+ item_dict = dict (item_dict , ** contact_dict )
7680 yield item_dict
7781 print '单页共 %s 条记录' % len (tr_list )
7882
@@ -112,10 +116,43 @@ def get_info_5(file_html):
112116 key = item_list [0 ]
113117 value = item_list [1 ]
114118 item_dict [key ] = value
119+ # 获取公司联系方式
120+ contact_dict = get_contact (item_dict [u'公司编号' ])
121+ item_dict = dict (item_dict , ** contact_dict )
115122 yield item_dict
116123 print '单页共 %s 条记录' % len (tr_list )
117124
118125
def _contact_field_value(line):
    """
    Return the value part of a ``label: value`` text line.

    :param line: text of one contact paragraph, or None when it is missing
    :return: the text after the first ``': '``; an empty string when the
        line is empty/None or has no ``': '`` separator
    """
    if not line:
        return ''
    # Split on the first separator only, so a value that itself contains
    # ': ' is kept whole instead of being discarded.
    _label, separator, value = line.partition(': ')
    return value if separator else ''


def get_contact(cid):
    """
    Fetch a company's contact person and phone number from its wap page.

    :param cid: company id, interpolated into the wap company URL
    :return: dict with the keys u'联系人' (contact person) and u'联系电话'
        (contact phone); a value is an empty string when it cannot be parsed
    """
    wap_url = 'http://wap.ganji.com/gongsi/%s/?domain=sh' % str(cid)
    # Use a per-request copy of the shared module-level ``header`` so the wap
    # Host value does not leak into requests made afterwards.
    wap_header = dict(header, Host='wap.ganji.com')
    response = s.get(wap_url, headers=wap_header)
    wap_pq = Pq(response.content)
    # The element at index 1 of '.detail-describe' is read as the contact
    # block: its first <p> is the name line, its second <p> the phone line.
    paragraphs = wap_pq('.detail-describe').eq(1).find('p')
    name = _contact_field_value(paragraphs.eq(0).text())
    phone = _contact_field_value(paragraphs.eq(1).text())
    # print contact_dict
    return {
        u'联系人': name,
        # strip() treats its argument as a set of characters; this trims the
        # "[拨打]" label and spaces around the number. The literal is unicode
        # so Python 2 does not have to decode a non-ASCII byte string.
        u'联系电话': phone.strip(u' [拨打]'),
    }
154+
155+
119156def write_csv_head ():
120157 """
121158 创建csv文件标题
@@ -128,7 +165,7 @@ def write_csv_head():
128165 csv_file_name = file_path + 'ganji.csv'
129166 csv_file = file (csv_file_name , 'w' )
130167 writer = csv .writer (csv_file )
131- writer .writerow (['公司编号' , '公司名称' , '公司链接' , '职位名称' , '职位链接' , '薪资待遇' , '工作地点' , '工作经验' , '最低学历' , '招聘人数' , '公司规模' ])
168+ writer .writerow (['公司编号' , '公司名称' , '联系人' , '联系电话' , ' 公司链接' , '职位名称' , '职位链接' , '薪资待遇' , '工作地点' , '工作经验' , '最低学历' , '招聘人数' , '公司规模' ])
132169 csv_file .close ()
133170
134171
@@ -147,6 +184,8 @@ def save_csv(item_dict):
147184 item_tuple = (
148185 item_dict [u'公司编号' ],
149186 item_dict [u'公司名称' ],
187+ item_dict [u'联系人' ],
188+ item_dict [u'联系电话' ],
150189 'http://www.ganji.com/gongsi/%s/' % str (item_dict [u'公司编号' ]),
151190 item_dict [u'职位名称' ],
152191 item_dict [u'职位链接' ],
@@ -165,6 +204,7 @@ def fuck(max_page_num=10):
165204 """
166205 主程序,获取max_page_num个页面的数据,并写入csv文件
167206 """
207+ start_time = time .time ()
168208 write_csv_head ()
169209 for i in xrange (max_page_num ):
170210 if i > 0 :
@@ -182,6 +222,7 @@ def fuck(max_page_num=10):
182222 for item in get_info_5 (html_text ):
183223 save_csv (item )
184224 # time.sleep(8)
225+ print '程序耗时:%sS' % (time .time () - start_time )
185226
186227
187228if __name__ == "__main__" :
@@ -213,4 +254,14 @@ def fuck(max_page_num=10):
213254 <span class="phone-contact"><a href="/wapim/getMsgs/?userId=64219650">给他留言</a></span>
214255 </p>
215256</div>
257+
258+ 抓取文件统计
259+
260+ 程序耗时:486.772469044S
261+
262+ zhanghe@ubuntu:~/code/python$ du -h static/csv/ganji.csv
263+ 720K static/csv/ganji.csv
264+ zhanghe@ubuntu:~/code/python$ wc -l static/csv/ganji.csv
265+ 2965 static/csv/ganji.csv
266+
216267"""
0 commit comments