# encoding: utf-8
__author__ = 'zhanghe'

# Patch the standard library before importing anything that does network I/O,
# so that requests' blocking socket calls become cooperative under gevent.
from gevent import monkey
monkey.patch_all()

import requests
import time
import json
import os
import gevent
import tools.html

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
}

s = requests.session()
keywords_list = ['衣服']  # keywords the crawler still has to visit (seeded with "clothes")
keywords_visited_list = []  # keywords the crawler has already visited


def get_keywords_list(keywords):
    """
    Fetch suggestion keywords for the given keyword and merge new ones into
    the pending list.
    :param keywords: keyword to query the Taobao suggest API with
    :return:
    """
    # print keywords
    url = 'https://suggest.taobao.com/sug'
    payload = {
        'code': 'utf-8',
        'q': keywords,
        '_ksTS': str(int(1000 * time.time())) + '_2550',
        # 'callback': 'jsonp2551',
        'k': '1',
        'area': 'c2c',
        'bucketid': '19',
    }
    header['Host'] = 'suggest.taobao.com'
    header['Referer'] = 'https://top.taobao.com/index.php?spm=a1z5i.1.2.1.hUTg2J&topId=HOME'
    response = s.get(url, params=payload, headers=header)
    # print response.url
    content = response.text
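    # The suggest API appears to return JSON shaped roughly like
    # {"result": [["suggested keyword", "match count"], ...]} -- an assumption
    # inferred from the parsing below, not from official documentation.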
    key_result_list = json.loads(content)['result']
    # print 'newly discovered keywords'
    # print json.dumps(key_result_list, indent=4, ensure_ascii=False)
    for i in key_result_list:
        item = tools.html.strip_html(i[0])  # strip any HTML tags from the suggestion
        if item is not None and item not in keywords_list and item not in keywords_visited_list:  # de-duplicate
            keywords_list.append(item)
    keywords_visited_list.append(keywords)
    keywords_list.remove(keywords)


def save(result_list, file_name):
    """
    Save a result list as a JSON file.
    :param result_list: list to serialize
    :param file_name: file name under the output directory
    :return:
    """
    file_path = '../static/taobao/'
    if not os.path.isdir(file_path):
        os.makedirs(file_path)  # create intermediate directories as needed
    filename = file_path + file_name
    result_json = json.dumps(result_list, indent=4, ensure_ascii=False)
    with open(filename, 'wb') as f:
        f.write(result_json.encode('utf-8'))


def time_statistics(start_time):
    """
    Print crawl progress and elapsed time.
    """
    print "pending keywords: %s" % len(keywords_list)
    print "visited keywords: %s" % len(keywords_visited_list)
    end_time = time.time()
    print "elapsed: %0.2f s" % (end_time - start_time)
    print '--------------'


def fuck():
    """
    Crawler entry point: expands the keyword frontier until it is exhausted.
    """
    start_time = time.time()
    while len(keywords_list) > 0:
        # get_keywords_list(keywords_list.pop(0))
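        # Each pass crawls the current frontier concurrently: one greenlet per
        # pending keyword, all joined before the next pass starts.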
        threads = [gevent.spawn(get_keywords_list, i) for i in keywords_list]
        gevent.joinall(threads)
        time_statistics(start_time)
    save(keywords_list, 'keywords_list.json')
    save(keywords_visited_list, 'keywords_visited_list.json')
    print 'crawl finished, printing results'
    print json.dumps(keywords_list, indent=4, ensure_ascii=False)
    print json.dumps(keywords_visited_list, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    fuck()


103+ """
104+
105+ 使用协程前后耗时的对比:
106+
107+ 待访问节点:0
108+ 已访问节点:2493
109+ 耗时:153.60 S
110+ --------------
111+
112+ --------------
113+ 待访问节点:0
114+ 已访问节点:2504
115+ 耗时:130.58 S
116+ --------------
117+
118+ """