Skip to content

Commit 273b808

Browse files
committed
新增基于协程的淘宝关键词抓取
1 parent ed11feb commit 273b808

2 files changed

Lines changed: 2245 additions & 2104 deletions

File tree

fuck/taobao_gevent.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# encoding: utf-8
__author__ = 'zhanghe'


# gevent monkey-patching MUST run before importing any socket-using module
# (requests/urllib3 capture references to the stdlib socket at import time),
# otherwise the greenlets block on real sockets and run serially.
from gevent import monkey
monkey.patch_all()

import gevent
import requests
import time
import json
import os
import tools.html


# Default request headers; a browser User-Agent avoids trivial bot blocking.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
}

# One shared session so HTTP keep-alive connections are reused across requests.
s = requests.session()
keywords_list = ['衣服']  # crawl frontier: keywords still to visit
keywords_visited_list = []  # keywords already visited
22+
def get_keywords_list(keywords):
    """
    Fetch Taobao search suggestions for *keywords* and grow the crawl frontier.

    Side effects on module globals:
      - appends every new, unseen suggestion to ``keywords_list``
      - moves *keywords* from ``keywords_list`` to ``keywords_visited_list``

    :param keywords: keyword string to query suggestions for
    :return: None
    """
    url = 'https://suggest.taobao.com/sug'
    payload = {
        'code': 'utf-8',
        'q': keywords,
        # millisecond epoch timestamp + fixed suffix, mimicking the site's
        # JSONP cache-busting token
        '_ksTS': str(int(1000 * (time.time()))) + '_2550',
        'k': '1',
        'area': 'c2c',
        'bucketid': '19',
    }
    # Use a per-call copy of the shared headers: mutating the module-level
    # ``header`` dict from concurrently running greenlets is a data race.
    request_headers = dict(header)
    request_headers['Host'] = 'suggest.taobao.com'
    request_headers['Referer'] = 'https://top.taobao.com/index.php?spm=a1z5i.1.2.1.hUTg2J&topId=HOME'
    response = s.get(url, params=payload, headers=request_headers)
    key_result_list = json.loads(response.text)['result']
    for row in key_result_list:
        item = tools.html.strip_html(row[0])  # strip html tags
        # de-duplicate against both the frontier and the visited list
        if item is not None and item not in keywords_list and item not in keywords_visited_list:
            keywords_list.append(item)
    keywords_visited_list.append(keywords)
    keywords_list.remove(keywords)
53+
54+
55+
def save(result_list, file_name, file_path='../static/taobao/'):
    """
    Serialize *result_list* as pretty-printed JSON into ``file_path/file_name``.

    :param result_list: JSON-serializable data (list of keyword strings)
    :param file_name: target file name, e.g. ``'keywords_list.json'``
    :param file_path: target directory, created (with parents) if missing;
                      defaults to the project's static data directory, so
                      existing callers are unaffected
    :return: None
    """
    # makedirs (not mkdir) so a missing parent such as ``../static`` is
    # created too instead of raising OSError
    if not os.path.isdir(file_path):
        os.makedirs(file_path)
    filename = os.path.join(file_path, file_name)
    result_json = json.dumps(result_list, indent=4, ensure_ascii=False)
    # write bytes explicitly so the utf-8 encoding is the same on py2/py3
    with open(filename, 'wb') as f:
        f.write(result_json.encode('utf-8'))
69+
70+
71+
def time_statistics(start_time):
    """
    Print crawl progress: frontier size, visited count, and elapsed seconds.

    :param start_time: epoch seconds captured when the crawl started
    """
    print("待访问节点:%s" % len(keywords_list))
    print("已访问节点:%s" % len(keywords_visited_list))
    elapsed = time.time() - start_time
    print("耗时:%0.2f S" % elapsed)
    print('--------------')
80+
81+
82+
def fuck():
    """
    Crawler entry point.

    Round-based concurrent crawl: while the frontier is non-empty, spawn one
    greenlet per pending keyword, wait for the whole round with joinall, and
    print timing stats.  When the frontier drains, dump both keyword lists to
    JSON files and print them.
    """
    start_time = time.time()
    while len(keywords_list) > 0:
        # sequential (pre-gevent) version, kept for reference:
        # get_keywords_list(keywords_list.pop(0))
        # one greenlet per pending keyword; joinall blocks until the round
        # completes (each worker also appends newly discovered keywords,
        # which are picked up by the next round)
        threads = [gevent.spawn(get_keywords_list, i) for i in keywords_list]
        gevent.joinall(threads)
        # NOTE(review): indentation reconstructed from a flattened diff —
        # stats are assumed to print once per round; confirm against the
        # original file.
        time_statistics(start_time)
    save(keywords_list, 'keywords_list.json')
    save(keywords_visited_list, 'keywords_visited_list.json')
    print '程序结束,打印结果'
    print json.dumps(keywords_list, indent=4, ensure_ascii=False)
    print json.dumps(keywords_visited_list, indent=4, ensure_ascii=False)
97+
98+
99+
if __name__ == '__main__':
    # run the crawler only when executed as a script, not on import
    fuck()
101+
102+
103+
"""
104+
105+
使用协程前后耗时的对比:
106+
107+
待访问节点:0
108+
已访问节点:2493
109+
耗时:153.60 S
110+
--------------
111+
112+
--------------
113+
待访问节点:0
114+
已访问节点:2504
115+
耗时:130.58 S
116+
--------------
117+
118+
"""

0 commit comments

Comments
 (0)