# django内置加密, 以及验证
from django.contrib.auth.hashers import make_password, check_password
from poster.models import User, Qsbk
import json

# 分页

# 类视图
from django.views.generic import ListView, DetailView, TemplateView


# 导入爬虫的库开始
import urllib, urllib2
from bs4 import BeautifulSoup
import os
import threading

# Create your views here.

#用于注册的form表单
@@ -200,3 +207,113 @@ def api(request):
200207 return_php_data = {'name' :'lmm' ,'age' :'34' ,'height' :'124' ,'info' :'请求成功' }
201208 data = json .dumps (return_php_data ).encode ('utf-8' )
202209 return HttpResponse (data , content_type = "application/json" )
210+
211+
212+
213+ # ------------------------------------爬虫代码开始-------------------------------------
214+ # 写入文件
def write_to_file(content, filename):
    """Append *content* to *filename*, encoded as UTF-8.

    content  -- text (unicode) to append.
    filename -- path of the target file; created if missing.

    BUGFIX: the file is now opened in binary append mode ('ab').  The
    original opened it in text mode 'a' while writing already-encoded
    bytes; that happens to work on Python 2 but raises TypeError on
    Python 3 and is wrong in principle (encoded bytes belong in a
    binary stream).
    """
    with open(filename, 'ab') as f:
        f.write(content.encode('utf-8'))
218+
219+
220+ # 获取图片保存到本地
def get_img(url):
    """Download the image at *url* and save it under ``../media/qsbk/``.

    url -- scheme-relative URL (starts with '//', as scraped from the
           page's ``src`` attribute) -- TODO confirm against caller.

    The file is named after the last path component of the URL.
    """
    apath = os.path.join(os.path.abspath('..'), 'media')
    img_dir = apath + '/qsbk'

    # Create the target directory on first use.
    if not os.path.isdir(img_dir):
        os.mkdir(img_dir)

    # Spoof a browser User-Agent so the site serves the image.
    user_agent = 'User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request('http:' + url, None, headers)
    u_handle = urllib2.urlopen(request)
    try:
        data = u_handle.read()
    finally:
        # BUGFIX: the response was previously never closed, leaking one
        # socket per downloaded image.
        u_handle.close()

    # Derive the local file name from the tail of the URL.
    img_name = os.path.basename(urllib.url2pathname(url))
    with open(img_dir + '/' + img_name, 'wb') as f:
        f.write(data)
240+
241+
242+ # 简易爬取糗事百科的段子,图片地址,点赞数,评论数
# Scrape jokes, image URLs, vote and comment counts from qiushibaike
def test(page):
    """Fetch one listing page of qiushibaike.com/imgrank, store every item
    in the Qsbk table and download its image via get_img().

    page -- 1-based page number appended to the listing URL.

    Intended to run as a threading.Thread target (see pachong()).
    Network or HTTP failures are reported and swallowed so one bad page
    does not kill the whole scrape.
    """
    myname = threading.current_thread().name
    print('thread: %s is start' % myname)

    # url = "https://www.qiushibaike.com/text/page/" + str(page)
    url = "https://www.qiushibaike.com/imgrank/page/" + str(page)
    try:
        # Spoof a browser User-Agent -- the site rejects the default one.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, None, headers)

        # Send the request and read the raw HTML.
        u_handle = urllib2.urlopen(request)
        html_str = u_handle.read()

        # BUGFIX: name the parser explicitly; a bare BeautifulSoup(html)
        # picks whichever parser is installed (results can differ) and
        # warns on recent bs4 versions.
        soup = BeautifulSoup(html_str, 'html.parser')

        # One <div> per joke inside the listing container.
        divs = soup.select('div[id="content-left"] > div ')
        for x in divs:

            content = x.select('div[class="content"]')[0].get_text().strip()
            img_url = x.select('div[class="thumb"]')[0].a.img.get('src')
            # NOTE(review): vote_num / comment_num are scraped but never
            # stored -- Qsbk.objects.create below has no matching fields;
            # confirm whether the model should persist them.
            vote_num = x.select('div[class="stats"]')[0].select('span[class="stats-vote"]')[0].get_text().strip()
            comment_num = x.select('div[class="stats"]')[0].select('span[class="stats-comments"]')[0].a.get_text().strip()

            # write_to_file(content+'\n','qiushibaike.txt')
            # write_to_file(img_url+'\n\n','qiushibaike.txt')
            # BUGFIX: use the same 'http:' prefix as get_img(); the old
            # 'http' + img_url built an invalid scheme (harmless for
            # basename(), but inconsistent and misleading).
            pic_name = os.path.basename(urllib.url2pathname('http:' + img_url))
            Qsbk.objects.create(content=content, pic_name=pic_name, pic_url=img_url)
            get_img(img_url)

    # BUGFIX: modern 'as' syntax, and print the HTTPError itself --
    # HTTPError lacks a .reason attribute on older Python 2 releases,
    # so the original handler could itself raise AttributeError.
    except urllib2.HTTPError as e:
        print(e)
    except urllib2.URLError as e:
        print(e.reason)
    else:
        print('the page:' + str(page) + 'is ok')
285+
def pachong(request):
    """Django view: scrape listing pages 1-4 concurrently, one thread
    per page, then report completion.

    request -- the incoming HttpRequest (unused beyond being a view).

    BUGFIX: the view previously returned None, which makes Django raise
    "The view ... didn't return an HttpResponse object"; it now returns
    a plain-text response once all threads have joined.
    """
    threads = []
    for page in range(1, 5):

        # One thread per page; test() does the actual scraping.
        t = threading.Thread(target=test, args=(page,))
        threads.append(t)
        # Single-process fallback, one page after another (slow):
        # test(page)
    for i in threads:
        i.start()

    for i in threads:
        i.join()
    print('All is Done')
    return HttpResponse('All is Done')
301+
302+
def qsbk_list(request):
    """Django view: render a paginated listing of all Qsbk rows.

    Reads the 1-based page number from the ``page`` query parameter.
    A non-integer value falls back to the first page; an out-of-range
    value falls back to the last page.
    """
    all_items = Qsbk.objects.all()
    paginator = Paginator(all_items, 3)  # three items per page

    requested_page = request.GET.get('page')
    try:
        contacts = paginator.page(requested_page)
    except PageNotAnInteger:
        # Non-numeric page parameter: serve the first page.
        contacts = paginator.page(1)
    except EmptyPage:
        # Page number past the end: serve the last page instead.
        contacts = paginator.page(paginator.num_pages)

    return render_to_response('qsbk_list.html', {"contacts": contacts})
319+ # ------------------------------------爬虫代码结束-------------------------------------