Skip to content

Commit 58518a2

Browse files
committed
添加爬虫代码
1 parent 902d623 commit 58518a2

13 files changed

Lines changed: 229 additions & 6 deletions

File tree

myproject/myproject/middleware/ware.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,11 @@ def process_request(self,request):
1616
print request.META['REMOTE_ADDR']
1717
if not request.user.is_authenticated():
1818
return HttpResponseRedirect('/accounts/login/')
19-
if request.method == 'POST':
20-
if request.POST['username'] == 'admin':
21-
pass
19+
20+
# 导致后台无法添加内容
21+
# if request.method == 'POST':
22+
# if request.POST['username'] == 'admin':
23+
# pass
2224
# return http.HttpResponseForbidden(u'<h1>测试中间件 '+request.POST['username']+' is con\'t login Forbidden</h1>')
2325

2426
# 获取客户端IP
-94 Bytes
Binary file not shown.

myproject/online/urls.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
url(r'^api/$',views.api,name="api"),
2121
url(r'^login/$',views.login,name="login_url"),
2222
url(r'^logout/$',views.logout,name="logout_url"),
23+
url(r'^pachong/$',views.pachong,name="pachong"),
24+
url(r'^qsbk/$',views.qsbk_list,name="qsbk_list"),
2325
# url(r'^upload/(?P<path>.*)', 'django.views.static.serve', {'document_root': '/home/lmm/Documents/gitworkspace/python/myproject/upload'}),
2426

2527
# url(r'^thankyou/$',views.thankyou,name="thank_you"),

myproject/online/urls.pyc

109 Bytes
Binary file not shown.

myproject/online/views.py

Lines changed: 118 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
# django内置加密, 以及验证
1111
from django.contrib.auth.hashers import make_password, check_password
12-
from poster.models import User
12+
from poster.models import User, Qsbk
1313
import json
1414

1515
# 分页
@@ -18,6 +18,13 @@
1818
# 类视图
1919
from django.views.generic import ListView,DetailView,TemplateView
2020

21+
22+
# 导入爬虫的库开始
23+
import urllib, urllib2
24+
from bs4 import BeautifulSoup
25+
import os
26+
import threading
27+
2128
# Create your views here.
2229

2330
#用于注册的form表单
@@ -200,3 +207,113 @@ def api(request):
200207
return_php_data = {'name':'lmm','age':'34','height':'124','info':'请求成功'}
201208
data = json.dumps(return_php_data).encode('utf-8')
202209
return HttpResponse(data, content_type="application/json")
210+
211+
212+
213+
# ------------------------------------爬虫代码开始-------------------------------------
214+
# Append text to a file.
def write_to_file(content, filename):
    """Append ``content`` to ``filename`` as UTF-8 encoded bytes.

    The text is encoded before it is written, so the file is opened in
    binary append mode: the encoded bytes reach the disk untouched (no
    platform newline translation) and the call does not depend on the
    interpreter accepting bytes on a text-mode handle.

    :param content: text to append; must provide ``encode('utf-8')``.
    :param filename: path of the file to append to (created if missing).
    """
    with open(filename, 'ab') as f:
        f.write(content.encode('utf-8'))
218+
219+
220+
# Download an image and save it locally.
def get_img(url):
    """Download the image at ``url`` into the local ``media/qsbk`` directory.

    :param url: image address. A protocol-relative address
        (``//host/path/name.jpg``) gets ``http:`` prepended; any other
        value is requested unchanged.
    :raises urllib2.URLError: (including ``HTTPError``) when the download
        fails; the caller is expected to handle it.
    """
    # NOTE: the target directory is resolved relative to the parent of the
    # current working directory, so it depends on where the process starts.
    img_dir = os.path.join(os.path.abspath('..'), 'media', 'qsbk')

    # Create the directory (and any missing parent) if needed. Several
    # crawler threads may reach this point at once, so "already exists" is
    # tolerated instead of checking first and creating afterwards.
    try:
        os.makedirs(img_dir)
    except OSError:
        if not os.path.isdir(img_dir):
            raise

    # The header value must not contain the "User-Agent:" field name itself.
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36')
    headers = {'User-Agent': user_agent}

    full_url = 'http:' + url if url.startswith('//') else url
    request = urllib2.Request(full_url, None, headers)
    u_handle = urllib2.urlopen(request)
    try:
        data = u_handle.read()
    finally:
        # Always release the connection, even if reading fails.
        u_handle.close()

    # Use the last path component of the URL as the local file name.
    img_name = os.path.basename(urllib.url2pathname(url))
    with open(os.path.join(img_dir, img_name), 'wb') as f:
        f.write(data)
240+
241+
242+
# Simple crawler for one page of qiushibaike image posts.
def test(page):
    """Crawl one ``imgrank`` listing page and store every post found.

    For each post the text and image URL are extracted, a ``Qsbk`` row is
    created and the image is downloaded through ``get_img``. Vote and
    comment counts are not extracted because nothing stores them.

    Posts that lack a text block or a thumbnail image are skipped so that
    one unexpected entry does not abort the rest of the page.

    :param page: page number appended to the listing URL.
    """
    myname = threading.current_thread().name
    print('thread: %s is start' % myname)

    # url = "https://www.qiushibaike.com/text/page/"+str(page)
    url = "https://www.qiushibaike.com/imgrank/page/" + str(page)
    try:
        # Build the request headers.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, None, headers)

        # Send the request and read the HTML, always closing the handle.
        u_handle = urllib2.urlopen(request)
        try:
            html_str = u_handle.read()
        finally:
            u_handle.close()

        # Build the document tree with an explicit parser so the result
        # does not depend on which optional parsers are installed.
        soup = BeautifulSoup(html_str, 'html.parser')

        # Each direct child <div> of #content-left is treated as one post.
        for x in soup.select('div[id="content-left"] > div '):
            content_nodes = x.select('div[class="content"]')
            thumb_nodes = x.select('div[class="thumb"]')
            if not content_nodes or not thumb_nodes:
                continue

            link = thumb_nodes[0].a
            img = link.img if link is not None else None
            img_url = img.get('src') if img is not None else None
            if not img_url:
                continue

            content = content_nodes[0].get_text().strip()

            # Same file-name derivation as get_img, so the stored name
            # matches the file written to disk.
            pic_name = os.path.basename(urllib.url2pathname(img_url))
            Qsbk.objects.create(content=content, pic_name=pic_name,
                                pic_url=img_url)
            get_img(img_url)

    except urllib2.URLError as e:
        # HTTPError is a subclass of URLError, so both are reported here.
        print(e.reason)
    else:
        print('the page:' + str(page) + 'is ok')
285+
286+
def pachong(request):
    """Crawl listing pages 1-4 concurrently, then report completion.

    One thread per page runs ``test``; the view waits for all of them
    before answering, so the request lasts as long as the slowest page.

    :param request: the incoming Django request (not inspected).
    :returns: an ``HttpResponse`` confirming the crawl finished. A Django
        view must return a response object; returning ``None`` is an error.
    """
    # One thread per page; fetching the pages one after another is slow.
    threads = [
        threading.Thread(target=test, args=(page,))
        for page in range(1, 5)
    ]

    for t in threads:
        t.start()

    for t in threads:
        t.join()

    print('All is Done')
    return HttpResponse('All is Done')
301+
302+
303+
def qsbk_list(request):
    """Render a paginated list of the stored ``Qsbk`` records.

    The page number is read from the ``page`` query parameter. A missing or
    non-integer value shows the first page; an out-of-range value shows the
    last page.

    :param request: the incoming Django request.
    :returns: the rendered ``qsbk_list.html`` template with the current
        page object available as ``contacts``.
    """
    # Paginate over an explicitly ordered queryset: without an ordering the
    # database may return rows in a different order per query, so records
    # could repeat or go missing between pages.
    contact_list = Qsbk.objects.order_by('id')
    paginator = Paginator(contact_list, 3)  # Show 3 records per page

    page = request.GET.get('page')
    try:
        contacts = paginator.page(page)
    except PageNotAnInteger:
        # If page is not an integer, deliver first page.
        contacts = paginator.page(1)
    except EmptyPage:
        # If page is out of range (e.g. 9999), deliver last page of results.
        contacts = paginator.page(paginator.num_pages)

    # Return the rendered page.
    return render_to_response('qsbk_list.html', {"contacts": contacts})
319+
# ------------------------------------爬虫代码结束-------------------------------------

myproject/online/views.pyc

3.51 KB
Binary file not shown.

myproject/poster/admin.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,9 @@ class CommentAdmin(admin.ModelAdmin):
2222
class UserAdmin(admin.ModelAdmin):
2323
list_display = ('id','username','password','headImg')
2424
search_fields = ['username']
25-
admin.site.register(User,UserAdmin)
25+
admin.site.register(User,UserAdmin)
26+
27+
@admin.register(Qsbk)
class QsbkAdmin(admin.ModelAdmin):
    """Admin options for ``Qsbk`` records, registered on the default site."""

    # Columns shown in the admin change list.
    list_display = ('id', 'content', 'pic_name', 'created_at')

myproject/poster/admin.pyc

337 Bytes
Binary file not shown.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# -*- coding: utf-8 -*-
2+
# Generated by Django 1.11.6 on 2017-10-20 21:35
3+
from __future__ import unicode_literals
4+
5+
from django.db import migrations, models
6+
7+
8+
class Migration(migrations.Migration):
    """Create the ``Qsbk`` model's table in the ``poster`` app."""

    # Must run after the migration that added the user head image.
    dependencies = [('poster', '0004_user_headimg')]

    operations = [
        migrations.CreateModel(
            name='Qsbk',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                # Label reads "content".
                (
                    'content',
                    models.TextField(verbose_name='\u5185\u5bb9'),
                ),
                # Label reads "picture name".
                (
                    'pic_name',
                    models.CharField(
                        max_length=50,
                        verbose_name='\u56fe\u7247\u540d\u79f0',
                    ),
                ),
                # NOTE(review): this label is identical to pic_name's
                # ("picture name"); presumably a copy-paste slip. Changing it
                # belongs in the model plus a new migration, not here.
                (
                    'pic_url',
                    models.CharField(
                        max_length=300,
                        verbose_name='\u56fe\u7247\u540d\u79f0',
                    ),
                ),
                # Label reads "creation time".
                (
                    'created_at',
                    models.DateTimeField(
                        auto_now_add=True,
                        verbose_name='\u521b\u5efa\u65f6\u95f4',
                    ),
                ),
            ],
        ),
    ]
1.16 KB
Binary file not shown.

0 commit comments

Comments
 (0)