Skip to content

Commit 8b67cee

Browse files
committed
爬虫
1 parent 1393e78 commit 8b67cee

2 files changed

Lines changed: 36 additions & 8 deletions

File tree

beautifulsoup_test.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,10 +20,15 @@
2020

2121
# soup = BeautifulSoup(html)
2222
soup = BeautifulSoup(html)
23-
23+
print soup.name
2424
print type(soup.head)
2525
print soup.head.get_text()
26+
print type(soup.head.get_text())
27+
print '----------------'
2628
print soup.head.string
29+
print type(soup.head.string)
30+
print soup.head.title.string
31+
print type(soup.head.title.string)
2732
# a_list = soup.select('a')
2833
# for a in a_list:
2934
# print a.get_text()

pachong1.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55

66
import urllib, urllib2
7-
from lxml import etree
7+
#from lxml import etree
88
page = 1
99
url = "https://www.qiushibaike.com/hot/page/"+str(page)
1010

@@ -15,11 +15,11 @@
1515
u_handle = urllib2.urlopen(request)
1616
ret = u_handle.read()
1717

18-
html = etree.HTML(ret)
19-
nodes = html.xpath('//div[@id="content-left"]/div')
20-
for node in nodes:
21-
print node.xpath('a/div/span/text()')
22-
print len(nodes)
18+
#html = etree.HTML(ret)
19+
#nodes = html.xpath('//div[@id="content-left"]/div')
20+
#for node in nodes:
21+
# print node.xpath('a/div/span/text()')
22+
#print len(nodes)
2323
except urllib2.HTTPError, e:
2424
print 'http请求 错误'
2525
print e.reason
@@ -31,4 +31,27 @@
3131
print 'no '
3232
else:
3333
print 'ok'
34-
# print ret
34+
# print ret
35+
36+
37+
38+
39+
if __name__ == '__main__':
40+
import urllib, urllib2
41+
from bs4 import BeautifulSoup
42+
page = 1
43+
url = "https://www.qiushibaike.com/text/page/"+str(page)
44+
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
45+
headers = {'User-Agent':user_agent}
46+
request = urllib2.Request(url,None,headers)
47+
u_handle = urllib2.urlopen(request)
48+
html_str = u_handle.read()
49+
soup = BeautifulSoup(html_str)
50+
divs = soup.select('div[id="content-left"] > div ')
51+
for x in divs:
52+
#print type(x)
53+
#print x
54+
55+
print '内容:',x.select('div[class="content"]')[0].get_text().strip()
56+
print '好笑:',x.select('div[class="stats"]')[0].select('span[class="stats-vote"]')[0].get_text()
57+
print '评论:',x.select('div[class="stats"]')[0].select('span[class="stats-comments"]')[0].a.get_text()

0 commit comments

Comments (0)