File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 2020
2121# soup = BeautifulSoup(html)
2222soup = BeautifulSoup (html )
23-
23+ print soup . name
2424print type (soup .head )
2525print soup .head .get_text ()
26+ print type (soup .head .get_text ())
27+ print '----------------'
2628print soup .head .string
29+ print type (soup .head .string )
30+ print soup .head .title .string
31+ print type (soup .head .title .string )
2732# a_list = soup.select('a')
2833# for a in a_list:
2934 # print a.get_text()
Original file line number Diff line number Diff line change 44
55
66import urllib , urllib2
7- from lxml import etree
7+ # from lxml import etree
88page = 1
99url = "https://www.qiushibaike.com/hot/page/" + str (page )
1010
1515 u_handle = urllib2 .urlopen (request )
1616 ret = u_handle .read ()
1717
18- html = etree .HTML (ret )
19- nodes = html .xpath ('//div[@id="content-left"]/div' )
20- for node in nodes :
21- print node .xpath ('a/div/span/text()' )
22- print len (nodes )
18+ # html = etree.HTML(ret)
19+ # nodes = html.xpath('//div[@id="content-left"]/div')
20+ # for node in nodes:
21+ # print node.xpath('a/div/span/text()')
22+ # print len(nodes)
2323except urllib2 .HTTPError , e :
2424 print 'http请求 错误'
2525 print e .reason
3131 print 'no '
3232else :
3333 print 'ok'
34- # print ret
34+ # print ret
35+
36+
37+
38+
39+ if __name__ == '__main__' :
40+ import urllib , urllib2
41+ from bs4 import BeautifulSoup
42+ page = 1
43+ url = "https://www.qiushibaike.com/text/page/" + str (page )
44+ user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
45+ headers = {'User-Agent' :user_agent }
46+ request = urllib2 .Request (url ,None ,headers )
47+ u_handle = urllib2 .urlopen (request )
48+ html_str = u_handle .read ()
49+ soup = BeautifulSoup (html_str )
50+ divs = soup .select ('div[id="content-left"] > div ' )
51+ for x in divs :
52+ #print type(x)
53+ #print x
54+
55+ print '内容:' ,x .select ('div[class="content"]' )[0 ].get_text ().strip ()
56+ print '好笑:' ,x .select ('div[class="stats"]' )[0 ].select ('span[class="stats-vote"]' )[0 ].get_text ()
57+ print '评论:' ,x .select ('div[class="stats"]' )[0 ].select ('span[class="stats-comments"]' )[0 ].a .get_text ()
You can’t perform that action at this time.
0 commit comments