爬虫

damnit1989 · damnit1989 · commit 8b67ceee7826 · 2017-10-19T11:19:16.000+08:00
diff --git a/beautifulsoup_test.py b/beautifulsoup_test.py
@@ -20,10 +20,15 @@
 
 # soup = BeautifulSoup(html)
 soup = BeautifulSoup(html)
-
+print soup.name
 print type(soup.head)
 print soup.head.get_text()
+print type(soup.head.get_text())
+print '----------------'
 print soup.head.string
+print type(soup.head.string)
+print soup.head.title.string
+print type(soup.head.title.string)
 # a_list = soup.select('a')
 # for a in a_list:
     # print a.get_text()
diff --git a/pachong1.py b/pachong1.py
@@ -4,7 +4,7 @@
 
 
 import urllib, urllib2
-from lxml import etree
+#from lxml import etree
 page = 1
 url = "https://www.qiushibaike.com/hot/page/"+str(page)
 
@@ -15,11 +15,11 @@
     u_handle = urllib2.urlopen(request)
     ret = u_handle.read()
 
-    html = etree.HTML(ret)
-    nodes = html.xpath('//div[@id="content-left"]/div')
-    for node in nodes:
-        print node.xpath('a/div/span/text()')
-    print len(nodes)    
+    #html = etree.HTML(ret)
+    #nodes = html.xpath('//div[@id="content-left"]/div')
+    #for node in nodes:
+    #    print node.xpath('a/div/span/text()')
+    #print len(nodes)
 except urllib2.HTTPError, e:
     print 'http请求 错误'
     print e.reason
@@ -31,4 +31,27 @@
     print 'no '
 else:
     print 'ok'
-    # print ret
+    # print ret
+
+
+
+
+if __name__ == '__main__':
+    import urllib, urllib2
+    from bs4 import BeautifulSoup
+    page = 1
+    url = "https://www.qiushibaike.com/text/page/"+str(page)
+    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
+    headers = {'User-Agent':user_agent}
+    request = urllib2.Request(url,None,headers)
+    u_handle = urllib2.urlopen(request)
+    html_str = u_handle.read()
+    soup = BeautifulSoup(html_str)
+    divs = soup.select('div[id="content-left"] > div ')
+    for x in divs:
+        #print type(x)
+        #print x
+
+        print '内容：',x.select('div[class="content"]')[0].get_text().strip()
+        print '好笑：',x.select('div[class="stats"]')[0].select('span[class="stats-vote"]')[0].get_text()
+        print '评论：',x.select('div[class="stats"]')[0].select('span[class="stats-comments"]')[0].a.get_text()