Skip to content

Commit aa2a1e6

Browse files
committed
beautifulsoup 测试
1 parent 5b22c32 commit aa2a1e6

1 file changed

Lines changed: 21 additions & 10 deletions

File tree

beautifulsoup_test.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,20 @@
22
# -*- coding: utf-8 -*-
33
# BeautifulSoup 爬虫练习
44

5-
from bs4 import BeautifulSoup
5+
# 安装BeautifulSoup
6+
# 方法一:easy_install beautifulsoup4
7+
# 方法二:pip install beautifulsoup4
8+
69

10+
# Beautiful Soup支持Python标准库中的HTML解析器(html.parser),
11+
# 还支持一些第三方的解析器(lxml、html5lib 等)。如果没有安装第三方解析器,Beautiful Soup 会使用 Python 标准库自带的解析器(html.parser);lxml 解析器功能更强大、速度更快,推荐安装。
712

13+
14+
# 导入模块
15+
from bs4 import BeautifulSoup
16+
import urllib
17+
18+
# 测试html
819
html = """
920
<html><head><title>The Dormouse's story</title></head>
1021
<body>
@@ -17,8 +28,7 @@
1728
<p class="story">...</p>
1829
"""
1930

20-
21-
# soup = BeautifulSoup(html)
31+
# soup = BeautifulSoup(html, 'lxml')
2232
soup = BeautifulSoup(html)
2333
print soup.name
2434
print type(soup.head)
@@ -57,21 +67,17 @@
5767
# print soup.a.string
5868
# if type(soup.a.string) == 'bs4.element.Comment':
5969
# print type(soup.a.string)
60-
6170
# for content in soup.p.contents:
6271
# print content
63-
6472
# for child in soup.body.children:
6573
# print child
66-
6774
# for child in soup.descendants:
6875
# print child
69-
7076
# for str in soup.stripped_strings :
7177
# print str
72-
if __name__ == '__main__':
73-
# pass
74-
import urllib
78+
79+
# 小小的测试
80+
def test():
7581
url_handle = urllib.urlopen('https://www.python.org/events/python-events/')
7682
html = url_handle.read()
7783
soup = BeautifulSoup(html)
@@ -81,4 +87,9 @@
8187
print '会议:',x.h3.get_text()
8288
print '时间:',x.p.time.get_text()
8389
print '内容',x.p.select('span[class="event-location"]')[0].get_text()
90+
91+
92+
if __name__ == '__main__':
93+
test()
94+
8495

0 commit comments

Comments
 (0)