Checking Google search rankings with BeautifulSoup
BeautifulSoup is a library for parsing HTML. Unlike htmllib.HTMLParser and HTMLParser.HTMLParser, it can handle malformed HTML. I wrote a script that uses it to find where a given URL ranks in Google's search results.
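As a quick illustration of that tolerance, here is a minimal sketch (the fragment below is invented for this example): BeautifulSoup parses HTML with unclosed tags and unquoted attribute values without complaint.

from BeautifulSoup import BeautifulSoup

# Deliberately broken HTML: unclosed <p> tags, unquoted attribute values
broken = '<html><body><p>first<p>second<a href=/foo class=l>link</body>'
soup = BeautifulSoup(broken)
print soup.find("a", {"class": "l"})["href"]   # -> /foo
print soup.find("a").string                    # -> link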
#!python
# vim:fileencoding=utf-8
import re
import sys
import time
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

g_url = "http://www.google.co.jp/search?hl=ja&num=100&q="
next_text = u"次へ"          # link text of the "next page" link on Google Japan
interval = 3                 # seconds to wait between result pages
client_encoding = "cp932"    # encoding of the console arguments
server_encoding = "utf-8"    # encoding Google expects in the query string

try:
    keyword, url = sys.argv[1:]
except ValueError:
    sys.exit()

print "keyword:", keyword
print "url :", url

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

keyword = keyword.decode(client_encoding).encode(server_encoding)
search_url = g_url + urllib2.quote(keyword)

rank = 0
page = 1
while search_url:
    print "\rpage : %d" % page,
    sys.stdout.flush()
    html = opener.open(search_url).read()
    soup = BeautifulSoup(html)  # parse the HTML; strings come back as Unicode
    # To exclude links to the cache and to related pages,
    # pick up only the <a> tags whose class attribute is "l"
    for a in soup.findAll("a", {"class": "l"}):
        rank += 1
        href = a["href"]  # the result URL is in the href attribute
        # If the specified URL is found, print the result and stop
        if href.startswith(url):
            # collect the text enclosed by the <a> tag
            title = "".join([c.string for c in a.contents])
            print "\nrank :", rank
            print "href :", href
            print "title:", title
            search_url = ""
            break
    # If the specified URL was not found, move on to the next page of results
    else:
        # look for the link to the next page of results
        next = soup.find(lambda tag: tag.name == "a" and tag.b and tag.b.string == next_text)
        if next:
            search_url = urlparse.urljoin(g_url, next["href"])
            page += 1
            time.sleep(interval)
        else:
            # no "next" link means we ran out of result pages
            print u"圏外です"  # "not ranked"
            search_url = ""
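One note on the keyword handling: command-line arguments arrive in the console encoding (cp932 here), so the script decodes them and re-encodes as UTF-8 before URL-quoting them for Google. A small check of what that step produces, using a sample keyword chosen just for this illustration:

# -*- coding: utf-8 -*-
import urllib2

keyword = u"インストール".encode("utf-8")   # console input would be decoded from cp932 first
print urllib2.quote(keyword)
# %E3%82%A4%E3%83%B3%E3%82%B9%E3%83%88%E3%83%BC%E3%83%AB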
$ python test.py "python インストール" http://www.hlj.com/~tanoue/
keyword: python インストール
url : http://www.hlj.com/~tanoue/
page : 2
rank : 158
href : http://www.hlj.com/~tanoue/Python/Mac/mpy00.html
title: Mac de Python
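The link to the next page of results is located with a matching function rather than a tag name: soup.find() also accepts a callable and returns the first tag for which it is true. A minimal sketch of the same idea on a hand-written fragment (the HTML below is made up for illustration):

from BeautifulSoup import BeautifulSoup

html = '<a href="/top"><b>top</b></a> <a href="/page2"><b>next</b></a>'
soup = BeautifulSoup(html)
# match only <a> tags that contain a <b> whose text is "next"
link = soup.find(lambda tag: tag.name == "a" and tag.b and tag.b.string == u"next")
print link["href"]   # -> /page2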