Checking Google search rankings with BeautifulSoup
BeautifulSoup is a library for parsing HTML. Unlike htmllib.HTMLParser and HTMLParser.HTMLParser, it can handle malformed HTML. I wrote a script that uses it to find where a given URL ranks in Google's search results.
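As a quick illustration of that tolerance, here is a minimal sketch (the fragment below is invented for this example): BeautifulSoup parses HTML with unclosed tags and unquoted attribute values without complaint.

from BeautifulSoup import BeautifulSoup

# Deliberately broken HTML: unclosed <p> tags, unquoted attribute values
broken = '<html><body><p>first<p>second<a href=/foo class=l>link</body>'
soup = BeautifulSoup(broken)
print soup.find("a", {"class": "l"})["href"]   # -> /foo
print soup.find("a").string                    # -> link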
#!python
# vim:fileencoding=utf-8
import re
import sys
import time
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

g_url = "http://www.google.co.jp/search?hl=ja&num=100&q="
next_text = u"次へ"          # link text of the "next page" link on Google Japan
interval = 3                 # seconds to wait between result pages
client_encoding = "cp932"    # encoding of the console arguments
server_encoding = "utf-8"    # encoding Google expects in the query string

try:
    keyword, url = sys.argv[1:]
except ValueError:
    sys.exit()

print "keyword:", keyword
print "url :", url

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

keyword = keyword.decode(client_encoding).encode(server_encoding)
search_url = g_url + urllib2.quote(keyword)

rank = 0
page = 1
while search_url:
    print "\rpage : %d" % page,
    sys.stdout.flush()
    html = opener.open(search_url).read()
    soup = BeautifulSoup(html)  # parse the HTML; strings come back as Unicode
    # To exclude links to the cache and to related pages,
    # pick up only the <a> tags whose class attribute is "l"
    for a in soup.findAll("a", {"class": "l"}):
        rank += 1
        href = a["href"]  # the result URL is in the href attribute
        # If the specified URL is found, print the result and stop
        if href.startswith(url):
            # collect the text enclosed by the <a> tag
            title = "".join([c.string for c in a.contents])
            print "\nrank :", rank
            print "href :", href
            print "title:", title
            search_url = ""
            break
    # If the specified URL was not found, move on to the next page of results
    else:
        # look for the link to the next page of results
        next = soup.find(lambda tag: tag.name == "a" and tag.b and tag.b.string == next_text)
        if next:
            search_url = urlparse.urljoin(g_url, next["href"])
            page += 1
            time.sleep(interval)
        else:
            # no "next" link means we ran out of result pages
            print u"圏外です"  # "not ranked"
            search_url = ""
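One note on the keyword handling: command-line arguments arrive in the console encoding (cp932 here), so the script decodes them and re-encodes as UTF-8 before URL-quoting them for Google. A small check of what that step produces, using a sample keyword chosen just for this illustration:

# -*- coding: utf-8 -*-
import urllib2

keyword = u"インストール".encode("utf-8")   # console input would be decoded from cp932 first
print urllib2.quote(keyword)
# %E3%82%A4%E3%83%B3%E3%82%B9%E3%83%88%E3%83%BC%E3%83%AB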
$ python test.py "python インストール" http://www.hlj.com/~tanoue/
keyword: python インストール
url : http://www.hlj.com/~tanoue/
page : 2
rank : 158
href : http://www.hlj.com/~tanoue/Python/Mac/mpy00.html
title: Mac de Python
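The link to the next page of results is located with a matching function rather than a tag name: soup.find() also accepts a callable and returns the first tag for which it is true. A minimal sketch of the same idea on a hand-written fragment (the HTML below is made up for illustration):

from BeautifulSoup import BeautifulSoup

html = '<a href="/top"><b>top</b></a> <a href="/page2"><b>next</b></a>'
soup = BeautifulSoup(html)
# match only <a> tags that contain a <b> whose text is "next"
link = soup.find(lambda tag: tag.name == "a" and tag.b and tag.b.string == u"next")
print link["href"]   # -> /page2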