Checking Google search rankings with BeautifulSoup
BeautifulSoup is a library for parsing HTML. Unlike htmllib.HTMLParser and HTMLParser.HTMLParser, it can apparently cope with HTML that is not well formed. I used it to write a script that checks where a given URL appears in Google's search results.
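As a quick illustration of that tolerance, here is a minimal sketch (the markup is an invented example) that feeds HTML with unclosed tags to BeautifulSoup and still searches the resulting tree:

#!python
from BeautifulSoup import BeautifulSoup

# Deliberately sloppy markup: neither <p> nor <b> is closed.
broken = "<html><body><p>first<p>second <b>bold</body></html>"

soup = BeautifulSoup(broken)   # no exception; the tree is repaired
for p in soup.findAll("p"):
    print p                    # each <p> element, with closing tags filled in

The ranking script below relies on the same findAll interface when walking Google's result pages.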
#!python
# vim:fileencoding=utf-8
import re
import sys
import time
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup

g_url = "http://www.google.co.jp/search?hl=ja&num=100&q="
next_text = u"次へ"  # label of the "next page" link on Google's Japanese results
interval = 3
client_encoding = "cp932"
server_encoding = "utf-8"

try:
    keyword, url = sys.argv[1:]
except ValueError:
    sys.exit()

print "keyword:", keyword
print "url :", url

opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

keyword = keyword.decode(client_encoding).encode(server_encoding)
search_url = g_url + urllib2.quote(keyword)

rank = 0
page = 1
while search_url:
    print "\rpage : %d" % page,
    sys.stdout.flush()
    html = opener.open(search_url).read()
    soup = BeautifulSoup(html)  # parse the HTML (it is converted to Unicode strings)
    # To skip links to cached copies and related pages,
    # pick up only the <a> tags whose class attribute is "l"
    for a in soup.findAll("a", {"class": "l"}):
        rank += 1
        href = a["href"]  # get the URL from the href attribute
        # If the specified URL is found, print the result and stop
        if href.startswith(url):
            # Get the text enclosed by the <a> tag
            title = "".join([c.string for c in a.contents])
            print "\nrank :", rank
            print "href :", href
            print "title:", title
            search_url = ""
            break
    # If the specified URL was not found, move on to the next page of results
    else:
        # Get the link to the next page of results
        next = soup.find(lambda tag: tag.name == "a" and tag.b
                                     and tag.b.string == next_text)
        if next:
            search_url = urlparse.urljoin(g_url, next["href"])
            page += 1
            time.sleep(interval)
        else:
            # No "next" link: the URL is not in the results
            print u"圏外です"  # "not ranked"
            search_url = ""
$ python test.py "python インストール" http://www.hlj.com/~tanoue/
keyword: python インストール
url : http://www.hlj.com/~tanoue/
page : 2
rank : 158
href : http://www.hlj.com/~tanoue/Python/Mac/mpy00.html
title: Mac de Python
Extracting links with HTMLParser.HTMLParser
#!python
# vim:fileencoding=utf-8
from HTMLParser import HTMLParser
import urllib2
from urlparse import urlparse

class ExtractTextLinkParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
        self.url = ""
        self.text = ""

    def handle_starttag(self, tag, attrs):
        # Called when a start tag is found
        if tag == "a":
            # Tag and attribute names are all lowercased
            attrs = dict(attrs)  # ((name, value), ...) => {name: value, ...}
            if "href" in attrs:
                self.url = attrs["href"]

    def handle_endtag(self, tag):
        # Called when an end tag is found
        if tag == "a":
            if self.text:
                self.links.append((self.url, self.text))
            self.url = self.text = ""

    def handle_data(self, data):
        # Called for the data between the start and end tags
        if self.url:
            self.text += data

def get_links(url):
    response = urllib2.urlopen(url)
    parser = ExtractTextLinkParser()
    parser.feed(response.read())
    parser.close()
    links = parser.links
    return [l for l in links
            if l[0].find("://") != -1 and not l[0].startswith(url)]

links = get_links("http://b.hatena.ne.jp/hotentry")
links = [l for l in links if urlparse(l[0])[0]][3:]
for url, title in links[:10]:
    print "[%s:title=%s]" % (url, title.decode("utf-8", "replace"))
テキストエディタでWebサイト構築をガンバル人へ(1/3) − @IT
日本の携帯を高くしている真犯人は
404 Blog Not Found:38歳までに知ることになる、22歳の自分に教えてあげたいたった1つのこと
ウェブ制作・プログラマー・デザイナーのためのチートシート集 | コリス
やる夫がはてなブックマークを始めたようです。 - 朱雀式
2015年、テレビは「ニコ動」化する?――NRIが示す未来像 (1/2) - ITmedia News
パソコン好きが青色申告を体験してみると?:第1回 まずは税金ってナニ? (1/5) - ITmedia Biz.ID
らばQ : 42歳までに知ることになる、22歳の自分に教えてあげたい12のこと
「見て欲しい」の本質忘れるな--吉本が語るネット時代の権利者像:コラム - CNET Japan
「真のゆとり教育」が生んだ18歳天才プログラマー トレンド-インタビュー:IT-PLUS
Extracting links with htmllib.HTMLParser
#!python
# vim:fileencoding=utf-8
from htmllib import HTMLParser
from formatter import NullFormatter
import urllib2
from urlparse import urlparse

class ExtractTextLinkParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self, NullFormatter())
        self.links = []

    def anchor_bgn(self, href, name, type):
        # Called when <a> is found
        HTMLParser.anchor_bgn(self, href, name, type)
        self.save_bgn()  # start saving the text data

    def anchor_end(self):
        # Called when </a> is found
        url = self.anchor
        text = self.save_end()  # get the saved text data
        if url and text:
            self.links.append((url, text))
        self.anchor = None

def get_links(url):
    response = urllib2.urlopen(url)
    parser = ExtractTextLinkParser()
    parser.feed(response.read())
    parser.close()
    return parser.links

links = get_links("http://b.hatena.ne.jp/hotentry")
links = [l for l in links if urlparse(l[0])[0]]
for url, title in links[5:15]:
    print "[%s:title=%s]" % (url, title.decode("utf-8", "replace"))
日本の携帯を高くしている真犯人は
テキストエディタでWebサイト構築をガンバル人へ(1/3) − @IT
ウェブ制作・プログラマー・デザイナーのためのチートシート集 | コリス
404 Blog Not Found:38歳までに知ることになる、22歳の自分に教えてあげたいたった1つのこと
Gmailアカウント間でのメール移転方法・複数Gmailアカウントの処理に困っている人に朗報! | Google Mania - グーグルの便利な使い方
With HTMLParser.HTMLParser you have to branch on the tag name with if statements when you want to handle several different tags, whereas htmllib.HTMLParser provides a dedicated method for each tag. Also, if all you need are the URLs, you can get them without writing a subclass at all.
>>> r = urllib2.urlopen("http://b.hatena.ne.jp/hotentry")
>>> p = HTMLParser(NullFormatter())
>>> p.feed(r.read())
>>> p.close()
>>> links = p.anchorlist  # get the list of URLs
>>> print "\n".join([l for l in links if urlparse(l)[0]][5:10])
http://www.phs-mobile.com/black/black33.html
http://www.atmarkit.co.jp/fwcr/rensai/freeauthoring06/freeauthoring06_1.html
http://coliss.com/articles/build-websites/operation/work/796.html
http://blog.livedoor.jp/dankogai/archives/50997519.html
http://google-mania.net/archives/891
Removing duplicate elements from a list
>>> xs = [5, 8, 5, 1, 1, 4, 2, 4, 3, 2]
>>> set(xs)
set([1, 2, 3, 4, 5, 8])
>>> sorted(set(xs), key=xs.index)  # keep the original order
[5, 8, 1, 4, 2, 3]
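sorted(set(xs), key=xs.index) is compact, but it calls xs.index once per unique element, so it slows down on long lists. A single-pass sketch that keeps the first occurrence of each element (the helper name unique is arbitrary):

>>> def unique(xs):
...     seen = set()
...     result = []
...     for x in xs:
...         if x not in seen:
...             seen.add(x)
...             result.append(x)
...     return result
...
>>> unique([5, 8, 5, 1, 1, 4, 2, 4, 3, 2])
[5, 8, 1, 4, 2, 3]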
Converting integers to kanji numerals
#!python
# vim:fileencoding=utf-8

def num2kanji(num):
    # Kanji digits and place-value characters
    KNUM = [u"", u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九"]
    DIGIT1 = (u"", u"十", u"百", u"千")
    DIGIT2 = (u"", u"万", u"億", u"兆", u"京")
    try:
        num = int(num)
    except ValueError:
        raise ValueError("not an integer")
    max = 10000 ** len(DIGIT2) - 1
    if not(0 <= num < max):
        raise ValueError("not in (0-%d)" % max)
    if num == 0:
        return u"零"
    str_num = str(num)
    knum = []
    # Work through the digits in groups of four, starting from the ones place
    for i in xrange((len(str_num) + 3) / 4):
        sn = str_num[-1-i*4:-5-i*4:-1]  # next group of (up to) four digits, reversed
        if sn != "0000":
            knum.append(DIGIT2[i] + " ")  # 万 / 億 / 兆 / 京 for this group
            for j, n in enumerate(map(int, sn)):
                if n != 0:
                    knum.append(DIGIT1[j])    # 十 / 百 / 千 within the group
                    if not(n == 1 and j):     # write 十, not 一十, and so on
                        knum.append(KNUM[n])
    knum.reverse()  # the pieces were collected low-order first
    return "".join(knum).rstrip()

while 1:
    try:
        print num2kanji(raw_input(">> "))
    except ValueError, e:
        print e
    except EOFError:
        break
>> 0
零
>> 1540001
百五十四万 一
>> 43005421003
四百三十億 五千四百二十一万 千三
>> 224767477905006
二百二十四兆 七千六百七十四億 七千七百九十万 五千六
>> 60093000611220000769
六千九京 三千兆 六千百十二億 二千万 七百六十九
>> -1
not in (0-99999999999999999999)
>> 1000000000000000000000000000
not in (0-99999999999999999999)
>> 89.97
not an integer
Reversing a string
>>> str = "abcdefg"
>>> str[::-1]
'gfedcba'
>>>
>>> str[0:5:2]     # every second character from index 0 up to index 5
'ace'
>>> str[-1:-5:-1]  # the last four characters, in reverse order
'gfed'
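The built-in reversed() gives the same result; it returns an iterator over the characters, so it has to be joined back into a string:

>>> "".join(reversed(str))
'gfedcba'
>>> list(reversed(str))  # reversed() yields the elements one at a time
['g', 'f', 'e', 'd', 'c', 'b', 'a']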
Downloading videos from YouTube and other sites with wget
This time I tried writing it without regular expressions: the scraping is done with plain string searches, and the only regex left is the one that strips characters that are not allowed in file names.
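Instead of regexes the script slices values out with a small helper, _extract_from_to, defined in the listing below: it returns whatever sits between a start marker and an end marker. Roughly, with made-up input strings:

>>> _extract_from_to("video_id=abc123&t=xyz", "video_id=", "&")
'abc123'
>>> _extract_from_to("<title>Some Video</title>", "<title>", "</title>")
'Some Video'
>>> _extract_from_to("no markers here", "video_id=", "&")  # marker not found: returns None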
#!python
#encoding=utf-8
import urllib
import urllib2
import re
import os
import sys
import time

save_dir = r"c:\My Documents"  # where downloaded files are saved
interval = 3                   # seconds to wait between downloads

def get_video_detail(url):
    # Pick the handler whose host name appears in the URL
    for host, video in VIDEOS.items():
        if url.find(host) != -1:
            return video.get_detail(url)

def _extract_from_to(str, from_, to_=None, to_end=False):
    # Return the substring between from_ and to_
    # (to the end of str if to_ is missing or not found and to_end is set)
    start = str.find(from_)
    if start != -1:
        start += len(from_)
        end = None
        if to_:
            end = str.find(to_, start)
            if end == -1 and to_end:
                end = None
        if end != -1:
            return str[start:end]

VIDEOS = {}

class Video:
    # Generic scraper: each subclass describes, for one site, how to find
    # the video id, the download URL and the title.
    def __init__(self, id_from_to, dl_url_from_to, title_from_to=None,
                 api_url=None, encoding="utf-8", ext=".flv"):
        self.id_from_to = id_from_to
        self.dl_url_from_to = dl_url_from_to
        self.title_from_to = title_from_to
        self.api_url = api_url
        self.encoding = encoding
        self.ext = ext

    def get_detail(self, url):
        id = self._extract_id(url)
        if self.api_url:
            url = self.api_url % id
        content = self._get_content(url)
        params = self._extract_dl_url_params(content)
        dl_url = self._build_dl_url(params)
        title = self._extract_title(content)
        return id, dl_url, title, self.ext

    def _extract_id(self, url):
        from_, to_ = self.id_from_to
        id = _extract_from_to(url.lower(), from_, to_, True)
        if id:
            return id
        else:
            raise ValueError("invalid video url")

    def _get_content(self, url):
        try:
            response = urllib2.urlopen(url)
            return response.read()
        except urllib2.URLError:
            raise RuntimeError("unable to download video page")

    def _extract_dl_url_params(self, content):
        params = []
        for from_, to_ in self.dl_url_from_to:
            p = _extract_from_to(content, from_, to_)
            if p:
                params.append(p)
            else:
                raise RuntimeError("unable to extract download url")
        return params

    def _build_dl_url(self, params):
        return params[0]

    def _extract_title(self, content):
        if self.title_from_to:
            from_, to_ = self.title_from_to
            title = _extract_from_to(content, from_, to_)
            if title:
                return title.decode(self.encoding, "ignore")

class YouTube(Video):
    def __init__(self):
        Video.__init__(self,
            id_from_to = ("/watch?v=", "&"),
            dl_url_from_to = [("video_id=", "&"), ("&t=", "&")],
            title_from_to = ("<title>YouTube - ", "</title>")
        )

    def _build_dl_url(self, params):
        return "http://www.youtube.com/get_video?video_id=%s&t=%s" % tuple(params)

VIDEOS["youtube.com"] = YouTube()

class Veoh(Video):
    def __init__(self):
        Video.__init__(self,
            id_from_to = ("/videos/", "?"),
            dl_url_from_to = [('fullPreviewHashPath="', '"')],
            api_url = "http://www.veoh.com/rest/video/%s/details",
            title_from_to = ('\ttitle="', '"')
        )

VIDEOS["www.veoh.com"] = Veoh()

class Dailymotion(Video):
    def __init__(self):
        Video.__init__(self,
            id_from_to = ("/video/", None),
            dl_url_from_to = [("&url=", "&")],
            title_from_to = ('<h1 class="nav with_uptitle">', "</h1>")
        )

    def _build_dl_url(self, params):
        return urllib.unquote(params[0])

VIDEOS["dailymotion.com"] = Dailymotion()

class AmebaVision(Video):
    def __init__(self):
        Video.__init__(self,
            id_from_to = ("movie=", None),
            dl_url_from_to = [("<imageUrlLarge>", "</imageUrlLarge>")],
            api_url = "http://vision.ameba.jp/api/get/detailMovie.do?movie=%s",
            title_from_to = ("\t<title>", "</title>")
        )

    def _build_dl_url(self, params):
        # Derive the flv URL from the thumbnail URL
        dl_url = params[0].replace("//vi", "//vm")
        dl_url = dl_url.replace("/jpg/", "/flv/")
        dl_url = dl_url.replace("_4.jpg", ".flv")
        return dl_url

VIDEOS["vision.ameba.jp"] = AmebaVision()

class Yourfilehost(Video):
    def __init__(self):
        Video.__init__(self,
            id_from_to = ("cat=video&file=", None),
            dl_url_from_to = [("&videoembed_id=", "&")]
        )

    def _extract_id(self, url):
        id = Video._extract_id(self, url)
        return os.path.splitext(id)[0]

    def _build_dl_url(self, params):
        return urllib.unquote(params[0])

VIDEOS["www.yourfilehost.com"] = Yourfilehost()

invalid_chr_re = re.compile(u'[\/:*?"<>|]')  # characters not allowed in file names

# Resolve each URL given on the command line and hand the direct link to wget
for url in sys.argv[1:]:
    try:
        id, dl_url, title, ext = get_video_detail(url)
        filename = title or id
        filename = invalid_chr_re.sub(" ", filename)
        filepath = os.path.join(save_dir, filename + ext)
        command = "wget -O '%s' --referer='%s' '%s'" % (filepath, url, dl_url)
        os.system(command)
        time.sleep(interval)
    except (ValueError, RuntimeError), e:
        print "Error: %s :%s" % (e, url)
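Assuming the script is saved as dlvideo.py (the file name and the video URL below are only placeholders) and wget is on the PATH, it is run with one or more video page URLs; each one is resolved to a direct download link and handed to wget:

$ python dlvideo.py "http://www.youtube.com/watch?v=VIDEO_ID"

Note that the generated wget command quotes its arguments with single quotes, which suits a Unix-style shell; with the default Windows save_dir the path and the quoting may need adjusting.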