はてなキーワードAPIからはてなキーワード一覧を取得する
こうですか?良くわかりません。
% wget http://d.hatena.ne.jp/images/keyword/keywordlist
% python extractkeyword.py keywordlist > keywordlist.extracted
% grep java keywordlist.extracted | head -n 10
10688:java
10689:java community process
10690:java computing
10691:java desktop system
10692:java ee
10693:java press
10694:java se
10695:java server faces
10696:java studio
10697:java virtual machine
実体参照の変換等はしていない。あと出力エンコーディングはEUC-JP。たまに変なコードが入っているけど、自分のせいではないはず。
きちんと抽出できているかどうか保証しないので、自己責任でよろしく。
"""Expand a keyword-list regular expression into the strings it matches.

``parse`` expects input wrapped as ``(?-xism:(?i:(?=...)(?:BODY)))`` and
enumerates every literal string BODY can match.  Only the syntax handled
below is understood: alternation (``|``), non-capturing groups
(``(?:...)``, optionally followed by ``?``), character classes (``[...]``),
the ``?`` quantifier on a single character, and backslash escapes.
"""

import re

__all__ = ['InvalidFormatException', 'parse']


class InvalidFormatException(Exception):
    """Raised by ``parse`` when the input lacks the expected outer wrapper."""


# An unescaped "(", ")" or "|" -- the tokens that drive sibling splitting.
_SIBLING_PATTERN = re.compile(r'(?<!\\)[\(\)\|]')

# The outer wrapper; group 1 captures the alternation body.
_WRAPPER_PATTERN = re.compile(r'\(\?-xism:\(\?i:\(\?=.*?\)\(\?:(.*)\)\)\)')


def parse(source):
    """Return an iterator over every keyword encoded in *source*.

    The wrapper is validated eagerly, so a malformed input raises
    ``InvalidFormatException`` at call time rather than on first iteration.
    Keywords are produced in no particular order and may repeat.
    """
    m = _WRAPPER_PATTERN.match(source)
    if not m:
        raise InvalidFormatException()
    return _parse(m.group(1))


def _parse(source):
    """Yield the expansions of each top-level alternative in *source*."""
    for parts in split_siblings(source):
        if has_child(parts):
            head, child, tail, optional = split_child(parts)
            if optional:
                # "(?:...)?" -- the group may be absent altogether.
                for h in head:
                    for t in tail:
                        yield h + t
            for s in _parse(child):
                for h in head:
                    for t in tail:
                        yield h + s + t
        else:
            for s in expand(parts):
                yield s


def has_child(body):
    """Return True if *body* contains a non-capturing group ``(?:``."""
    return '(?:' in body


def split_child(body):
    """Split *body* around its group into ``(head, child, tail, optional)``.

    The group is taken to run from the first ``(?:`` to the last unescaped
    ``)``, so everything in between is treated as one child expression.
    *head* and *tail* are the expanded sets of strings before and after the
    group; *optional* is True when the group is directly followed by ``?``.
    """
    start = body.index('(?:', 0)
    end = body.rindex(')')
    # Skip a trailing "\)" -- it is a literal, not the group terminator.
    while body[end - 1] == '\\':
        end = body.rindex(')', 0, end)
    head = expand(body[:start])
    child = body[start + 3:end]
    if end + 1 < len(body) and body[end + 1] == '?':
        optional = True
        end += 1  # consume the "?" so it is not part of the tail
    else:
        optional = False
    tail = expand(body[end + 1:])
    return (head, child, tail, optional)


def split_siblings(source):
    """Yield the pieces of *source* separated by top-level, unescaped ``|``."""
    start = 0
    level = 0
    for m in _SIBLING_PATTERN.finditer(source):
        token = m.group()
        if token == '(':
            level += 1
        elif token == ')':
            level -= 1
        elif level == 0:
            # A "|" outside every group separates two alternatives.
            yield source[start:m.start()]
            start = m.end()
    yield source[start:]


def expand(s, expanded=None):
    """Return the set of strings matched by the group-free expression *s*.

    *expanded* optionally seeds the expansion with already-built prefixes;
    it defaults to a single empty prefix and is never modified.
    """
    # A fresh list per call: a shared mutable default would be grown in
    # place by the "?" branch and leak state between calls.
    expanded = [""] if expanded is None else list(expanded)
    i = 0
    while i < len(s):
        c = s[i]
        if c == '[':
            characters, i = read_bracket(s, i + 1)
            expanded = add_character_class(characters, expanded)
        elif c == '?':
            expanded = add_optional(expanded)
            i += 1
        elif c == '\\':
            # Escaped character: take the next one literally.
            expanded = add_character(s[i + 1], expanded)
            i += 2
        else:
            expanded = add_character(c, expanded)
            i += 1
    return set(expanded)


def add_character(c, expanded):
    """Append the character *c* to every prefix in *expanded*."""
    return [s + c for s in expanded]


def add_character_class(characters, expanded):
    """Append each of *characters* to every prefix in *expanded*."""
    return [s + c for s in expanded for c in characters]


def add_optional(expanded):
    """Apply ``?``: each prefix both without and with its last character."""
    return [s[:-1] for s in expanded] + expanded


def read_bracket(s, offset):
    """Read a character class body starting at *offset* (just past ``[``).

    Returns ``(characters, next_index)`` where *next_index* is the position
    after the closing ``]``.  A backslash makes the following character
    literal.
    """
    characters = []
    i = offset
    while s[i] != ']':
        if s[i] == '\\':
            characters.append(s[i + 1])
            i += 2
        else:
            characters.append(s[i])
            i += 1
    return (characters, i + 1)


if __name__ == '__main__':
    import sys

    def usage():
        sys.stderr.write('usage: %s filename\n' % sys.argv[0])

    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    with open(sys.argv[1], "rb") as fp:
        source = fp.read()

    if isinstance(source, str):
        # Python 2: the data is already a byte string; pass it through.
        write = sys.stdout.write
    else:
        # Python 3: map each byte to one code point (latin-1) so parsing
        # stays byte-wise and the output bytes equal the input bytes.
        source = source.decode("latin-1")

        def write(text):
            sys.stdout.buffer.write(text.encode("latin-1"))

    # Sorted, de-duplicated output, one keyword per line.
    for keyword in sorted(set(parse(source))):
        write(keyword + "\n")