Yahoo! 形態素解析 API for Python
Pythonではまだなさそうだったので、作った。
# -*- coding: utf-8 -*-
"""Client for the Yahoo! Japan morphological analysis web API (MAService V1).

The module sends a sentence to the service, parses the XML reply with lxml
and yields one validated dict per word.

This code targets Python 2: it relies on ``unicode`` and on
``urllib.urlopen`` / ``urllib.urlencode``.
"""
from urllib import urlopen, urlencode

from lxml import etree
from formencode import Schema
import formencode.validators as validators

# Part-of-speech codes.
ADJECTIVE = 1        # adjective
ADJECTIVAL_NOUN = 2  # adjectival noun (na-adjective)
INTERJECTION = 3     # interjection
ADVERB = 4           # adverb
ADNOMIAL = 5         # adnominal (pre-noun adjectival)
CONJUNCTION = 6      # conjunction
PREFIX = 7           # prefix
SUFFIX = 8           # suffix
NOUN = 9             # noun
VERB = 10            # verb
PARTICLE = 11        # particle
AUX_VERB = 12        # auxiliary verb
OTHER = 13           # special (punctuation, brackets, symbols, etc.)

POS_MIN = 1
# NOTE(review): POS_MAX is one past OTHER (13); presumably an exclusive
# upper bound -- confirm against callers before relying on it.
POS_MAX = 14


def MASchema(response="", uniq=False):
    """Build a formencode ``Schema`` describing one word record.

    :param response: comma-separated field names to include. An empty
        (or ``None``) value selects ``surface``, ``reading`` and ``pos``.
    :param uniq: when true, an integer ``count`` field is added as well.
    :returns: a ``Schema`` that drops unknown keys and fills every selected
        field that is missing with its default (``""``, or ``0`` for
        ``count``).
    """
    schema = Schema()
    schema.filter_extra_fields = True
    schema.allow_extra_fields = True
    schema.ignore_key_missing = True

    if response:
        wanted = response.split(",")
    else:
        wanted = ["surface", "reading", "pos"]

    known_fields = (
        ("surface", validators.UnicodeString, ""),
        ("reading", validators.UnicodeString, ""),
        ("pos", validators.UnicodeString, ""),
        ("baseform", validators.UnicodeString, ""),
        ("feature", validators.UnicodeString, ""),
    )
    for name, validator, default in known_fields:
        if name in wanted:
            schema.add_field(name, validator(if_missing=default))
    if uniq:
        schema.add_field("count", validators.Int(if_missing=0))
    return schema


def _strip_namespace(tag, ns):
    """Return *tag* without a leading ``{ns}`` qualifier, if it has one."""
    prefix = "{%s}" % ns
    if tag.startswith(prefix):
        return tag[len(prefix):]
    return tag


class WebMA(object):
    """Caller for the MAService ``parse`` endpoint.

    :param app_id: application id sent as ``appid``; falls back to
        ``default_app_id`` when empty.
    :param uniq: request ``results=uniq`` instead of ``results=ma``.
    :param uniq_by_baseform: sent as ``uniq_by_baseform`` when *uniq* is set.
    :param filter: default value for the ``filter`` request parameter.
    :param response: comma-separated list of fields to request; defaults to
        ``default_uniq_response`` or ``default_response`` depending on
        *uniq*.
    :param validator: object with a ``to_python(dict)`` method applied to
        every word; when omitted a ``MASchema`` is built per ``parse`` call.
    """

    base_url = 'http://api.jlp.yahoo.co.jp/MAService/V1/parse'
    namespace = 'urn:yahoo:jp:jlp'
    default_app_id = 'Yahoo! MAService API for Python'
    default_response = "surface,reading,pos,baseform,feature"
    default_uniq_response = "surface,reading,pos,baseform,feature,count"

    def __init__(self, app_id=None, uniq=False, uniq_by_baseform=False,
                 filter=None, response=None, validator=None):
        self.app_id = app_id or self.default_app_id
        self.uniq = uniq
        self.uniq_by_baseform = uniq_by_baseform
        self.filter = filter or ""
        self.response = response
        self.validator = validator
        if self.response is None:
            if self.uniq:
                self.response = self.default_uniq_response
            else:
                self.response = self.default_response

    def make_params(self, sentence, filter=None):
        """Return the URL-encoded query string for *sentence*.

        :param sentence: text to analyse (already encoded as a byte string).
        :param filter: overrides the instance-level filter when not ``None``.
        """
        if filter is None:
            filter = self.filter
        if self.uniq:
            results = "uniq"
        else:
            results = "ma"
        params = dict(appid=self.app_id,
                      results=results,
                      filter=filter,
                      response=self.response,
                      sentence=sentence)
        if self.uniq:
            # NOTE(review): urlencode() stringifies this bool as
            # "True"/"False"; confirm the service accepts that spelling.
            params["uniq_by_baseform"] = self.uniq_by_baseform
        return urlencode(params)

    def parse(self, sentence, response=None, filter=None,
              use_post=True, urlopen=urlopen):
        """Analyse *sentence* and yield one validated dict per word.

        :param sentence: ``unicode`` text (encoded to UTF-8 here) or a byte
            string that the caller has already encoded as UTF-8.
        :param response: field list used to build the default validator.
            NOTE(review): the request itself always uses ``self.response``
            (see ``make_params``); this argument only shapes the schema.
        :param filter: per-call override of the ``filter`` parameter.
        :param use_post: send the parameters as a POST body when true,
            otherwise append them to the URL.
        :param urlopen: callable ``(url, data)`` returning a file-like
            object; injectable for testing.
        :raises IOError: when the reply's root element is not ``ResultSet``;
            the message is the service's ``Message`` text if available.
        """
        if isinstance(sentence, unicode):
            sentence = sentence.encode('utf-8')
        # Otherwise the caller must provide a UTF-8 encoded byte string.

        params = self.make_params(sentence, filter)
        if use_post:
            url, data = self.base_url, params
        else:
            url, data = self.base_url + '?' + params, None

        conn = urlopen(url, data)
        try:
            et = etree.parse(conn)
        finally:
            # Release the connection; tolerate injected objects that have
            # no close() method.
            close = getattr(conn, 'close', None)
            if close is not None:
                close()

        # lxml expects the prefix map as the ``namespaces`` keyword.
        nsmap = {'ns': self.namespace}

        root_tag = _strip_namespace(et.getroot().tag, self.namespace)
        if root_tag != "ResultSet":
            message = None
            if root_tag == "Error":
                msgs = et.xpath("./ns:Message", namespaces=nsmap)
                if msgs:
                    message = msgs[0].text
            raise IOError(message or "something wrong")

        if response is None:
            response = self.response
        validator = self.validator or MASchema(response, self.uniq)

        for word in et.xpath(".//ns:word_list/ns:word", namespaces=nsmap):
            record = {}
            for child in word:
                record[_strip_namespace(child.tag, self.namespace)] = \
                    child.text
            yield validator.to_python(record)


# Export the two public callables plus every upper-case integer constant
# defined above. A generator expression over a snapshot of globals() keeps
# loop variables out of the module namespace.
__all__ = ['MASchema', 'WebMA']
__all__ += sorted(_name for _name, _value in list(globals().items())
                  if _name.isupper() and isinstance(_value, int))


def _main():
    """Read sentences from stdin and print ``surface<TAB>feature`` lines."""
    import locale
    import sys

    encoding = locale.getpreferredencoding() or sys.getdefaultencoding()
    ma = WebMA()
    for line in sys.stdin:
        for w in ma.parse(line.rstrip().decode(encoding)):
            print("%(surface)s\t%(feature)s" % w)
        print("EOS")


if __name__ == '__main__':
    _main()
テスト：
% echo "すもももももももものうち" | python webma.py
すもも	名詞,名詞,*,すもも,すもも,すもも
も	助詞,係助詞,*,も,も,も
もも	動詞,マ五,未然ウ接続,もも,もも,もも
もも	動詞,マ五,未然ウ接続,もも,もも,もも
も	助詞,係助詞,*,も,も,も
の	助詞,助詞連体化,*,の,の,の
うち	名詞,名詞,*,うち,うち,うち
EOS
ちなみに同じ入力に対するMeCabの出力は、
% echo "すもももももももものうち" | mecab
すもも	名詞,一般,*,*,*,*,すもも,スモモ,スモモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
も	助詞,係助詞,*,*,*,*,も,モ,モ
もも	名詞,一般,*,*,*,*,もも,モモ,モモ
の	助詞,連体化,*,*,*,*,の,ノ,ノ
うち	名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
EOS