#!/usr/bin/env python
# -*- encoding: euc-jp -*-

# Uses the sys, re and urllib modules.
import sys, re, urllib

# ÆüËܸì¥È¡¼¥¯¥ó¤òÀÚ¤ê½Ð¤¹¤¿¤á¤ÎÀµµ¬É½¸½¡£
JP_TOKEN = re.compile(u"[°ì-óþ]+|[¤¡-¤ó]+|[¥¡-¥ô]+|[a-zA-Z0-9]+")


##  Document - definition of the document object
##
class Document:
  """Token counts collected for a single document."""

  def __init__(self, docid):
    """Create an empty document identified by docid."""
    # tf: maps each token to the number of times it has been added
    self.tf = {}
    # nwords: total number of tokens added so far
    self.nwords = 0
    # docid: identifier of this document
    self.docid = docid

  def __repr__(self):
    """Return a short representation for debugging."""
    fields = (self.docid, self.nwords)
    return '<Document: docid=%s, nwords=%d>' % fields

  def add_string(self, s):
    """Split the string s into tokens with JP_TOKEN and count each of them."""
    tokens = JP_TOKEN.findall(s)
    for token in tokens:
      # Bump the term frequency, starting from zero for a new token.
      self.tf[token] = self.tf.get(token, 0) + 1
    # Every token found also counts toward the document total.
    self.nwords += len(tokens)

  def get_word_probability(self, word):
    """Return the count of word divided by the total token count.

    A word that has never been added yields 0.0.
    """
    if word not in self.tf:
      return 0.0
    return self.tf[word] / float(self.nwords)


# readdoc(fp, docid, encoding=None):
# Read the given file object and build a Document object from it.
def readdoc(fp, docid, encoding=None):
  """Read every line from fp and return a Document holding its tokens.

  fp:       object whose readline() returns an empty string at end of input.
  docid:    identifier stored on the returned Document.
  encoding: codec used to decode byte-string lines. None (the default) uses
            sys.getdefaultencoding(), which is what the implicit conversion
            to unicode used; pass e.g. 'euc-jp' or 'utf-8' to read
            Japanese text.

  Raises UnicodeDecodeError if a line cannot be decoded with the codec.
  """
  if encoding is None:
    encoding = sys.getdefaultencoding()
  doc = Document(docid)
  while True:
    line = fp.readline()
    # An empty read marks the end of the file.
    if not line: break
    # Decode byte strings explicitly; already-decoded text is used as is.
    if isinstance(line, bytes):
      line = line.decode(encoding)
    doc.add_string(line.strip())
  return doc


# Test driver: fetch a document from a URL and print how frequent a word is.
# usage: <script> url word
if __name__ == "__main__":
  import locale
  # Command-line arguments. Exit with a usage message instead of an
  # IndexError traceback when they are missing.
  if len(sys.argv) < 3:
    sys.exit("usage: %s url word" % sys.argv[0])
  (url, word) = (sys.argv[1], sys.argv[2])
  # Arguments arrive as byte strings; decode with the locale's encoding
  # rather than the implicit ASCII codec so non-ASCII words can be looked up.
  if isinstance(word, bytes):
    word = word.decode(locale.getpreferredencoding())
  print("Reading... %s" % url)
  fp = urllib.urlopen(url)
  # Close the connection even if reading or decoding the document fails.
  try:
    doc = readdoc(fp, url)
  finally:
    fp.close()
  print("Document: %s" % doc)
  print("Probability of '%s' = %.6f" % (word, doc.get_word_probability(word)))