アルゴリズムの理解を深める為、Rubyで書かれた実装
をそのままPHPに移植してみた。
ã
Complement Naive Bayes っぽいものをRubyで書いた - 記録用
http://d.hatena.ne.jp/laughing/20101114/1289698415
ã
<?php
/**
 * Complement Naive Bayes (CNB) classifier over bag-of-words features.
 *
 * Each class is scored using the word counts observed in all the OTHER
 * classes (its "complement"). A document is a map of word => occurrence count.
 */
class CNB
{
    /**
     * Word counts accumulated per class: label => (word => count).
     * Declared explicitly because dynamically created properties are
     * deprecated as of PHP 8.2.
     *
     * @var array
     */
    public $frequency_of_word_by_class = array();

    /**
     * Number of training documents seen per class: label => count.
     *
     * @var array
     */
    public $number_of_training_data_of_class = array();

    /**
     * Additive smoothing constant applied to every word count.
     *
     * @var int|float
     */
    public $smoothing_parameter;

    /**
     * @param int|float $smoothing_parameter Additive smoothing constant.
     *        With 0, classifier() can take log(0) or divide by zero when a
     *        word (or every word) is absent from a class's complement.
     */
    public function __construct($smoothing_parameter = 1)
    {
        $this->frequency_of_word_by_class = array();
        $this->number_of_training_data_of_class = array();
        $this->smoothing_parameter = $smoothing_parameter;
    }

    /**
     * Adds one training document to the model.
     *
     * @param string|int $label Class label of the document.
     * @param array      $sosei Features of the document: word => count.
     * @return void
     */
    public function training($label, $sosei)
    {
        if (!isset($this->frequency_of_word_by_class[$label])) {
            $this->frequency_of_word_by_class[$label] = array();
        }
        foreach ($sosei as $word => $count) {
            if (!isset($this->frequency_of_word_by_class[$label][$word])) {
                $this->frequency_of_word_by_class[$label][$word] = 0;
            }
            $this->frequency_of_word_by_class[$label][$word] += $count;
        }

        if (!isset($this->number_of_training_data_of_class[$label])) {
            $this->number_of_training_data_of_class[$label] = 0;
        }
        $this->number_of_training_data_of_class[$label]++;
    }

    /**
     * Returns the total number of word occurrences in every class except $c.
     *
     * @param string|int $c Class label to exclude.
     * @return int|float
     */
    public function total_number_of_word_in_other_class($c)
    {
        $result = 0;
        foreach ($this->other_classes($c) as $label) {
            // Summing a class's own counts equals walking the whole vocabulary
            // and adding the entries this class has, without building the
            // vocabulary union on every call.
            $result += array_sum($this->frequency_of_word_by_class[$label]);
        }
        return $result;
    }

    /**
     * Returns how often word $w occurs in every class except $c.
     *
     * @param string|int $c Class label to exclude.
     * @param string|int $w Word to count.
     * @return int|float
     */
    public function number_of_word_in_other_class($c, $w)
    {
        $result = 0;
        foreach ($this->other_classes($c) as $label) {
            if (isset($this->frequency_of_word_by_class[$label][$w])) {
                $result += $this->frequency_of_word_by_class[$label][$w];
            }
        }
        return $result;
    }

    /**
     * Scores a document against every trained class.
     *
     * The result is neither sorted nor reduced to a single winner; the
     * caller decides how to use the scores.
     *
     * @param array $sosei Features of the document: word => count.
     * @return array List of array(label, score), one entry per trained class
     *               in training order.
     */
    public function classifier($sosei)
    {
        $all_training_data = array_sum($this->number_of_training_data_of_class);
        // Identical for every class, so it is computed once.
        $alpha = $this->smoothing_parameter * count($sosei);

        $result = array();
        foreach (array_keys($this->frequency_of_word_by_class) as $c) {
            $n_c = $this->total_number_of_word_in_other_class($c);

            $term2nd = 0.0;
            foreach ($sosei as $word => $count) {
                $numerator = $this->number_of_word_in_other_class($c, $word) + $this->smoothing_parameter;
                $term2nd += $count * log($numerator / ($n_c + $alpha));
            }

            $theta_c = $this->number_of_training_data_of_class[$c] / $all_training_data;
            $result[] = array($c, log($theta_c) - $term2nd);
        }
        return $result;
    }

    /**
     * Lists every trained label other than $c.
     *
     * Labels are compared by their string form. PHP stores a key such as "1"
     * as int 1, so a strict comparison could miss it, while a loose one
     * treats distinct labels such as "01" and "1" as equal.
     *
     * @param string|int $c Class label to exclude.
     * @return array
     */
    private function other_classes($c)
    {
        $others = array();
        foreach (array_keys($this->frequency_of_word_by_class) as $label) {
            if ((string) $label !== (string) $c) {
                $others[] = $label;
            }
        }
        return $others;
    }
}

/**
 * Reads a data file into a list of samples, keeping every line.
 *
 * Each non-blank line has the form "label word1:count1 word2:count2 ...",
 * separated by spaces. A token is split at its LAST colon, so a word may
 * itself contain ':'. If a word is repeated on one line the last count wins.
 *
 * @param string $fname Path of the data file.
 * @return array List of array(label, array(word => count)) in file order.
 * @throws RuntimeException If the file cannot be read or a token is malformed.
 */
function read_samples($fname)
{
    if (!is_string($fname) || $fname === '' || !is_file($fname) || !is_readable($fname)) {
        throw new RuntimeException('Cannot read data file: ' . var_export($fname, true));
    }
    $txt = file_get_contents($fname);
    if ($txt === false) {
        throw new RuntimeException('Cannot read data file: ' . var_export($fname, true));
    }

    $samples = array();
    // Split on single line breaks so that reported line numbers stay accurate.
    foreach (preg_split('/\r\n|\r|\n/', $txt) as $index => $line) {
        // Dropping empty tokens tolerates repeated or trailing spaces.
        $tokens = preg_split('/ +/', $line, -1, PREG_SPLIT_NO_EMPTY);
        if (!$tokens) {
            continue;
        }

        $label = array_shift($tokens);
        $sosei = array();
        foreach ($tokens as $token) {
            $pos = strrpos($token, ':');
            $count = ($pos === false) ? false : substr($token, $pos + 1);
            if ($pos === false || !is_numeric($count)) {
                throw new RuntimeException(
                    'Malformed feature "' . $token . '" on line ' . ($index + 1) . ' of ' . $fname
                    . ' (expected word:count)'
                );
            }
            // "+ 0" turns the numeric string into an int or float.
            $sosei[substr($token, 0, $pos)] = $count + 0;
        }
        $samples[] = array($label, $sosei);
    }
    return $samples;
}

/**
 * Reads a data file into a map keyed by label.
 *
 * Kept for backward compatibility. Because the result is keyed by label,
 * only the LAST line of each label survives; use read_samples() when every
 * line matters (for example, for training data).
 *
 * @param string $fname Path of the data file.
 * @return array label => array(word => count)
 * @throws RuntimeException If the file cannot be read or a token is malformed.
 */
function read_file($fname)
{
    $result = array();
    foreach (read_samples($fname) as $sample) {
        $result[$sample[0]] = $sample[1];
    }
    return $result;
}

/**
 * Command-line driver: trains on one file, then prints the per-class scores
 * of every sample in another file.
 *
 * @param array $argv Script name, training file path, test file path.
 * @return int Process exit status (0 on success, 1 on error).
 */
function cnb_main($argv)
{
    if (count($argv) < 3) {
        fwrite(STDERR, 'Usage: php ' . $argv[0] . " <training-file> <test-file>\n");
        return 1;
    }

    try {
        $cnb = new CNB();
        foreach (read_samples($argv[1]) as $sample) {
            $cnb->training($sample[0], $sample[1]);
        }
        foreach (read_samples($argv[2]) as $sample) {
            print_r($cnb->classifier($sample[1]));
        }
    } catch (RuntimeException $e) {
        fwrite(STDERR, $e->getMessage() . "\n");
        return 1;
    }
    return 0;
}

// Run the driver only when this file is executed directly, so that the class
// and the readers can be included from other scripts without side effects.
if (PHP_SAPI === 'cli' && isset($argv[0]) && realpath($argv[0]) === realpath(__FILE__)) {
    exit(cnb_main($argv));
}
ã
Dropboxへのリンクが404になっていたけど、多分以下のようなフォーマット。
ラベル1 ワード1:ワード1の出現数 ワード2:ワード2の出現数 ワード3:ワード3の出現数 …
実物はこんな。
label1 aaa:1 bbb:2 ccc:3