Bayesian Setsを試してみた
ãã®åYAPC Asia 2009ã«åå ãã¦ããã®ã§ãããããã§ãã¯ã¦ãªããã¯ãã¼ã¯ã®ã·ã¹ãã ã«ã¤ãã¦ãã®çºè¡¨ã®ä¸ã§ããã¯ã¦ãã®é¢é£ã¨ã³ããªã¯Bayesian Setsã使ã£ã¦è¨ç®ããã¦ãããã¨ãã話ãèãã¦Bayesian Setsã«ä¿ç¶èå³ãæ¹§ãã¦ãã¾ãããBayesian Setsã¯ä»¥åè«æã ãå°ãèªãã§ããã¾ãããåãããªãã¾ã¾æ¾ç½®ãã¦ããã®ã§ããããã£ãããªã®ã§Perlã§ä½ã£ã¦è©¦ãã¦ã¿ã¾ããã
Bayesian Setsについて詳しくは、以下のリンク先の資料をご参照下さい。
実際に作成したコードは以下の通りです。上記のMatlabのコードを参考にさせていただいています。
#!/usr/bin/perl
#
# Bayesian Sets
#
# Usage:
#  % bayesian_sets.pl input.tsv query1 query2 ..
#
# Reference
#  - Paper: http://www.gatsby.ucl.ac.uk/~heller/bsets.pdf
#  - Matlab code: http://chasen.org/~daiti-m/dist/bsets/
#

use strict;
use warnings;

use constant {
    MAX_OUTPUT => 20,    # number of top-ranked documents printed by main()
};

# Read document vectors from an open, tab-separated input handle.
#
# Each line has the form
#   document_id \t key1 \t val1 \t key2 \t val2 \t ...
# The values are ignored: every key that appears is stored with weight 1
# (binary features).
#
# Params:  $fh - readable filehandle
# Returns: hashref { document_id => { key => 1, ... }, ... }
sub read_vectors {
    my $fh = shift;

    my %vectors;
    while (my $line = <$fh>) {
        chomp $line;
        my ($doc_id, @fields) = split /\t/, $line;

        # A blank line yields no fields at all.  Skip it, otherwise it would
        # register a phantom document and skew the feature averages.
        next if !defined $doc_id;

        # Keys sit at the even positions; a trailing key without a value
        # still counts as present.
        my %vector;
        for (my $i = 0; $i < @fields; $i += 2) {
            $vector{ $fields[$i] } = 1;
        }
        $vectors{$doc_id} = \%vector;
    }
    return \%vectors;
}

# Derive the per-feature hyperparameters from the document collection.
#
# For a feature with mean value m over all documents:
#   alpha = c * m,   beta = c * (1 - m)
#
# Params:  $vectors - hashref as returned by read_vectors()
#          $c       - scale factor applied to the means
# Returns: list (\%alpha, \%beta), both keyed by feature
sub get_parameters {
    my ($vectors, $c) = @_;

    my $average_vector = _average_vector($vectors);

    my (%alpha, %beta);
    for my $key (keys %{$average_vector}) {
        my $mean = $average_vector->{$key};
        $alpha{$key} = $c * $mean;
        $beta{$key}  = $c * (1 - $mean);
    }
    return (\%alpha, \%beta);
}

# Score every document against the set of query documents.
#
# Params:  $vectors - hashref as returned by read_vectors()
#          $queries - arrayref of document ids used as the query set
#          $alpha   - hashref of per-feature alpha parameters
#          $beta    - hashref of per-feature beta parameters
# Returns: hashref { document_id => score }, or nothing (undef in scalar
#          context) when none of the queries contributes any feature.
#
# Unknown query ids are reported on STDERR and skipped.
sub calc_similarities {
    my ($vectors, $queries, $alpha, $beta) = @_;

    # Sum the feature vectors of all known query documents.
    my %query_vector;
    my $num_query = 0;
    for my $query (@{$queries}) {
        if (!exists $vectors->{$query}) {
            warn "[WARN] '$query' doesn't exist in input documents. \nSkip it\n";
            next;
        }
        my $vector = $vectors->{$query};
        for my $key (keys %{$vector}) {
            $query_vector{$key} += $vector->{$key};
        }
        $num_query++;
    }
    return if !%query_vector;

    my %weight_vector;
    for my $key (keys %{$alpha}) {
        my $alpha_k = $alpha->{$key};
        my $beta_k  = $beta->{$key};

        # A feature present in every document has beta == 0 (and one present
        # in none would have alpha == 0).  Dividing by it would be fatal, and
        # such a feature adds the same amount to every document anyway, so it
        # cannot affect the ranking: leave it out of the weight vector.
        next if !($alpha_k && $beta_k && $alpha_k > 0 && $beta_k > 0);

        my $val = $query_vector{$key} || 0;
        $weight_vector{$key}
            = log(1 + $val / $alpha_k)
            - log(1 + ($num_query - $val) / $beta_k);
    }

    my %score;
    for my $doc_id (keys %{$vectors}) {
        $score{$doc_id} = _inner_product(\%weight_vector, $vectors->{$doc_id});
    }
    return \%score;
}

# Compute the element-wise mean of all document vectors.
#
# Params:  $vectors - hashref { document_id => { key => value } }
# Returns: hashref { key => mean value over all documents }
#          (a key missing from a document counts as 0 for that document)
sub _average_vector {
    my $vectors = shift;

    my %average_vector;
    my $num_vector = 0;
    for my $vector (values %{$vectors}) {
        for my $key (keys %{$vector}) {
            $average_vector{$key} += $vector->{$key};
        }
        $num_vector++;
    }

    # %average_vector is empty when there are no documents, so this loop
    # never divides by zero.
    for my $key (keys %average_vector) {
        $average_vector{$key} /= $num_vector;
    }
    return \%average_vector;
}

# Inner product of two sparse vectors stored as hashrefs.
#
# Params:  $v1, $v2 - hashrefs { key => numeric value }
# Returns: sum of products over the keys both vectors share; 0 when either
#          argument is missing.
sub _inner_product {
    my ($v1, $v2) = @_;
    return 0 if !$v1 || !$v2;

    # Walk the smaller hash and probe the larger one.
    my ($small, $large)
        = keys %{$v1} < keys %{$v2} ? ($v1, $v2) : ($v2, $v1);

    my $prod = 0;
    for my $key (keys %{$small}) {
        next if !$small->{$key} || !$large->{$key};
        $prod += $small->{$key} * $large->{$key};
    }
    return $prod;
}

# Command-line entry point: rank all documents in the input file against the
# query documents named on the command line and print the top MAX_OUTPUT
# results as "document_id \t score".
sub main {
    my $path    = shift @ARGV;
    my @queries = @ARGV;
    if (!defined $path || $path eq '' || !@queries) {
        warn "Usage $0 file query1 query2 ..\n";
        exit 1;
    }

    # Three-argument open: the file name can never be taken as a mode or a
    # pipe command.
    open my $fh, '<', $path or die "cannot open file: $path: $!";
    my $vectors = read_vectors($fh);
    close $fh;
    return if !$vectors;

    my ($alpha, $beta) = get_parameters($vectors, 2);    # c=2
    my $score = calc_similarities($vectors, \@queries, $alpha, $beta);
    if (!$score) {
        warn "[WARN] no usable query document; nothing to rank\n";
        return;
    }

    # Highest score first; ties are broken by document id so the output is
    # reproducible from run to run.
    my @ranked = sort { $score->{$b} <=> $score->{$a} || $a cmp $b }
        keys %{$score};

    my $count = 0;
    for my $doc_id (@ranked) {
        last if ++$count > MAX_OUTPUT;
        printf "%s\t%.3f\n", $doc_id, $score->{$doc_id};
    }
    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require/do (e.g. from a test) without side effects.
main() unless caller;

1;
å ¥åãã¼ã¿ã«ã¯ä»¥ä¸ã®ãããªã¿ãåºåãã®ããã¹ããã¡ã¤ã«ãç¨æãã¾ãããã¼ã¿ä¾ã¯以前の記事ã§ä½æãããWikipediaã®åãã¼ã¯ã¼ããTFIDFã§éã¿ä»ããããã¼ã¿ã§ããå®éã«ã¯valueé¨åã¯ãããªãã®ã§ããã以åã®åæ®ãã§ãã®ã¾ã¾ã®ãã©ã¼ãããã«ãã¦ãã¾ãã
# document_id \t key1 \t val1 \t key2 \t val2 \t ... 龿·µé¡ é¡ 2365 é·æ·µ 1857 åå³¶ 1555 æ·µ 1460 é¾ 1313 æé®® 1137 ï¼/ 1126 ä¹ 987 串 971 é·å±± 788 ãªã§ã³ã¨ã³ 777 ã¢ã³ã°ã³ã 777 ã¯ãã 777 æé®®æ°ä¸»ä¸»ç¾©äººæ°å ±åå½ 777 ç¾æµ¦ 733 æ´¾ 720 æä¼ 714 æé¢ 708 ãããã 708 ãã£ã³ãµã³ã´ã 708 éæµ¦ 605 å²ä¸ 595 æ¾å· 555 ãããã¹ã¿ã³ã 529 åå² 527 å° 526 岬 513 é 506 å»ºè¨ 505 è 500 å¹´ 496 å岸 490 彿 487 çªç 483 黿µ· 481 åç·¨ 468 夢 460 ã 460 åæé®® 444 宣æ 404 1884 397 ç¸ 397 ã½ã¬ 382 ç 379 ç½å³¶ 360 å¾ 359 ç 344 æä¼å 341 æ¯å 336 å´ 334 ç¥ãã¾ããã«ã¡ã¯ ãã§ã³ã 2332 ãã¥ã³ã¸ã£ 2332 æ ¶å· 1381 æ ç» 1270 人 929 æ 827 ãã§ã³ã»ã ã½ã³ 777 ãã ã»ãã¨ã³ 777 ãã³ã»ãã 777 å½é 756 ãã»ãã£ã³ã 708 è³æ§ 689 ã½ã³ã® 639 ãã 639 å½¼ 624 ç¥ 615 詩人 614 3 574 å¦å©¦ 548 éº»çº 539 å 481 å¿µé¡ 468 è± 458 39 430 Hello 425 ãã¼ã 419 ã¢ã³ 414 å®¶ 413 ã¢ã¹ã¯ã¯ 409 é»è» 399 God 395 16 377 ä¹è» 366 é è¶³ 350 ç¥ãã¾ 342 ç¡è³ 339 !ï¼ 332 ã 331 大人 329 å°å¦ç 309 ã ã¼ãã¼ 298 ãã«ãªã³ 296 ã¨ã 295 å®å®¶ 289 ä¸ 288 åé¡ 286 éæ¹ 272 ç¥æ§ 264 éå½ 263 å ¬é 259 è¤åé é· å¹´ 5208 é 1285 è¤å 1271 è¿è£ 1263 ä½ 1138 å¾ 1118 ä»» 1110 é· 957 å å¹´ 903 3 861 7 813 é¢ 798 è¿è¡ 782 äºä½ä¸æ°é¨æ¨©å¤§è¼ 777 å 770 è¦ 764 4 760 ä¸ 747 æ¥ 747 å·¦ 713 éä¿¡ 708 æ 689 ãã 638 å 629 ä¿ 600 8 599 è½ 595 åè° 572 å® 556 èµäºº 521 ä¹ å® 515 äº 507 å°å° 506 26 503 é¡è½ 500 å¨ä»» 460 2 451 ãµã 448 ä¸å° 448 19 445 å¦å 431 ä¹ 425 æ»å» 424 ä»»å½ 422 å ¼ 416 æ 406 徿 403 é·ç· 401 ãã®é 377 è¾ä»» 377
実際に動かしてみます。入力ドキュメント集合のファイルと、(複数の)クエリを指定します。クエリは入力ドキュメント集合中に含まれている document_id しか使用できません。
% ./bayesian_sets.pl input.tsv ãã¨ã¿èªåè» ãã¨ã¿èªåè» 140.748 è±ç°è±äº 47.205 ãã¨ã¿ã»ãã¨ã¨ã¼ã¹ 45.501 5ãã£ã³ãã« 42.303 ãã«ã·ã§ 40.428 ãã¨ã¿ã»ãã¼ã¯X 37.372 æ¥ç£ã»ãã¼ã 36.659 ãã©ã¼ãã»ã¢ã¼ã¿ã¼ 36.333 ä¸è±èªåè»å·¥æ¥ 33.872 ããã 33.625 ãã³ãã»ã¤ã³ãµã¤ã 32.407 ãã¨ã¿ã»ã¿ã³ãã© 32.004 ãã¨ã¿ã»ã¯ã«ã¼ã¬ã¼ 29.959 ã¸ãª (èªåè») 29.293 ã·ãã¬ã¼ã»ãã£ããªã¨ 29.142 ãã¨ã¿F1 28.830 ä¸è±ã»ãã£ãªã³ 27.994 ãããã»ã¨ã«ã 27.712 ãã¥ã³ãã¤ã»ã½ãã¿ 27.452 å¯å£«éå·¥æ¥ 27.111 % ./bayesian_sets.pl input.tsv å宿è¦å¯ç½² å宿è¦å¯ç½² 203.638 æ»éå·è¦å¯ç½² 80.618 çè¾¼è¦å¯ç½² 59.605 ä¸ä¸æ©è¦å¯ç½² 52.241 ä¸ç°è¦å¯ç½² (æ±äº¬é½) 47.331 åæ±è¦å¯ç½² (æ±äº¬é½) 43.736 çåè¦å¯ç½² 42.404 丸ã®å è¦å¯ç½² 37.594 ä¸éè¦å¯ç½² (æ±äº¬é½) 36.628 åºå³¶æ±è¦å¯ç½² 32.846 ç°ç¡è¦å¯ç½² 31.444 è±ç°å¹¹é¨äº¤çª 31.430 å°éäºè¦å¯ç½² 31.352 尾鷲è¦å¯ç½² 30.760 æ¥ä¸é¨è¦å¯ç½² 30.656 山梨çè¦å¯ 30.290 æ¸å¡çº 30.032 é¿æ±å¹¹é¨äº¤çª 29.785 ç¥å®®å (æ¸è°·åº) 28.061 å¶ç¥å®è¦å¯ç½² 27.630
ã¡ããã¨é¢é£ãã¦ãããã®ãåå¾ã§ãã¦ãã¾ãããæ¬å½ã¯ãapple bananaãã¨ãapple iPodãã®ããã«è¤æ°ã®æå³ãæã¤è¨èã§ãè¤æ°ã®ã¯ã¨ãªãå ¥ããå ´åã«ãã®ã¯ã©ã¹ã¿å ã®åèªãåºã¦ããã®ã確ãããããã¨ããã£ãã®ã§ããã
ä»å使ç¨ããå ¥åãã¼ã¿ã®ãµã¤ãºã¯10ä¸åã®ããã¥ã¡ã³ã(wikipediaã®ãã¼ã¯ã¼ã)ã§ãããå®è¡ã«ã¯25ç§ã»ã©ããã£ã¦ãã¾ããä»åã®ã³ã¼ãã¯å ¥åãã¼ã¿ããã®ã¤ã³ããã¯ã¹ã®ä½æã¨ãã¯ã¨ãªããã®é¡ä¼¼ã¢ã¤ãã ã®æ¤ç´¢ã®ä¸¡æ¹ãåæã«å¦çããã¦ããããåä½ãé ãã§ãããã¤ã³ããã¯ã¹ã¯åãã«ä½æãã¦ä¿åãã¦ãããããã«ãã¦ããã°ãããããã®ã¹ãã¼ãã§æ¤ç´¢ã§ããããã«ãªãã¨æãã¾ãã
ã¾ã Bayesian Setsã®èããã¾ãçè§£ã§ãã¦ããªãã®ã§ããã
- 与えられたクエリが含まれていると思われるクラスタをオンデマンドで求め、そのクラスタに含まれている他のアイテムを返す
- ã¯ã¨ãªããæ±ããéã¿ãã¯ãã«ã¨å ¥åããã¥ã¡ã³ãéåããæ§æããè¡åã®ç©(éã¿ãã¯ãã«ã¨å ¨ææ¸ã®å ç©)ãæ±ããã ãã§è¨ç®ã§ãã
ããããããããã¨ãããªãã§ããããï¼ä¸è¬çãªè»¢ç½®ã¤ã³ããã¯ã¹ã使ç¨ããé¡ä¼¼ã³ã³ãã³ããæ±ããæ¹æ³ã§ãåæ§ã®ãã¨ã¯ã§ããããªæ°ãããã®ã§ãããããªã³ããã³ãã«ã¯ã©ã¹ã¿ãæ±ãããã¨ããã¨ããã§ã´ããæ¶ããããã§ããããã¡ãªã¿ã«åç´ã«å ¨ææ¸ã¨ã®cosineè·é¢ã§é¡ä¼¼ãããã®ãåã£ã¦ããå ´åã ã¨ã以ä¸ã®ãããªçµæã«ãªãã¾ãã大æµã¯è¯å¥½ãªçµæãå¾ããã¾ããããã¾ã«ã¯ã¨ãªã«ãã£ã¦ã¯ã´ãã ããã«ãªã£ã¡ãã£ã¦ã¾ããã
% ./cos.pl input.tsv ãã¨ã¿èªåè» ãã¨ã¿èªåè» 1.000 è±ç°è±äº 0.625 æ¾äºç³æ ¹ 0.610 å³¶å´è¤æ 0.609 ééã®æ´å² (æ¥æ¬) 0.598 åä½é»æ°éé 0.592 æ¥ç£ã»ãã¼ã 0.591 æ°é¢å «æ´²å¤ªé 0.588 è¿è¡éå£ 0.588 岡æè²¢ 0.587 é«ç°ä¿é¦¬ 0.587 8æ7æ¥ 0.586 ä»å°æ±ç §å®® 0.584 3æ25æ¥ 0.584 é£é³¥æä»£ 0.583 ä¸è±èªåè»å·¥æ¥ 0.582 è²é¡ç«¶èå ´ 0.580 å¯å£«éå·¥æ¥ 0.580 大æçµ 0.579 ãã³ãã»ã¤ã³ãµã¤ã 0.576 % ./cos.pl input.tsv å宿è¦å¯ç½² å宿è¦å¯ç½² 1.000 é¿æ±å¹¹é¨äº¤çª 0.776 å¨åè¦å¯ç½² 0.775 竹åè¦å¯ç½² 0.773 髿¢è¦å¯ç½² 0.771 æ¥ä¸é¨è¦å¯ç½² 0.769 大çç°è¦å¯ç½² 0.764 ä¸åè¦å¯ç½² 0.758 鳿¸è¦å¯ç½² 0.756 横é è³è¦å¯ç½² 0.754 åæ±è¦å¯ç½² (æ±äº¬é½) 0.753 å±±å£åè¦å¯ç½² 0.752 æ¾æµ¦è¦å¯ç½² 0.752 çåè¦å¯ç½² 0.751 æ»éå·è¦å¯ç½² 0.750 æ±è¿æ±è¦å¯ç½² 0.750 ã¤ããè¦å¯ç½² 0.748 ä¸ä¸æ©è¦å¯ç½² 0.743 ãããè¦å¯ç½² 0.743 丸äºè¦å¯ç½² 0.743
ãã¨ããã®åã®pLSIã®è¨äºã§ã¯çµå±ãã®ã¾ã¾æ¾ç½®ã«ãªã£ã¦ãã®ã§ããããã£ãããªã®ã§ä»åº¦ããããã±ã¼ã¸ã³ã°ãã¦Algorithm::BayesianSetsãä½ããã¨æãã¾ãããã¾ã使ãéã¯ãªãããã§ããããã£ããã©ããªãã®ãç¥ãã®ã«ã¯å¤å°å½¹ã«ç«ã¡ããã§ãããä½ããåèªèº«ãCPAN Authorã«ãªã£ã¦ã¿ããã®ã§â¦^^;
追記：weight_vectorを求めるところが一部間違っていたため、コードと実行結果、実行時間を修正しました。ただ実行結果はポイント以外はほとんど違いはないと思います。