Pythonのスペル修正プログラムをJavaに移植してみました
オレンジニュースで紹介されていた、GoogleのPeter Norvig氏による"スペル修正プログラムはどう書くか"（原文）を読んで、ちょっと試してみたかったので深く考えずにJavaに移植しました。Pythonの文法を全く知らなかったのですが、元々とても短い上にコードだけでなく説明もあったので何とか最後まで到達。
という事で、メインのコードを貼り付けます*1。テストコードを含んだ完全版(?)はこちら。一応名前等はできるだけ合わせました。
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Corrects a misspelled word by picking the most frequent corpus word that is
 * within two single-character edits of it.
 *
 * <p>The word-frequency model is built once, when the class is initialised,
 * from a text file. The file path is taken from the system property
 * {@code spellcorrect.corpus}; when the property is not set the path
 * {@code e:/dev/python/big.txt} is used. If the file cannot be read, class
 * initialisation fails with an {@link IllegalStateException}.
 *
 * <p>Private method names deliberately follow the Python original.
 *
 * originated from : http://norvig.com/spell.py
 */
public class SpellCorrect {

    /** Mutable occurrence counter; a word seen once has count 2, each further sighting adds 1. */
    static final class CountHolder {
        int count = 2;
    }

    /** System property that overrides the corpus location. */
    private static final String CORPUS_PROPERTY = "spellcorrect.corpus";

    /** Corpus location used when {@link #CORPUS_PROPERTY} is not set. */
    private static final String DEFAULT_CORPUS = "e:/dev/python/big.txt";

    /**
     * Token pattern: runs of ASCII lowercase letters.
     * Must be declared before NWORDS, whose initialiser calls words().
     */
    private static final Pattern WORD_PATTERN = Pattern.compile("[a-z]+");

    /** Word -> occurrence counter, built from the corpus during class initialisation. */
    private static final Map<String, CountHolder> NWORDS =
            train(words(readFile(System.getProperty(CORPUS_PROPERTY, DEFAULT_CORPUS))));

    /** Letters tried for replacements and insertions. */
    private static final char[] ALPHABET = "abcdefghijklmnopqrstuvwxyz".toCharArray();

    /** Orders known words by their corpus count, without relying on int subtraction. */
    private static final Comparator<String> BY_COUNT = new Comparator<String>() {
        public int compare(String w1, String w2) {
            int c1 = NWORDS.get(w1).count;
            int c2 = NWORDS.get(w2).count;
            return c1 < c2 ? -1 : (c1 == c2 ? 0 : 1);
        }
    };

    /**
     * Lower-cases the text and returns a matcher that yields each run of a-z letters.
     *
     * @param text raw corpus text
     * @return matcher positioned before the first token
     */
    private static Matcher words(String text) {
        // Fixed locale: with a Turkish default locale "I" would lower-case to a
        // dotless i, which [a-z] does not match, splitting words apart.
        return WORD_PATTERN.matcher(text.toLowerCase(Locale.ENGLISH));
    }

    /**
     * Counts every token produced by the matcher.
     *
     * @param matcher token source, consumed to the end
     * @return map from token to its counter
     */
    private static Map<String, CountHolder> train(Matcher matcher) {
        Map<String, CountHolder> model = new HashMap<String, CountHolder>();
        while (matcher.find()) {
            String token = matcher.group();
            CountHolder holder = model.get(token);
            if (holder == null) {
                model.put(token, new CountHolder());
            } else {
                holder.count++;
            }
        }
        return model;
    }

    /**
     * Generates every string one edit away from {@code word}: single-character
     * deletions, swaps of adjacent characters, single-letter replacements and
     * single-letter insertions.
     *
     * @param word the starting word
     * @return all distinct strings reachable with one edit
     */
    private static Set<String> edits1(String word) {
        int n = word.length();
        Set<String> result = new HashSet<String>();
        for (int i = 0; i < n; i++) {
            String head = word.substring(0, i);
            String tail = word.substring(i + 1);
            // deletion of the character at i
            result.add(head + tail);
            // replacement of the character at i
            for (char c : ALPHABET) {
                result.add(head + c + tail);
            }
        }
        // transposition of the characters at i and i + 1
        for (int i = 0; i < n - 1; i++) {
            result.add(word.substring(0, i) + word.charAt(i + 1) + word.charAt(i)
                    + word.substring(i + 2));
        }
        // insertion before position i (i == n appends)
        for (int i = 0; i <= n; i++) {
            String head = word.substring(0, i);
            String tail = word.substring(i);
            for (char c : ALPHABET) {
                result.add(head + c + tail);
            }
        }
        return result;
    }

    /**
     * Collects the corpus words that are two edits away from {@code word}.
     *
     * @param word the starting word
     * @return known words reachable by applying edits1 twice
     */
    private static Set<String> known_edits2(String word) {
        Set<String> result = new HashSet<String>();
        for (String e1 : edits1(word)) {
            for (String e2 : edits1(e1)) {
                if (NWORDS.containsKey(e2)) {
                    result.add(e2);
                }
            }
        }
        return result;
    }

    /**
     * Filters a set of candidates down to those present in the corpus.
     *
     * @param words candidate strings
     * @return the candidates that occur in the corpus
     */
    private static Set<String> known(Set<String> words) {
        Set<String> knownWords = new HashSet<String>();
        for (String w : words) {
            if (NWORDS.containsKey(w)) {
                knownWords.add(w);
            }
        }
        return knownWords;
    }

    /**
     * Returns the best correction for {@code word}.
     *
     * <p>A word already in the corpus is returned unchanged. Otherwise the most
     * frequent known word one edit away is returned, then the most frequent
     * known word two edits away; if none exists the input is returned as is.
     * The input is not lower-cased here.
     *
     * @param word the word to correct; must not be null
     * @return the chosen correction, or {@code word} itself
     * @throws NullPointerException if {@code word} is null
     */
    public static String correct(String word) {
        if (word == null) {
            throw new NullPointerException("word");
        }
        if (NWORDS.containsKey(word)) {
            return word;
        }
        Set<String> candidates = known(edits1(word));
        if (candidates.isEmpty()) {
            candidates = known_edits2(word);
        }
        if (candidates.isEmpty()) {
            return word;
        }
        return Collections.max(candidates, BY_COUNT);
    }

    /**
     * Reads a whole file and decodes it as ISO-8859-1.
     *
     * @param fileName path of the file to read
     * @return the file content
     * @throws IllegalStateException if the file cannot be opened or read
     */
    private static String readFile(String fileName) {
        InputStream in = null;
        try {
            in = new BufferedInputStream(new FileInputStream(fileName));
            // Read until end of stream: available() is only an estimate and a
            // single read() call is not guaranteed to fill the buffer.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buf = new byte[8192];
            int len;
            while ((len = in.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
            return out.toString("ISO-8859-1");
        } catch (IOException e) {
            throw new IllegalStateException("cannot read corpus file: " + fileName, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best effort: the content has already been read or the read already failed
                }
            }
        }
    }
}
ちょっと調べた感じ、巷にJavaのスペル修正のライブラリ等もあるようですが、ざっくりアルゴリズムを学ぶには良いと思います。間違いなど見つけられた方はコメントでご指摘頂けると幸いです。
ちなみに初めてPythonのコードを読んでみた感想としては、分かり易いのか分かり難いのか微妙なところですね。Javaと比べて直感的（Javaで書いてて「こう書けたらなー」と思うような事が実現できている）な部分がありつつ、「さらっと読むと間違うのでは？」的なところもありますので、何れにせよ、Rubyに入門中の身としてはなかなか面白い体験でした。
想像していたよりコーパスの処理が軽いようなので、monstar.fmでもアーティストや楽曲の検索で使おうかと思ってます。
追記
最初Rubyで書いてみようと思ったのですが、LL系は誰かすぐにやるだろうと思っていたら、やはりid:k12uさんがPerlで書かれていました。
それPerlで書けるよ(当たり前だ)
小飼さんはCPANのライブラリを使って更にAPI化されてます。
JavaでやるならDWRかな？2.0が出て、Reverse AjaxやJavaScript Proxy API、Guice連携などかなり面白く楽に使えそうな機能が目白押しなので、これでなくとも試してみたいところ。
*1:追記：テストコードを一部含んだままだったので削除しました