Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
===================================================================
--- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java (revision 0)
+++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java (revision 0)
@@ -0,0 +1,93 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDirectSpellChecker extends LuceneTestCase {
+
+  public void testSimpleExamples() throws Exception {
+    DirectSpellChecker spellChecker = new DirectSpellChecker();
+    RAMDirectory dir = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
+        TEST_VERSION_CURRENT, new MockAnalyzer(MockTokenizer.SIMPLE, true)));
+
+    for (int i = 0; i < 20; i++) {
+      Document doc = new Document();
+      doc.add(new Field("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    IndexReader ir = writer.getReader();
+
+    BytesRef[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
+    if (similar.length > 0) {
+      assertFalse(similar[0].utf8ToString().equals("five")); // don't suggest a word for itself
+    }
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    // add some more documents
+    for (int i = 1000; i < 1100; i++) {
+      Document doc = new Document();
+      doc.add(new Field("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    ir.close();
+    ir = writer.getReader();
+
+    // look ma, no spellcheck index rebuild
+    similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("thousand", similar[0].utf8ToString());
+
+    ir.close();
+    writer.close();
+    dir.close();
+  }
+}
Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
===================================================================
--- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java (revision 0)
+++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java (revision 0)
@@ -0,0 +1,171 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.PriorityQueue;
+
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.FuzzyTermsEnum;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+
+/**
+ * Simple automaton-based spellchecker.
+ * <p>
+ * Candidates are presented directly from the term dictionary, based on
+ * Levenshtein distance.
+ */
+public class DirectSpellChecker {
+  /** maximum edit distance for candidate terms */
+  private int maxEdits = 2;
+
+  /** Returns the maximum edit distance for candidate terms (between 0 and {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive; default 2) */
+  public int getMaxEdits() {
+    return maxEdits;
+  }
+
+  /**
+   * Sets the maximum edit distance for candidate terms; default 2.
+   *
+   * @param maxEdits value between 0 and
+   *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
+   * @throws UnsupportedOperationException if maxEdits is outside that range
+   */
+  public void setMaxEdits(int maxEdits) {
+    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      throw new UnsupportedOperationException("Invalid maxEdits");
+    }
+    this.maxEdits = maxEdits;
+  }
+
+  /**
+   * Suggests terms from the index that are similar to the given term.
+   *
+   * @param term misspelled term; its field selects the term dictionary to search
+   * @param numSug maximum number of suggestions to return
+   * @param ir reader over the index to draw candidates from
+   * @param morePopular if true, only suggest terms whose document frequency
+   *        is higher than that of the given term
+   * @return suggestions ordered from highest to lowest score; empty if there
+   *         is nothing to suggest
+   * @throws IOException if the index cannot be read
+   */
+  public BytesRef[] suggestSimilar(Term term, int numSug, IndexReader ir, boolean morePopular) throws IOException {
+    String text = term.text();
+    String field = term.field();
+
+    // with numSug <= 0 the loop below would peek() an empty queue
+    if (numSug <= 0 || invalidOrEmptyField(field, ir)) {
+      return new BytesRef[0];
+    }
+
+    int length = text.codePointCount(0, text.length());
+    if (length == 0) {
+      // an empty term would make the similarity below divide by zero
+      return new BytesRef[0];
+    }
+
+    int docfreq = morePopular ? ir.docFreq(term) : 0;
+    // NOTE(review): negative for terms shorter than maxEdits+1 code points;
+    // confirm FuzzyTermsEnum accepts that
+    float minsim = 1.0f - ((float) (maxEdits + 1) / (float) length);
+    FuzzyTermsEnum e = new FuzzyTermsEnum(ir, term, minsim, 0);
+    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
+
+    BytesRef queryTerm = new BytesRef(text);
+    BytesRef candidateTerm;
+    ScoreTerm st = new ScoreTerm();
+    MultiTermQuery.BoostAttribute boostAtt =
+      e.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
+    while ((candidateTerm = e.next()) != null) {
+      final float boost = boostAtt.getBoost();
+      // ignore uncompetitive hits
+      if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) {
+        continue;
+      }
+
+      // ignore exact match of the same term
+      if (queryTerm.equals(candidateTerm)) {
+        continue;
+      }
+
+      // check docFreq if required
+      if (morePopular && ir.docFreq(field, candidateTerm) <= docfreq) {
+        continue;
+      }
+
+      // add new entry in PQ, keeping our own copy of the candidate term
+      st.term = (BytesRef) candidateTerm.clone();
+      st.boost = boost;
+      stQueue.offer(st);
+      // possibly drop entries from queue
+      st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
+      boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
+    }
+
+    // PriorityQueue's iterator has no defined order, so drain the queue with
+    // poll() (lowest score first) and fill the array back to front
+    BytesRef terms[] = new BytesRef[stQueue.size()];
+    for (int index = terms.length - 1; index >= 0; index--) {
+      terms[index] = stQueue.poll().term;
+    }
+
+    return terms;
+  }
+
+  /* These checks are from MultiTermQuery, maybe move somewhere reusable??? */
+  private static boolean invalidOrEmptyField(String field, IndexReader reader) throws IOException {
+    if (field == null) {
+      return true;
+    }
+
+    final Fields fields = MultiFields.getFields(reader);
+    if (fields == null) {
+      // reader has no fields
+      return true;
+    }
+
+    final Terms terms = fields.terms(field);
+    if (terms == null) {
+      // field does not exist
+      return true;
+    }
+
+    return false;
+  }
+
+  /** Candidate term with its score; orders by ascending boost. */
+  private static class ScoreTerm implements Comparable<ScoreTerm> {
+    public BytesRef term;
+    public float boost;
+
+    public int compareTo(ScoreTerm other) {
+      if (this.boost == other.boost) {
+        return other.term.compareTo(this.term);
+      } else {
+        return Float.compare(this.boost, other.boost);
+      }
+    }
+  }
+}