Index: lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
===================================================================
--- lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java (revision 0)
+++ lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java (revision 0)
@@ -0,0 +1,104 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.English;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests for {@link DirectSpellChecker}.
+ */
+public class TestDirectSpellChecker extends LuceneTestCase {
+
+  /**
+   * Indexes the English names of some numbers, then checks that misspellings
+   * of "five" and "thousand" are corrected straight from the reader, also
+   * after more documents were added and the reader was reopened.
+   */
+  public void testSimpleExamples() throws Exception {
+    DirectSpellChecker spellChecker = new DirectSpellChecker();
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir,
+        new MockAnalyzer(MockTokenizer.SIMPLE, true));
+
+    for (int i = 0; i < 20; i++) {
+      Document doc = new Document();
+      doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    IndexReader ir = writer.getReader();
+
+    BytesRef[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
+    if (similar.length > 0) {
+      assertFalse(similar[0].utf8ToString().equals("five")); // don't suggest a word for itself
+    }
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("five", similar[0].utf8ToString());
+
+    // asking for zero suggestions must yield none rather than fail
+    similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 0, ir, false);
+    assertEquals(0, similar.length);
+
+    // add some more documents
+    for (int i = 1000; i < 1100; i++) {
+      Document doc = new Document();
+      doc.add(newField("numbers", English.intToEnglish(i), Field.Store.NO, Field.Index.ANALYZED));
+      writer.addDocument(doc);
+    }
+
+    ir.close();
+    ir = writer.getReader();
+
+    // look ma, no spellcheck index rebuild
+    similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
+    assertTrue(similar.length > 0);
+    assertEquals("thousand", similar[0].utf8ToString());
+
+    ir.close();
+    writer.close();
+    dir.close();
+  }
+}

Property changes on: lucene\contrib\spellchecker\src\test\org\apache\lucene\search\spell\TestDirectSpellChecker.java
___________________________________________________________________
Added: svn:eol-style
   + native

Index: lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java
===================================================================
--- lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java (revision 0)
+++ lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java (revision 0)
@@ -0,0 +1,200 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.PriorityQueue;
+
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.FuzzyTermsEnum;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+
+/**
+ * Simple automaton-based spellchecker.
+ * <p>
+ * Candidates are presented directly from the term dictionary, based on
+ * Levenshtein distance.
+ */
+public class DirectSpellChecker {
+  /** maximum edit distance for candidate terms */
+  private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+
+  /**
+   * Returns the maximum edit distance allowed for candidate terms.
+   *
+   * @return a value from 1 to
+   *         {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
+   */
+  public int getMaxEdits() {
+    return maxEdits;
+  }
+
+  /**
+   * Sets the maximum edit distance allowed for candidate terms.
+   *
+   * @param maxEdits a value from 1 to
+   *        {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}, inclusive
+   * @throws UnsupportedOperationException if maxEdits is outside that range
+   */
+  public void setMaxEdits(int maxEdits) {
+    if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
+      throw new UnsupportedOperationException("Invalid maxEdits");
+    }
+    this.maxEdits = maxEdits;
+  }
+
+  /**
+   * Suggests terms from the given term's field that are similar to it.
+   * <p>
+   * Edit distances are tried in increasing order, starting at 1 and stopping
+   * at {@link #getMaxEdits()} or as soon as one distance yields at least
+   * <code>numSug</code> candidates. The query term itself is never suggested.
+   *
+   * @param term the term to find suggestions for; its field selects the
+   *        terms that are searched
+   * @param numSug maximum number of suggestions to return
+   * @param ir reader whose terms are the candidates
+   * @param morePopular if true, only suggest terms whose document frequency
+   *        is greater than that of <code>term</code>
+   * @return the suggestions, highest boost first; empty if numSug is not
+   *         positive, the field is null or has no terms in the reader, or
+   *         no candidate qualified
+   * @throws IOException if reading from the index fails
+   */
+  public BytesRef[] suggestSimilar(Term term, int numSug, IndexReader ir,
+      boolean morePopular) throws IOException {
+    // a non-positive numSug would make the queue logic peek at an empty
+    // queue, so answer it up front
+    if (numSug <= 0 || invalidOrEmptyField(term.field(), ir)) {
+      return new BytesRef[0];
+    }
+
+    final int docfreq = morePopular ? ir.docFreq(term) : 0;
+
+    PriorityQueue<ScoreTerm> stQueue = null;
+    for (int i = 1; i <= maxEdits; i++) {
+      stQueue = suggestSimilar(term, numSug, ir, morePopular, docfreq, i);
+      if (stQueue.size() >= numSug) {
+        break; // we have the closest suggestions
+      }
+    }
+
+    // The queue's head is its weakest entry and its iterator has no defined
+    // order, so drain it with poll() and fill the array back to front.
+    final BytesRef[] terms = new BytesRef[stQueue.size()];
+    for (int index = terms.length - 1; index >= 0; index--) {
+      terms[index] = stQueue.poll().term;
+    }
+
+    return terms;
+  }
+
+  /**
+   * Collects at most <code>numSug</code> candidates from a
+   * {@link FuzzyTermsEnum} created for <code>term</code> with the given
+   * <code>editDistance</code>.
+   *
+   * @param docfreq document frequency a candidate must exceed; only
+   *        consulted when <code>morePopular</code> is true
+   * @return the surviving candidates, weakest at the head of the queue
+   */
+  private PriorityQueue<ScoreTerm> suggestSimilar(Term term, int numSug,
+      IndexReader ir, boolean morePopular, int docfreq, int editDistance) throws IOException {
+    final FuzzyTermsEnum e = new FuzzyTermsEnum(ir, term, editDistance, 0);
+    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>();
+
+    final BytesRef queryTerm = new BytesRef(term.text());
+    BytesRef candidateTerm;
+    ScoreTerm st = new ScoreTerm();
+    final MultiTermQuery.BoostAttribute boostAtt =
+      e.attributes().addAttribute(MultiTermQuery.BoostAttribute.class);
+    while ((candidateTerm = e.next()) != null) {
+      final float boost = boostAtt.getBoost();
+      // ignore uncompetitive hits
+      if (stQueue.size() >= numSug && boost <= stQueue.peek().boost) {
+        continue;
+      }
+
+      // ignore exact match of the same term
+      if (queryTerm.bytesEquals(candidateTerm)) {
+        continue;
+      }
+
+      // check docFreq if required
+      if (morePopular && e.docFreq() <= docfreq) {
+        continue;
+      }
+
+      // add new entry in PQ
+      st.term = (BytesRef) candidateTerm.clone();
+      st.boost = boost;
+      stQueue.offer(st);
+      // possibly drop entries from queue; an evicted entry is recycled
+      // for the next candidate instead of allocating a new one
+      st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
+      boostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
+    }
+    return stQueue;
+  }
+
+  /**
+   * Reports whether there is nothing to search for the given field.
+   * These checks are from MultiTermQuery, maybe move somewhere reusable???
+   *
+   * @return true if field is null, the reader has no fields, or the reader
+   *         has no terms for the field
+   */
+  private static boolean invalidOrEmptyField(String field, IndexReader reader) throws IOException {
+    if (field == null) {
+      return true;
+    }
+
+    final Fields fields = MultiFields.getFields(reader);
+    if (fields == null) {
+      // reader has no fields
+      return true;
+    }
+
+    // a null Terms means the field does not exist
+    return fields.terms(field) == null;
+  }
+
+  /**
+   * A candidate term and its boost. The natural order puts the weakest
+   * candidate first: a lower boost sorts first and, between equal boosts,
+   * the term that compares greater sorts first.
+   */
+  private static class ScoreTerm implements Comparable<ScoreTerm> {
+    public BytesRef term;
+    public float boost;
+
+    public int compareTo(ScoreTerm other) {
+      if (this.boost == other.boost) {
+        return other.term.compareTo(this.term);
+      } else {
+        return Float.compare(this.boost, other.boost);
+      }
+    }
+  }
+}

Property changes on: lucene\contrib\spellchecker\src\java\org\apache\lucene\search\spell\DirectSpellChecker.java
___________________________________________________________________
Added: svn:eol-style
   + native