Fix exception in low accuracy mode

pemistahl · Jun 9, 2022 · 682da18 · 682da18
1 parent b6ede80
commit 682da18
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -2163,10 +2163,10 @@ The detailed statistics table that contains all accuracy values can be written w
 
 ```
 // Groovy syntax
-implementation 'com.github.pemistahl:lingua:1.2.0'
+implementation 'com.github.pemistahl:lingua:1.2.1'
 
 // Kotlin syntax
-implementation("com.github.pemistahl:lingua:1.2.0")
+implementation("com.github.pemistahl:lingua:1.2.1")
 ```
 
 ### 7.2 Using Maven
@@ -2175,7 +2175,7 @@ implementation("com.github.pemistahl:lingua:1.2.0")
 <dependency>
     <groupId>com.github.pemistahl</groupId>
     <artifactId>lingua</artifactId>
-    <version>1.2.0</version>
+    <version>1.2.1</version>
 </dependency>
 ```
 
@@ -2189,9 +2189,9 @@ cd lingua
 ./gradlew build
 ```
 Several jar archives can be created from the project.
-1. `./gradlew jar` assembles `lingua-1.2.0.jar` containing the compiled sources only.
-2. `./gradlew sourcesJar` assembles `lingua-1.2.0-sources.jar` containing the plain source code.
-3. `./gradlew jarWithDependencies` assembles `lingua-1.2.0-with-dependencies.jar` containing the 
+1. `./gradlew jar` assembles `lingua-1.2.1.jar` containing the compiled sources only.
+2. `./gradlew sourcesJar` assembles `lingua-1.2.1-sources.jar` containing the plain source code.
+3. `./gradlew jarWithDependencies` assembles `lingua-1.2.1-with-dependencies.jar` containing the 
 compiled sources and all external dependencies needed at runtime. This jar file can be included 
 in projects without dependency management systems. It can also be used to 
 run *Lingua* in standalone mode (see below).
@@ -2351,7 +2351,7 @@ LanguageDetectorBuilder.fromIsoCodes639_1(IsoCode639_1.EN, IsoCode639_1.DE)
 LanguageDetectorBuilder.fromIsoCodes639_3(IsoCode639_3.ENG, IsoCode639_3.DEU)
 ```
 
-#### 9.1.6 How to manage memory consumption within application server deployments
+#### 9.1.7 How to manage memory consumption within application server deployments
 
 Internally, *Lingua* efficiently uses all cores of your CPU in order to speed up loading the language
 models and language detection itself. For this purpose, an internal 
@@ -2367,7 +2367,7 @@ from memory but the thread pool will keep running.
 If you want to try out *Lingua* before you decide whether to use it or not, you can run it in a REPL 
 and immediately see its detection results.
 1. With Gradle: `./gradlew runLinguaOnConsole --console=plain`
-2. Without Gradle: `java -jar lingua-1.2.0-with-dependencies.jar`
+2. Without Gradle: `java -jar lingua-1.2.1-with-dependencies.jar`
 
 Then just play around:
 
@@ -2476,8 +2476,8 @@ Take a look at the [planned issues](https://github.com/pemistahl/lingua/mileston
 [codecov badge]: https://codecov.io/gh/pemistahl/lingua/branch/main/graph/badge.svg
 [codecov url]: https://codecov.io/gh/pemistahl/lingua
 [supported languages badge]: https://img.shields.io/badge/supported%20languages-75-green.svg
-[lingua version badge]: https://img.shields.io/badge/Download%20Jar-1.2.0-blue.svg
-[lingua download url]: https://github.com/pemistahl/lingua/releases/download/v1.2.0/lingua-1.2.0-with-dependencies.jar
+[lingua version badge]: https://img.shields.io/badge/Download%20Jar-1.2.1-blue.svg
+[lingua download url]: https://github.com/pemistahl/lingua/releases/download/v1.2.1/lingua-1.2.1-with-dependencies.jar
 [Kotlin platforms badge]: https://img.shields.io/badge/platforms-JDK%206%2B-blue.svg
 [Kotlin platforms url]: https://kotlinlang.org/docs/reference/server-overview.html
 [license badge]: https://img.shields.io/badge/license-Apache%202.0-blue.svg
@@ -2487,8 +2487,8 @@ Take a look at the [planned issues](https://github.com/pemistahl/lingua/mileston
 [Apache OpenNLP]: https://opennlp.apache.org/docs/1.9.3/manual/opennlp.html#tools.langdetect
 [Optimaize Language Detector]: https://github.com/optimaize/language-detector
 [GitHub Packages]: https://github.com/pemistahl/lingua/packages/766181
-[Maven Central]: https://search.maven.org/artifact/com.github.pemistahl/lingua/1.2.0/jar
-[Maven Central badge]: https://img.shields.io/badge/Maven%20Central-1.2.0-green.svg
+[Maven Central]: https://search.maven.org/artifact/com.github.pemistahl/lingua/1.2.1/jar
+[Maven Central badge]: https://img.shields.io/badge/Maven%20Central-1.2.1-green.svg
 [ACCURACY_PLOTS.md]: https://github.com/pemistahl/lingua/blob/main/ACCURACY_PLOTS.md
 [ACCURACY_TABLE.md]: https://github.com/pemistahl/lingua/blob/main/ACCURACY_TABLE.md
 [accuracy reports url]: https://github.com/pemistahl/lingua/tree/main/accuracy-reports

diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,3 +1,11 @@
+## Lingua 1.2.1 (released on 09 Jun 2022)
+
+### Bug Fixes
+
+- An exception was thrown when trying to detect the language of unigrams and
+  bigrams in low accuracy mode, which operates only on trigrams and larger
+  strings. This has been fixed.
+
 ## Lingua 1.2.0 (released on 07 Jun 2022)
 
 ### Features

diff --git a/gradle.properties b/gradle.properties
@@ -29,7 +29,7 @@ linguaGroupId=com.github.pemistahl
 
 linguaArtifactId=lingua
 
-linguaVersion=1.2.0
+linguaVersion=1.2.1
 
 linguaName=Lingua
 

diff --git a/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt b/src/main/kotlin/com/github/pemistahl/lingua/api/LanguageDetector.kt
@@ -125,6 +125,10 @@ class LanguageDetector internal constructor(
             return values
         }
 
+        if (isLowAccuracyModeEnabled && cleanedUpText.length < 3) {
+            return values
+        }
+
         val ngramSizeRange = if (cleanedUpText.length >= HIGH_ACCURACY_MODE_MAX_TEXT_LENGTH ||
             isLowAccuracyModeEnabled
         ) {

diff --git a/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt b/src/test/kotlin/com/github/pemistahl/lingua/api/LanguageDetectorTest.kt
@@ -980,6 +980,24 @@ class LanguageDetectorTest {
         assertThatAllLanguageModelsAreLoaded()
     }
 
+    @Test
+    fun `assert that low accuracy mode reports unknown language for unigrams and bigrams`() {
+        removeLanguageModelsFromDetector()
+
+        val detector = LanguageDetectorBuilder
+            .fromLanguages(ENGLISH, GERMAN)
+            .withPreloadedLanguageModels()
+            .withLowAccuracyMode()
+            .build()
+
+        assertThat(detector.detectLanguageOf("bed")).isNotEqualTo(UNKNOWN)
+        assertThat(detector.detectLanguageOf("be")).isEqualTo(UNKNOWN)
+        assertThat(detector.detectLanguageOf("b")).isEqualTo(UNKNOWN)
+        assertThat(detector.detectLanguageOf("")).isEqualTo(UNKNOWN)
+
+        addLanguageModelsToDetector()
+    }
+
     private fun assertThatAllLanguageModelsAreUnloaded() {
         assertThat(LanguageDetector.unigramLanguageModels).isEmpty()
         assertThat(LanguageDetector.bigramLanguageModels).isEmpty()