Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support disabling loading of quadrigram and fivegram models #136

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Support disabling loading of quadrigram and fivegram models
  • Loading branch information
Marcono1234 committed May 22, 2022
commit 461a24378ad297b69c620990fb652718862f8110
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import kotlin.math.ln
class LanguageDetector internal constructor(
internal val languages: MutableSet<Language>,
internal val minimumRelativeDistance: Double,
private val withoutQuadriAndFivegram: Boolean,
isEveryLanguageModelPreloaded: Boolean,
internal val numberOfLoadedLanguages: Int = languages.size,
) {
Expand Down Expand Up @@ -124,7 +125,7 @@ class LanguageDetector internal constructor(
return values
}

val ngramSizeRange = if (cleanedUpText.length >= 120) (3..3) else (1..5)
val ngramSizeRange = if (cleanedUpText.length >= 120) (3..3) else (1..(if (withoutQuadriAndFivegram) 3 else 5))
val allProbabilitiesAndUnigramCounts = ngramSizeRange.filter { i -> cleanedUpText.length >= i }.map { i ->
val testDataModel = TestDataLanguageModel.fromText(cleanedUpText, ngramLength = i)
val probabilities = computeLanguageProbabilities(testDataModel, filteredLanguages)
Expand Down Expand Up @@ -453,22 +454,28 @@ class LanguageDetector internal constructor(
tasks.add(Callable { loadLanguageModels(unigramLanguageModels, language, 1) })
tasks.add(Callable { loadLanguageModels(bigramLanguageModels, language, 2) })
tasks.add(Callable { loadLanguageModels(trigramLanguageModels, language, 3) })
tasks.add(Callable { loadLanguageModels(quadrigramLanguageModels, language, 4) })
tasks.add(Callable { loadLanguageModels(fivegramLanguageModels, language, 5) })

if (!withoutQuadriAndFivegram) {
tasks.add(Callable { loadLanguageModels(quadrigramLanguageModels, language, 4) })
tasks.add(Callable { loadLanguageModels(fivegramLanguageModels, language, 5) })
}
}

threadPool.invokeAll(tasks)
// Call get() to rethrow exceptions which occurred during execution, if any
threadPool.invokeAll(tasks).forEach { it.get() }
}

/**
 * Two detectors are equal when they are configured with the same languages,
 * the same minimum relative distance and the same setting for skipping
 * quadrigram and fivegram models.
 */
override fun equals(other: Any?): Boolean {
    if (this === other) return true
    if (other !is LanguageDetector) return false
    return languages == other.languages &&
        minimumRelativeDistance == other.minimumRelativeDistance &&
        withoutQuadriAndFivegram == other.withoutQuadriAndFivegram
}

override fun hashCode() = 31 * languages.hashCode() + minimumRelativeDistance.hashCode()
// Combines exactly the properties compared by equals(), folded with the usual factor 31.
override fun hashCode(): Int {
    var hash = languages.hashCode()
    hash = 31 * hash + minimumRelativeDistance.hashCode()
    hash = 31 * hash + withoutQuadriAndFivegram.hashCode()
    return hash
}

internal companion object {
internal val unigramLanguageModels = enumMapOf<Language, Object2FloatMap<String>>()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,18 @@ package com.github.pemistahl.lingua.api
class LanguageDetectorBuilder private constructor(
internal val languages: List<Language>,
internal var minimumRelativeDistance: Double = 0.0,
internal var withoutQuadriAndFivegram: Boolean = false,
internal var isEveryLanguageModelPreloaded: Boolean = false
) {
/**
* Creates and returns the configured instance of [LanguageDetector].
*/
fun build() = LanguageDetector(languages.toMutableSet(), minimumRelativeDistance, isEveryLanguageModelPreloaded)
fun build(): LanguageDetector = LanguageDetector(
    // Named arguments keep the two adjacent Boolean flags from being swapped by accident.
    languages = languages.toMutableSet(),
    minimumRelativeDistance = minimumRelativeDistance,
    withoutQuadriAndFivegram = withoutQuadriAndFivegram,
    isEveryLanguageModelPreloaded = isEveryLanguageModelPreloaded,
)

/**
* Sets the desired value for the minimum relative distance measure.
Expand Down Expand Up @@ -58,6 +64,22 @@ class LanguageDetectorBuilder private constructor(
return this
}

/**
* Configures the language detector to not use quadrigram and fivegram language models for
* language detection. This applies both to models which are loaded lazily on demand and to
* models which are [preloaded][withPreloadedLanguageModels] when the detector is built.
Comment on lines +69 to +70
Copy link
Contributor Author

@Marcono1234 Marcono1234 May 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe the wording "dynamically loaded models as well as preloaded models" is a bit misleading. It might not be clear enough what this means by "dynamically loaded", and it might sound as if even a detector with preloaded models could dynamically load models.

Any suggestions for alternative wordings, or is this sentence fine?

*
* Usually quadrigram and fivegram models are quite large and disabling them therefore can
* greatly reduce memory usage during runtime. For texts with at least 120 characters in
* cleaned up form (without punctuation and with normalized whitespace) only trigram models
* are used anyway, so this should not affect the language detection accuracy. However, for
* shorter texts this will make language detection a lot less accurate.
*/
fun withoutQuadrigramAndFivegramModels(): LanguageDetectorBuilder = apply {
    // Only records the choice; build() hands the flag over to the LanguageDetector.
    withoutQuadriAndFivegram = true
}

/**
* Preloads all language models when creating the instance of [LanguageDetector].
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ class LanguageDetectorBuilderTest {

assertThat(builder.languages).isEqualTo(Language.all())
assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
assertThat(builder.withoutQuadriAndFivegram).isFalse
assertThat(builder.isEveryLanguageModelPreloaded).isFalse
assertThat(builder.build()).isEqualTo(
LanguageDetector(
Language.all().toMutableSet(),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -44,6 +46,7 @@ class LanguageDetectorBuilderTest {
LanguageDetector(
Language.all().toMutableSet(),
minimumRelativeDistance = 0.2,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -55,11 +58,13 @@ class LanguageDetectorBuilderTest {

assertThat(builder.languages).isEqualTo(Language.allSpokenOnes())
assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
assertThat(builder.withoutQuadriAndFivegram).isFalse
assertThat(builder.isEveryLanguageModelPreloaded).isFalse
assertThat(builder.build()).isEqualTo(
LanguageDetector(
Language.allSpokenOnes().toMutableSet(),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -69,6 +74,7 @@ class LanguageDetectorBuilderTest {
LanguageDetector(
Language.allSpokenOnes().toMutableSet(),
minimumRelativeDistance = 0.2,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand Down Expand Up @@ -111,11 +117,13 @@ class LanguageDetectorBuilderTest {

assertThat(builder.languages).isEqualTo(expectedLanguages)
assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
assertThat(builder.withoutQuadriAndFivegram).isFalse
assertThat(builder.isEveryLanguageModelPreloaded).isFalse
assertThat(builder.build()).isEqualTo(
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -125,6 +133,7 @@ class LanguageDetectorBuilderTest {
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.2,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -145,11 +154,13 @@ class LanguageDetectorBuilderTest {

assertThat(builder.languages).isEqualTo(expectedLanguages)
assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
assertThat(builder.withoutQuadriAndFivegram).isFalse
assertThat(builder.isEveryLanguageModelPreloaded).isFalse
assertThat(builder.build()).isEqualTo(
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -159,6 +170,7 @@ class LanguageDetectorBuilderTest {
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.2,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -178,11 +190,13 @@ class LanguageDetectorBuilderTest {

assertThat(builder.languages).isEqualTo(expectedLanguages)
assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
assertThat(builder.withoutQuadriAndFivegram).isFalse
assertThat(builder.isEveryLanguageModelPreloaded).isFalse
assertThat(builder.build()).isEqualTo(
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -192,6 +206,7 @@ class LanguageDetectorBuilderTest {
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.2,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -200,6 +215,7 @@ class LanguageDetectorBuilderTest {
LanguageDetector(
expectedLanguages.toMutableSet(),
minimumRelativeDistance = 0.2,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)
)
Expand All @@ -225,4 +241,56 @@ class LanguageDetectorBuilderTest {
}.withMessage(errorMessage)
}
}

@Test
fun `assert that LanguageDetector can be built with preloaded models`() {
    val expectedLanguages = listOf(Language.GERMAN, Language.ENGLISH)
    val builder = LanguageDetectorBuilder
        .fromLanguages(Language.GERMAN, Language.ENGLISH)
        .withPreloadedLanguageModels()

    // The builder must reflect the requested configuration before anything is built.
    assertThat(builder.languages).isEqualTo(expectedLanguages)
    assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
    assertThat(builder.withoutQuadriAndFivegram).isFalse
    assertThat(builder.isEveryLanguageModelPreloaded).isTrue

    // The built detector must equal one constructed directly with the same settings.
    val builtDetector = builder.build()
    assertThat(builtDetector).isEqualTo(
        LanguageDetector(
            languages = expectedLanguages.toMutableSet(),
            minimumRelativeDistance = 0.0,
            withoutQuadriAndFivegram = false,
            isEveryLanguageModelPreloaded = true
        )
    )
}

@Test
fun `assert that LanguageDetector can be built without quadrigram and fivegram models`() {
    val expectedLanguages = listOf(Language.GERMAN, Language.ENGLISH)
    val builder = LanguageDetectorBuilder
        .fromLanguages(Language.GERMAN, Language.ENGLISH)
        .withoutQuadrigramAndFivegramModels()

    // Step 1: only the quadrigram/fivegram switch is set, preloading stays off.
    assertThat(builder.languages).isEqualTo(expectedLanguages)
    assertThat(builder.minimumRelativeDistance).isEqualTo(0.0)
    assertThat(builder.withoutQuadriAndFivegram).isTrue
    assertThat(builder.isEveryLanguageModelPreloaded).isFalse

    val detectorWithoutPreloading = builder.build()
    assertThat(detectorWithoutPreloading).isEqualTo(
        LanguageDetector(
            languages = expectedLanguages.toMutableSet(),
            minimumRelativeDistance = 0.0,
            withoutQuadriAndFivegram = true,
            isEveryLanguageModelPreloaded = false
        )
    )

    // Step 2: enabling preloading afterwards must not reset the quadrigram/fivegram switch.
    builder.withPreloadedLanguageModels()
    assertThat(builder.withoutQuadriAndFivegram).isTrue
    assertThat(builder.isEveryLanguageModelPreloaded).isTrue

    val detectorWithPreloading = builder.build()
    assertThat(detectorWithPreloading).isEqualTo(
        LanguageDetector(
            languages = expectedLanguages.toMutableSet(),
            minimumRelativeDistance = 0.0,
            withoutQuadriAndFivegram = true,
            isEveryLanguageModelPreloaded = true
        )
    )
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -235,12 +235,14 @@ class LanguageDetectorTest {
private var detectorForEnglishAndGerman = LanguageDetector(
languages = mutableSetOf(ENGLISH, GERMAN),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)

private val detectorForAllLanguages = LanguageDetector(
languages = Language.all().toMutableSet(),
minimumRelativeDistance = 0.0,
withoutQuadriAndFivegram = false,
isEveryLanguageModelPreloaded = false
)

Expand Down Expand Up @@ -947,11 +949,23 @@ class LanguageDetectorTest {
fun `assert that language models can be properly unloaded`() {
removeLanguageModelsFromDetector()

assertThat(LanguageDetector.unigramLanguageModels).isEmpty()
assertThat(LanguageDetector.bigramLanguageModels).isEmpty()
assertThat(LanguageDetector.trigramLanguageModels).isEmpty()
assertThat(LanguageDetector.quadrigramLanguageModels).isEmpty()
assertThat(LanguageDetector.fivegramLanguageModels).isEmpty()

val detector = LanguageDetectorBuilder
.fromLanguages(ENGLISH, GERMAN)
.withPreloadedLanguageModels()
.build()

assertThat(LanguageDetector.unigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.bigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.trigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.quadrigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.fivegramLanguageModels).isNotEmpty

detector.unloadLanguageModels()

assertThat(LanguageDetector.unigramLanguageModels).isEmpty()
Expand All @@ -961,6 +975,38 @@ class LanguageDetectorTest {
assertThat(LanguageDetector.fivegramLanguageModels).isEmpty()
}

// Verifies that a detector built with withoutQuadrigramAndFivegramModels() never populates
// the quadrigram and fivegram model maps, neither when preloading nor during detection.
// The maps are accessed statically through LanguageDetector, so the statement order matters.
@Test
fun `assert that loading of quadrigram and fivegram models can be disabled`() {
// Start from a clean slate; the assertions below confirm that all five maps are empty.
removeLanguageModelsFromDetector()

assertThat(LanguageDetector.unigramLanguageModels).isEmpty()
assertThat(LanguageDetector.bigramLanguageModels).isEmpty()
assertThat(LanguageDetector.trigramLanguageModels).isEmpty()
assertThat(LanguageDetector.quadrigramLanguageModels).isEmpty()
assertThat(LanguageDetector.fivegramLanguageModels).isEmpty()

// Building with preloading enabled is expected to fill the model maps right away.
val detector = LanguageDetectorBuilder
.fromLanguages(ENGLISH, GERMAN)
.withoutQuadrigramAndFivegramModels()
.withPreloadedLanguageModels()
.build()

// Only the unigram, bigram and trigram maps may have been filled by preloading.
assertThat(LanguageDetector.unigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.bigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.trigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.quadrigramLanguageModels).isEmpty()
assertThat(LanguageDetector.fivegramLanguageModels).isEmpty()

// Detection of short text should not trigger loading quadrigram and fivegram models
detector.detectLanguageOf("a very short sentence")

// Same expectations as after preloading: detection must not have added higher-order models.
assertThat(LanguageDetector.unigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.bigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.trigramLanguageModels).isNotEmpty
assertThat(LanguageDetector.quadrigramLanguageModels).isEmpty()
assertThat(LanguageDetector.fivegramLanguageModels).isEmpty()
}

private fun addLanguageModelsToDetector() {
LanguageDetector.unigramLanguageModels[ENGLISH] = unigramLanguageModelForEnglish
LanguageDetector.unigramLanguageModels[GERMAN] = unigramLanguageModelForGerman
Expand Down