[ML in SE]: IJPL-155930 - add features for case-insensitive exact matching.

GitOrigin-RevId: 533b5120eeced432b9ed9b5bd1aabb9c63b25a16
This commit is contained in:
Samuel Soukup
2024-05-30 21:56:14 +02:00
committed by intellij-monorepo-bot
parent 53bca51df2
commit 45e6712ed8
4 changed files with 210 additions and 6 deletions

View File

@@ -0,0 +1,43 @@
// Copyright 2000-2024 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
package com.intellij.textMatching
import com.intellij.internal.ml.WordsSplitter
import com.intellij.util.text.EditDistance
import org.jetbrains.annotations.ApiStatus
@ApiStatus.Internal
object WholeTextMatchUtil {
  private val wordsSplitter = WordsSplitter.Builder().build()

  /** Prefix used by consumers when building event-field names for these features. */
  const val baseName = "wholeText"

  /**
   * Calculates features between the whole element text and the whole query text.
   *
   * Used to identify exact matches in SE.
   *
   * Both Levenshtein distances are normalized by the query length. For an empty query the divisor
   * is clamped to 1, so the reported values stay finite (the raw edit distance) instead of
   * becoming `NaN` or `Infinity`.
   *
   * @param elementText the text of the found element
   * @param queryText the text of the search query
   * @return A map containing the following features:
   * - "levenshtein_distance": The Levenshtein distance between the element text and query text,
   * normalized by the length of the query text.
   * - "levenshtein_distance_case_insensitive": The Levenshtein distance between the element text and query text,
   * normalized by the length of the query text, ignoring case.
   * - "words_in_query": The number of words in the query text.
   * - "words_in_element": The number of words in the element text.
   * - "exactly_matched_words": The number of positions at which the i-th query word equals the i-th element word.
   * Words are lowercased before the comparison, and only the first `min(words_in_query, words_in_element)`
   * positions are compared.
   */
  fun calculateFeatures(elementText: String, queryText: String): Map<String, Any> {
    // Dividing by a zero-length query would produce NaN/Infinity; normalize by at least 1 instead.
    val normalizer = queryText.length.coerceAtLeast(1)

    // The third argument toggles case sensitivity: true = case-sensitive, false = case-insensitive.
    val levenshteinDistance = EditDistance.levenshtein(elementText, queryText, true).toDouble() / normalizer
    val levenshteinDistanceCaseInsensitive = EditDistance.levenshtein(elementText, queryText, false).toDouble() / normalizer

    val queryWords = wordsSplitter.split(queryText).map { it.lowercase() }
    val elementWords = wordsSplitter.split(elementText).map { it.lowercase() }

    // Positional comparison: zip truncates to the shorter list, so word order matters.
    val matchingWordsCount = queryWords.zip(elementWords).count { (queryWord, elementWord) -> queryWord == elementWord }

    return mapOf(
      "levenshtein_distance" to levenshteinDistance,
      "levenshtein_distance_case_insensitive" to levenshteinDistanceCaseInsensitive,
      "words_in_query" to queryWords.size,
      "words_in_element" to elementWords.size,
      "exactly_matched_words" to matchingWordsCount
    )
  }
}

View File

@@ -0,0 +1,146 @@
// Copyright 2000-2024 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
package com.intellij.textMatching
import junit.framework.TestCase.assertEquals
import org.junit.Test
/**
 * Tests for [WholeTextMatchUtil.calculateFeatures].
 *
 * Each test passes an element text and a query text and checks the complete feature map.
 */
class WholeTextMatchUtilTest {
  /** Asserts that both maps have the same keys and that every key maps to an equal value. */
  private fun assertMapsEquals(expected: Map<*, *>, actual: Map<*, *>) {
    assertEquals(expected.size, actual.size)
    assertEquals(expected.keys, actual.keys)
    for (key in expected.keys) {
      assertEquals("Value not equal for key $key.", expected[key], actual[key])
    }
  }

  /**
   * Runs [WholeTextMatchUtil.calculateFeatures] for the given texts and compares the result
   * with a feature map built from the expected values.
   */
  private fun assertFeatures(
    elementText: String,
    queryText: String,
    levenshtein: Double,
    levenshteinCaseInsensitive: Double,
    wordsInQuery: Int,
    wordsInElement: Int,
    exactlyMatchedWords: Int,
  ) {
    val expected = mapOf<String, Any>(
      "levenshtein_distance" to levenshtein,
      "levenshtein_distance_case_insensitive" to levenshteinCaseInsensitive,
      "words_in_query" to wordsInQuery,
      "words_in_element" to wordsInElement,
      "exactly_matched_words" to exactlyMatchedWords
    )
    assertMapsEquals(expected, WholeTextMatchUtil.calculateFeatures(elementText, queryText))
  }

  @Test
  fun `test one word simple`() {
    assertFeatures(
      elementText = "foo", queryText = "foo",
      levenshtein = 0.0, levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1, wordsInElement = 1, exactlyMatchedWords = 1
    )
  }

  @Test
  fun `test one word Capitalized`() {
    assertFeatures(
      elementText = "foo", queryText = "Foo",
      levenshtein = 1.0 / 3, levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1, wordsInElement = 1, exactlyMatchedWords = 1
    )
  }

  @Test
  fun `test other word Capitalized`() {
    assertFeatures(
      elementText = "Foo", queryText = "foo",
      levenshtein = 1.0 / 3, levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1, wordsInElement = 1, exactlyMatchedWords = 1
    )
  }

  @Test
  fun `test query ALL CAPS`() {
    assertFeatures(
      elementText = "Foo", queryText = "FOO",
      levenshtein = 2.0 / 3, levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1, wordsInElement = 1, exactlyMatchedWords = 1
    )
  }

  @Test
  fun `test multi word query`() {
    assertFeatures(
      elementText = "fooBar", queryText = "FooBar",
      levenshtein = 1.0 / 6, levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 2, wordsInElement = 2, exactlyMatchedWords = 2
    )
  }

  @Test
  fun `test partial query`() {
    assertFeatures(
      elementText = "FooBar", queryText = "foo",
      levenshtein = 4.0 / 3, levenshteinCaseInsensitive = 1.0,
      wordsInQuery = 1, wordsInElement = 2, exactlyMatchedWords = 1
    )
  }

  @Test
  fun `test snake_case to camelCase`() {
    assertFeatures(
      elementText = "foo_bar", queryText = "fooBar",
      levenshtein = 2.0 / 6, levenshteinCaseInsensitive = 1.0 / 6,
      wordsInQuery = 2, wordsInElement = 2, exactlyMatchedWords = 2
    )
  }

  @Test
  fun `test camelCase to snake_case`() {
    assertFeatures(
      elementText = "fooBar", queryText = "foo_bar",
      levenshtein = 2.0 / 7, levenshteinCaseInsensitive = 1.0 / 7,
      wordsInQuery = 2, wordsInElement = 2, exactlyMatchedWords = 2
    )
  }

  @Test
  fun `test space-split words`() {
    assertFeatures(
      elementText = "fooBar", queryText = "foo bar",
      levenshtein = 2.0 / 7, levenshteinCaseInsensitive = 1.0 / 7,
      wordsInQuery = 2, wordsInElement = 2, exactlyMatchedWords = 2
    )
  }

  @Test
  fun `test word order matters`() {
    assertFeatures(
      elementText = "barFoo", queryText = "fooBar",
      levenshtein = 1.0, levenshteinCaseInsensitive = 1.0,
      wordsInQuery = 2, wordsInElement = 2, exactlyMatchedWords = 0
    )
  }

  @Test
  fun `test not starting with first word`() {
    // The query matches only the second word of the element. Words are compared position by
    // position, so "bar" is checked against "foo" and nothing counts as matched.
    // Case-sensitive distance: delete "foo" (3) + replace 'B' with 'b' (1) = 4; case-insensitive: 3.
    assertFeatures(
      elementText = "fooBar", queryText = "bar",
      levenshtein = 4.0 / 3, levenshteinCaseInsensitive = 1.0,
      wordsInQuery = 1, wordsInElement = 2, exactlyMatchedWords = 0
    )
  }
}

View File

@@ -363,7 +363,8 @@ object SearchEverywhereMLStatisticsCollector : CounterUsagesCollector() {
listOf(NAME_LENGTH, ML_SCORE_KEY, SIMILARITY_SCORE, IS_SEMANTIC_ONLY)
}.map { it.name to it }.toTypedArray()
)
nameFeatureToField.putAll(SearchEverywhereElementFeaturesProvider.nameFeatureToField.values.map { it.name to it })
nameFeatureToField.putAll(SearchEverywhereElementFeaturesProvider.prefixMatchingNameFeatureToField.values.map { it.name to it })
nameFeatureToField.putAll(SearchEverywhereElementFeaturesProvider.wholeMatchingNameFeatureToField.values.map { it.name to it })
for (featureProvider in SearchEverywhereElementFeaturesProvider.getFeatureProviders()) {
nameFeatureToField.putAll(featureProvider.getFeaturesDeclarations().map {
it.name to it

View File

@@ -9,6 +9,7 @@ import com.intellij.openapi.extensions.ExtensionPointName
import com.intellij.searchEverywhereMl.ranking.core.searchEverywhereMlRankingService
import com.intellij.textMatching.PrefixMatchingType
import com.intellij.textMatching.PrefixMatchingUtil
import com.intellij.textMatching.WholeTextMatchUtil
import org.jetbrains.annotations.ApiStatus
import kotlin.math.round
@@ -17,8 +18,7 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
constructor(vararg supportedTabs: Class<out SearchEverywhereContributor<*>>) : this(supportedTabs.map { it.simpleName })
companion object {
val EP_NAME: ExtensionPointName<SearchEverywhereElementFeaturesProvider>
= ExtensionPointName.create("com.intellij.searcheverywhere.ml.searchEverywhereElementFeaturesProvider")
val EP_NAME: ExtensionPointName<SearchEverywhereElementFeaturesProvider> = ExtensionPointName.create("com.intellij.searcheverywhere.ml.searchEverywhereElementFeaturesProvider")
fun getFeatureProviders(): List<SearchEverywhereElementFeaturesProvider> {
return EP_NAME.extensionList.filter { getPluginInfo(it.javaClass).isDevelopedByJetBrains() }
@@ -35,7 +35,7 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
internal val SIMILARITY_SCORE = EventFields.Double("similarityScore")
internal val IS_SEMANTIC_ONLY = EventFields.Boolean("isSemanticOnly")
internal val nameFeatureToField = hashMapOf<String, EventField<*>>(
internal val prefixMatchingNameFeatureToField = hashMapOf<String, EventField<*>>(
"prefix_same_start_count" to EventFields.Int("${PrefixMatchingUtil.baseName}SameStartCount"),
"prefix_greedy_score" to EventFields.Double("${PrefixMatchingUtil.baseName}GreedyScore"),
"prefix_greedy_with_case_score" to EventFields.Double("${PrefixMatchingUtil.baseName}GreedyWithCaseScore"),
@@ -45,11 +45,21 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
"prefix_matched_words_with_case_relative" to EventFields.Double("${PrefixMatchingUtil.baseName}MatchedWordsWithCaseRelative"),
"prefix_skipped_words" to EventFields.Int("${PrefixMatchingUtil.baseName}SkippedWords"),
"prefix_matching_type" to EventFields.String(
"${PrefixMatchingUtil.baseName}MatchingType", PrefixMatchingType.values().map { it.name }
"${PrefixMatchingUtil.baseName}MatchingType", PrefixMatchingType.entries.map { it.name }
),
"prefix_exact" to EventFields.Boolean("${PrefixMatchingUtil.baseName}Exact"),
"prefix_matched_last_word" to EventFields.Boolean("${PrefixMatchingUtil.baseName}MatchedLastWord"),
)
// Event fields for the whole-text matching features, keyed by the feature names that
// WholeTextMatchUtil.calculateFeatures puts into its result map.
internal val wholeMatchingNameFeatureToField = hashMapOf<String, EventField<*>>(
  "levenshtein_distance" to
    EventFields.Double("${WholeTextMatchUtil.baseName}LevenshteinDistance",
                       "Levenshtein distance normalized by query length"),
  "levenshtein_distance_case_insensitive" to
    EventFields.Double("${WholeTextMatchUtil.baseName}LevenshteinDistanceCaseInsensitive",
                       "Levenshtein distance with case insensitive matching, normalized by query length"),
  "words_in_query" to EventFields.Int("${WholeTextMatchUtil.baseName}WordsInQuery", "Number of words in the query"),
  "words_in_element" to EventFields.Int("${WholeTextMatchUtil.baseName}WordsInElement", "Number of words in the element text"),
  "exactly_matched_words" to EventFields.Int("${WholeTextMatchUtil.baseName}ExactlyMatchedWords",
                                             "Number of words exactly matched between the query and the element text")
)
internal fun roundDouble(value: Double): Double {
@@ -87,11 +97,14 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
NAME_LENGTH.with(nameOfFoundElement.length)
)
features.forEach { (key, value) ->
val field = nameFeatureToField[key]
val field = prefixMatchingNameFeatureToField[key]
setMatchValueToField(value, field)?.let {
result.add(it)
}
}
result.addAll(WholeTextMatchUtil.calculateFeatures(nameOfFoundElement, searchQuery).map { (key, value) ->
setMatchValueToField(value, wholeMatchingNameFeatureToField[key])
}.filterNotNull())
return result
}
@@ -116,6 +129,7 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
return null
}
}
@ApiStatus.Internal
fun <T> MutableList<EventPair<*>>.putIfValueNotNull(key: EventField<T>, value: T?) {
value?.let {