mirror of
https://gitflic.ru/project/openide/openide.git
synced 2026-04-21 14:01:44 +07:00
[ML in SE]: IJPL-155930 - add features for case-insensitive exact matching.
GitOrigin-RevId: 533b5120eeced432b9ed9b5bd1aabb9c63b25a16
This commit is contained in:
committed by
intellij-monorepo-bot
parent
53bca51df2
commit
45e6712ed8
@@ -0,0 +1,43 @@
|
||||
// Copyright 2000-2024 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
|
||||
package com.intellij.textMatching
|
||||
|
||||
import com.intellij.internal.ml.WordsSplitter
|
||||
import com.intellij.util.text.EditDistance
|
||||
import org.jetbrains.annotations.ApiStatus
|
||||
|
||||
@ApiStatus.Internal
object WholeTextMatchUtil {
  private val wordsSplitter = WordsSplitter.Builder().build()
  const val baseName = "wholeText"

  /**
   * Calculates features between the whole element text and the whole query text.
   *
   * Used to identify exact matches in Search Everywhere (SE).
   *
   * Both Levenshtein features are divided by the query length. For an empty query the divisor
   * is clamped to 1, so the features stay finite (they equal the raw edit distance) instead of
   * becoming `NaN` or `Infinity`.
   *
   * Words are produced by [WordsSplitter], lowercased, and compared position by position:
   * the first query word with the first element word, the second with the second, and so on.
   * Words of the longer list that have no counterpart are not compared.
   *
   * @param elementText the whole text of the element
   * @param queryText the whole text of the query
   * @return A map containing the following features:
   * - "levenshtein_distance": The Levenshtein distance between the element text and query text,
   *    normalized by the length of the query text.
   * - "levenshtein_distance_case_insensitive": The Levenshtein distance between the element text and query text,
   *    normalized by the length of the query text, ignoring case.
   * - "words_in_query": The number of words in the query text.
   * - "words_in_element": The number of words in the element text.
   * - "exactly_matched_words": The number of positions at which the lowercased query word equals
   *    the lowercased element word.
   */
  fun calculateFeatures(elementText: String, queryText: String): Map<String, Any> {
    // An empty query would otherwise turn both distances into NaN (0.0 / 0) or Infinity (n / 0).
    val normalizer = queryText.length.coerceAtLeast(1)

    val levenshteinDistance = EditDistance.levenshtein(elementText, queryText, true).toDouble() / normalizer
    val levenshteinDistanceCaseInsensitive = EditDistance.levenshtein(elementText, queryText, false).toDouble() / normalizer

    val queryWords = wordsSplitter.split(queryText).map { it.lowercase() }
    val elementWords = wordsSplitter.split(elementText).map { it.lowercase() }
    // zip stops at the shorter list, so only words sharing a position are compared.
    val matchingWordsCount = queryWords.zip(elementWords).count { (queryWord, elementWord) -> queryWord == elementWord }

    return mapOf(
      "levenshtein_distance" to levenshteinDistance,
      "levenshtein_distance_case_insensitive" to levenshteinDistanceCaseInsensitive,
      "words_in_query" to queryWords.size,
      "words_in_element" to elementWords.size,
      "exactly_matched_words" to matchingWordsCount
    )
  }
}
|
||||
@@ -0,0 +1,146 @@
|
||||
// Copyright 2000-2024 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
|
||||
package com.intellij.textMatching
|
||||
|
||||
import junit.framework.TestCase.assertEquals
|
||||
import org.junit.Test
|
||||
|
||||
/**
 * Tests for [WholeTextMatchUtil.calculateFeatures].
 *
 * Every test passes an element text and a query text and checks the complete feature map.
 * Expected distances are written as `editDistance / queryLength` so they are computed
 * with the same floating-point division as the production code.
 */
class WholeTextMatchUtilTest {
  /** Asserts that both maps have the same keys and equal values, naming the key of any mismatching value. */
  private fun assertMapsEquals(expected: Map<*, *>, actual: Map<*, *>) {
    assertEquals(expected.size, actual.size)
    assertEquals(expected.keys, actual.keys)
    for (key in expected.keys) {
      assertEquals("Value not equal for key $key.", expected[key], actual[key])
    }
  }

  /** Calculates the features for the given texts and compares them with the expected values. */
  private fun assertFeatures(
    elementText: String,
    queryText: String,
    levenshtein: Double,
    levenshteinCaseInsensitive: Double,
    wordsInQuery: Int,
    wordsInElement: Int,
    exactlyMatchedWords: Int,
  ) {
    val expected = mapOf<String, Any>(
      "levenshtein_distance" to levenshtein,
      "levenshtein_distance_case_insensitive" to levenshteinCaseInsensitive,
      "words_in_query" to wordsInQuery,
      "words_in_element" to wordsInElement,
      "exactly_matched_words" to exactlyMatchedWords
    )
    assertMapsEquals(expected, WholeTextMatchUtil.calculateFeatures(elementText, queryText))
  }

  @Test
  fun `test one word simple`() {
    assertFeatures(
      "foo", "foo",
      levenshtein = 0.0,
      levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1,
      wordsInElement = 1,
      exactlyMatchedWords = 1,
    )
  }

  @Test
  fun `test one word Capitalized`() {
    assertFeatures(
      "foo", "Foo",
      levenshtein = 1.0 / 3,
      levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1,
      wordsInElement = 1,
      exactlyMatchedWords = 1,
    )
  }

  @Test
  fun `test other word Capitalized`() {
    assertFeatures(
      "Foo", "foo",
      levenshtein = 1.0 / 3,
      levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1,
      wordsInElement = 1,
      exactlyMatchedWords = 1,
    )
  }

  @Test
  fun `test query ALL CAPS`() {
    assertFeatures(
      "Foo", "FOO",
      levenshtein = 2.0 / 3,
      levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 1,
      wordsInElement = 1,
      exactlyMatchedWords = 1,
    )
  }

  @Test
  fun `test multi word query`() {
    assertFeatures(
      "fooBar", "FooBar",
      levenshtein = 1.0 / 6,
      levenshteinCaseInsensitive = 0.0,
      wordsInQuery = 2,
      wordsInElement = 2,
      exactlyMatchedWords = 2,
    )
  }

  @Test
  fun `test partial query`() {
    assertFeatures(
      "FooBar", "foo",
      levenshtein = 4.0 / 3,
      levenshteinCaseInsensitive = 1.0,
      wordsInQuery = 1,
      wordsInElement = 2,
      exactlyMatchedWords = 1,
    )
  }

  @Test
  fun `test snake_case to camelCase`() {
    assertFeatures(
      "foo_bar", "fooBar",
      levenshtein = 2.0 / 6,
      levenshteinCaseInsensitive = 1.0 / 6,
      wordsInQuery = 2,
      wordsInElement = 2,
      exactlyMatchedWords = 2,
    )
  }

  @Test
  fun `test camelCase to snake_case`() {
    assertFeatures(
      "fooBar", "foo_bar",
      levenshtein = 2.0 / 7,
      levenshteinCaseInsensitive = 1.0 / 7,
      wordsInQuery = 2,
      wordsInElement = 2,
      exactlyMatchedWords = 2,
    )
  }

  @Test
  fun `test space-split words`() {
    assertFeatures(
      "fooBar", "foo bar",
      levenshtein = 2.0 / 7,
      levenshteinCaseInsensitive = 1.0 / 7,
      wordsInQuery = 2,
      wordsInElement = 2,
      exactlyMatchedWords = 2,
    )
  }

  @Test
  fun `test word order matters`() {
    assertFeatures(
      "barFoo", "fooBar",
      levenshtein = 1.0,
      levenshteinCaseInsensitive = 1.0,
      wordsInQuery = 2,
      wordsInElement = 2,
      exactlyMatchedWords = 0,
    )
  }

  @Test
  fun `test not starting with first word`() {
    // The query equals the tail of the element, but words are compared position by position,
    // so the extra leading element word shifts every pair and nothing matches.
    // Case-sensitive distance: delete "baz" (3) + replace 'F' with 'f' (1) = 4.
    assertFeatures(
      "bazFooBar", "fooBar",
      levenshtein = 4.0 / 6,
      levenshteinCaseInsensitive = 3.0 / 6,
      wordsInQuery = 2,
      wordsInElement = 3,
      exactlyMatchedWords = 0,
    )
  }
}
|
||||
@@ -363,7 +363,8 @@ object SearchEverywhereMLStatisticsCollector : CounterUsagesCollector() {
|
||||
listOf(NAME_LENGTH, ML_SCORE_KEY, SIMILARITY_SCORE, IS_SEMANTIC_ONLY)
|
||||
}.map { it.name to it }.toTypedArray()
|
||||
)
|
||||
nameFeatureToField.putAll(SearchEverywhereElementFeaturesProvider.nameFeatureToField.values.map { it.name to it })
|
||||
nameFeatureToField.putAll(SearchEverywhereElementFeaturesProvider.prefixMatchingNameFeatureToField.values.map { it.name to it })
|
||||
nameFeatureToField.putAll(SearchEverywhereElementFeaturesProvider.wholeMatchingNameFeatureToField.values.map { it.name to it })
|
||||
for (featureProvider in SearchEverywhereElementFeaturesProvider.getFeatureProviders()) {
|
||||
nameFeatureToField.putAll(featureProvider.getFeaturesDeclarations().map {
|
||||
it.name to it
|
||||
|
||||
@@ -9,6 +9,7 @@ import com.intellij.openapi.extensions.ExtensionPointName
|
||||
import com.intellij.searchEverywhereMl.ranking.core.searchEverywhereMlRankingService
|
||||
import com.intellij.textMatching.PrefixMatchingType
|
||||
import com.intellij.textMatching.PrefixMatchingUtil
|
||||
import com.intellij.textMatching.WholeTextMatchUtil
|
||||
import org.jetbrains.annotations.ApiStatus
|
||||
import kotlin.math.round
|
||||
|
||||
@@ -17,8 +18,7 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
|
||||
constructor(vararg supportedTabs: Class<out SearchEverywhereContributor<*>>) : this(supportedTabs.map { it.simpleName })
|
||||
|
||||
companion object {
|
||||
val EP_NAME: ExtensionPointName<SearchEverywhereElementFeaturesProvider>
|
||||
= ExtensionPointName.create("com.intellij.searcheverywhere.ml.searchEverywhereElementFeaturesProvider")
|
||||
val EP_NAME: ExtensionPointName<SearchEverywhereElementFeaturesProvider> = ExtensionPointName.create("com.intellij.searcheverywhere.ml.searchEverywhereElementFeaturesProvider")
|
||||
|
||||
fun getFeatureProviders(): List<SearchEverywhereElementFeaturesProvider> {
|
||||
return EP_NAME.extensionList.filter { getPluginInfo(it.javaClass).isDevelopedByJetBrains() }
|
||||
@@ -35,7 +35,7 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
|
||||
internal val SIMILARITY_SCORE = EventFields.Double("similarityScore")
|
||||
internal val IS_SEMANTIC_ONLY = EventFields.Boolean("isSemanticOnly")
|
||||
|
||||
internal val nameFeatureToField = hashMapOf<String, EventField<*>>(
|
||||
internal val prefixMatchingNameFeatureToField = hashMapOf<String, EventField<*>>(
|
||||
"prefix_same_start_count" to EventFields.Int("${PrefixMatchingUtil.baseName}SameStartCount"),
|
||||
"prefix_greedy_score" to EventFields.Double("${PrefixMatchingUtil.baseName}GreedyScore"),
|
||||
"prefix_greedy_with_case_score" to EventFields.Double("${PrefixMatchingUtil.baseName}GreedyWithCaseScore"),
|
||||
@@ -45,11 +45,21 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
|
||||
"prefix_matched_words_with_case_relative" to EventFields.Double("${PrefixMatchingUtil.baseName}MatchedWordsWithCaseRelative"),
|
||||
"prefix_skipped_words" to EventFields.Int("${PrefixMatchingUtil.baseName}SkippedWords"),
|
||||
"prefix_matching_type" to EventFields.String(
|
||||
"${PrefixMatchingUtil.baseName}MatchingType", PrefixMatchingType.values().map { it.name }
|
||||
"${PrefixMatchingUtil.baseName}MatchingType", PrefixMatchingType.entries.map { it.name }
|
||||
),
|
||||
"prefix_exact" to EventFields.Boolean("${PrefixMatchingUtil.baseName}Exact"),
|
||||
"prefix_matched_last_word" to EventFields.Boolean("${PrefixMatchingUtil.baseName}MatchedLastWord"),
|
||||
)
|
||||
/**
 * Event fields for the whole-text matching features.
 *
 * The keys are the feature names used in the map returned by [WholeTextMatchUtil.calculateFeatures];
 * the field names are prefixed with [WholeTextMatchUtil.baseName].
 */
internal val wholeMatchingNameFeatureToField = hashMapOf<String, EventField<*>>(
  "levenshtein_distance" to EventFields.Double(
    "${WholeTextMatchUtil.baseName}LevenshteinDistance",
    "Levenshtein distance normalized by query length"
  ),
  "levenshtein_distance_case_insensitive" to EventFields.Double(
    "${WholeTextMatchUtil.baseName}LevenshteinDistanceCaseInsensitive",
    "Levenshtein distance with case insensitive matching, normalized by query length"
  ),
  "words_in_query" to EventFields.Int(
    "${WholeTextMatchUtil.baseName}WordsInQuery",
    "Number of words in the query"
  ),
  "words_in_element" to EventFields.Int(
    "${WholeTextMatchUtil.baseName}WordsInElement",
    "Number of words in the element text"
  ),
  // Described like its siblings; the wording mirrors how calculateFeatures counts the matches.
  "exactly_matched_words" to EventFields.Int(
    "${WholeTextMatchUtil.baseName}ExactlyMatchedWords",
    "Number of words matched position by position between the query and the element text, ignoring case"
  ),
)
|
||||
|
||||
|
||||
internal fun roundDouble(value: Double): Double {
|
||||
@@ -87,11 +97,14 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
|
||||
NAME_LENGTH.with(nameOfFoundElement.length)
|
||||
)
|
||||
features.forEach { (key, value) ->
|
||||
val field = nameFeatureToField[key]
|
||||
val field = prefixMatchingNameFeatureToField[key]
|
||||
setMatchValueToField(value, field)?.let {
|
||||
result.add(it)
|
||||
}
|
||||
}
|
||||
result.addAll(WholeTextMatchUtil.calculateFeatures(nameOfFoundElement, searchQuery).map { (key, value) ->
|
||||
setMatchValueToField(value, wholeMatchingNameFeatureToField[key])
|
||||
}.filterNotNull())
|
||||
return result
|
||||
}
|
||||
|
||||
@@ -116,6 +129,7 @@ abstract class SearchEverywhereElementFeaturesProvider(private val supportedCont
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
@ApiStatus.Internal
|
||||
fun <T> MutableList<EventPair<*>>.putIfValueNotNull(key: EventField<T>, value: T?) {
|
||||
value?.let {
|
||||
|
||||
Reference in New Issue
Block a user