More efficient optimal alignment edit distance calculation

GitOrigin-RevId: 98b40c9d5aee4ccf3fe82baf0cd7c3b60407f8d0
This commit is contained in:
Bas Leijdekkers
2022-10-06 16:41:09 +02:00
committed by intellij-monorepo-bot
parent 3a2f86df34
commit 254a11b3cf
4 changed files with 70 additions and 29 deletions

View File

@@ -104,7 +104,7 @@ public final class MisspelledHeaderInspection extends LocalInspectionTool {
private void addMatches(String headerName, Collection<String> headers, SortedSet<? super Suggestion> matches) {
for (String candidate : headers) {
int distance = EditDistance.optimalAlignment(headerName, candidate, false);
int distance = EditDistance.optimalAlignment(headerName, candidate, false, MAX_DISTANCE);
if (distance <= MAX_DISTANCE) {
matches.add(new Suggestion(candidate, distance));
}

View File

@@ -1,4 +1,4 @@
// Copyright 2000-2020 JetBrains s.r.o. Use of this source code is governed by the Apache 2.0 license that can be found in the LICENSE file.
// Copyright 2000-2022 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
package com.intellij.util.text;
import org.jetbrains.annotations.NotNull;
@@ -20,20 +20,71 @@ public final class EditDistance {
}
public static int optimalAlignment(@NotNull CharSequence str1, @NotNull CharSequence str2, boolean caseSensitive) {
// extension of the above with additional case of adjacent transpositions
// (http://en.wikipedia.org/wiki/Damerau-Levenshtein_distance#Optimal_string_alignment_distance)
int[][] d = prepare(str1.length(), str2.length());
for (int i = 1; i <= str1.length(); i++) {
for (int j = 1; j <= str2.length(); j++) {
int cost = equal(str1.charAt(i - 1), str2.charAt(j - 1), caseSensitive) ? 0 : 1;
d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
if (i > 1 && j > 1 &&
equal(str1.charAt(i - 1), str2.charAt(j - 2), caseSensitive) && equal(str1.charAt(i - 2), str2.charAt(j - 1), caseSensitive)) {
d[i][j] = Math.min(d[i][j], d[i - 2][j - 2] + cost);
return optimalAlignment(str1, str2, caseSensitive, Integer.MAX_VALUE);
}
/**
 * Extension of the Levenshtein distance with the additional case of adjacent transpositions
 * (<a href="https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance#Optimal_string_alignment_distance">optimal string alignment distance</a>),
 * calculated row by row in the manner of the Wagner-Fischer algorithm.
 * Uses three rows of n+1 ints instead of an n*m matrix, where n is the length of the shorter string and m the length of the longer one.
 *
 * @param str1 first string to compare
 * @param str2 second string to compare
 * @param caseSensitive specify true to compare case sensitively, false to ignore case
 * @param limit when the distance becomes greater than the limit, further processing stops.
 *              To save cpu cycles on strings that are too different.
 * @return the number of edits (char insertions, deletions, replacements and swaps of adjacent chars) between the two strings.
 * When that number is greater than {@code limit}, the result is only guaranteed to be greater than {@code limit};
 * it is not necessarily the exact distance.
 */
public static int optimalAlignment(@NotNull CharSequence str1, @NotNull CharSequence str2, boolean caseSensitive, int limit) {
  if (str1.length() > str2.length()) {
    // make str1 the shorter string, so the three row buffers below get the smaller size
    final CharSequence tmp = str1;
    str1 = str2;
    str2 = tmp;
  }
  final int length1 = str1.length();
  final int length2 = str2.length();
  if (length1 == 0) {
    // also covers two empty strings, because length1 <= length2 at this point
    return length2;
  }
  if (length2 - length1 > limit) {
    // at least (length2 - length1) insertions are needed, so the limit is certainly exceeded;
    // report the same value as the early exit inside the loop below
    return limit + 1;
  }

  // three rows of length n + 1 instead of an n*m two dimensional array
  int[] v0 = new int[length1 + 1]; // row before the previous one, only read for transpositions
  int[] v1 = new int[length1 + 1]; // previous row
  int[] v2 = new int[length1 + 1]; // row being calculated
  for (int i = 1; i <= length1; i++) {
    v1[i] = i;
  }

  for (int j = 0; j < length2; j++) {
    v2[0] = j + 1;
    int rowMin = Integer.MAX_VALUE;
    for (int i = 0; i < length1; i++) {
      final int cost = equal(str1.charAt(i), str2.charAt(j), caseSensitive) ? 0 : 1;
      int current = min(v2[i] + 1,      // insertion
                        v1[i + 1] + 1,  // deletion
                        v1[i] + cost);  // substitution (replacement)
      if (i > 0 && j > 0 &&
          equal(str2.charAt(j), str1.charAt(i - 1), caseSensitive) && equal(str1.charAt(i), str2.charAt(j - 1), caseSensitive)) {
        // transposition (swap)
        current = Math.min(current, v0[i - 1] + cost);
      }
      v2[i + 1] = current;
      rowMin = Math.min(rowMin, current);
    }
    if (rowMin > limit) {
      // every cell of this row is already over the limit, so stop here
      return limit + 1;
    }
    final int[] temp = v0;
    v0 = v1;
    v1 = v2;
    v2 = temp; // will be overwritten/reused
  }
  // our last action in the loop above was to switch arrays,
  // so v1 has the most recent cost counts
  return v1[length1];
}
private static int[][] prepare(int length1, int length2) {

View File

@@ -1,18 +1,4 @@
/*
* Copyright 2000-2015 JetBrains s.r.o.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Copyright 2000-2022 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
package com.intellij.util.text;
import org.junit.Test;
@@ -41,18 +27,22 @@ public class EditDistanceTest {
/** Case-sensitive optimal alignment distance: empty input, a single swap, case mismatches and mixed edits. */
@Test
public void optimalAlignment() {
  final boolean caseSensitive = true;

  // empty input
  assertEquals(0, EditDistance.optimalAlignment("", "", caseSensitive));
  assertEquals(2, EditDistance.optimalAlignment("", "ba", caseSensitive));

  // a swap of adjacent chars counts as one edit, but only when the chars match exactly
  assertEquals(1, EditDistance.optimalAlignment("ab", "ba", caseSensitive));
  assertEquals(2, EditDistance.optimalAlignment("AB", "ba", caseSensitive));

  // mixed edits
  assertEquals(3, EditDistance.optimalAlignment("ca", "abc", caseSensitive));
  assertEquals(4, EditDistance.optimalAlignment("abcd", "BADC", caseSensitive));
  assertEquals(3, EditDistance.optimalAlignment("Ca", "abc", caseSensitive));
}
/** Same inputs as the case-sensitive test, compared while ignoring case. */
@Test
public void optimalAlignmentCaseInsensitive() {
  final boolean caseSensitive = false;

  // empty input
  assertEquals(0, EditDistance.optimalAlignment("", "", caseSensitive));
  assertEquals(2, EditDistance.optimalAlignment("", "ba", caseSensitive));

  // a swap of adjacent chars counts as one edit, regardless of case
  assertEquals(1, EditDistance.optimalAlignment("ab", "ba", caseSensitive));
  assertEquals(1, EditDistance.optimalAlignment("AB", "ba", caseSensitive));

  // mixed edits
  assertEquals(3, EditDistance.optimalAlignment("ca", "abc", caseSensitive));
  assertEquals(2, EditDistance.optimalAlignment("abcd", "BADC", caseSensitive));
  assertEquals(3, EditDistance.optimalAlignment("Ca", "abc", caseSensitive));
}
}

View File

@@ -117,7 +117,7 @@ public class HtmlUnknownAttributeInspectionBase extends HtmlUnknownElementInspec
XmlAttributeDescriptor[] descriptors = descriptor.getAttributesDescriptors(tag);
int initialSize = quickfixes.size();
for (XmlAttributeDescriptor attr : descriptors) {
if (EditDistance.optimalAlignment(name, attr.getName(), false) <= 1) {
if (EditDistance.optimalAlignment(name, attr.getName(), false, 1) <= 1) {
quickfixes.add(new RenameAttributeFix(attr));
}
if (quickfixes.size() >= initialSize + 3) break;