[grazie] speed up text extraction on large broken XMLs

GitOrigin-RevId: 7d52d3c86ed284e8aa704a29fd64cf20444b9245
2026-01-06 11:50:54 +07:00 · 2022-05-30 13:10:11 +02:00
parent 1b21925d46
commit 1ac04d4675
2 changed files with 32 additions and 8 deletions
--- a/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java
+++ b/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java
@@ -213,7 +213,25 @@ public class TextExtractionTest extends BasePlatformTestCase {
        myFixture.type(' ');
        PsiDocumentManager.getInstance(getProject()).commitAllDocuments(); // drop file caches
      })
-      .usesAllCPUCores()
+      .assertTiming();
+  }
+
+  public void testLargeXmlWithUnclosedDoctypePerformance() { // timing test: text extraction over a big XML whose DOCTYPE is never closed
+    String text = "<!DOCTYPE rules [\n<!ENTITY some \"x\">\n<rules> " + // DOCTYPE internal subset is opened with '[' and never terminated
+                  IntStreamEx.range(0, 10_000).mapToObj(i -> "<tag> content" + i + "</tag>\n").joining() + // 10,000 one-line <tag> elements
+                  " </rules>";
+    PsiFile file = myFixture.configureByText("a.xml", text);
+
+    PlatformTestUtil
+      .startPerformanceTest("text extraction", 1_500, () -> { // NOTE(review): 1_500 is presumably the expected time in ms - confirm against PlatformTestUtil
+        for (PsiElement element : SyntaxTraverser.psiTraverser(file)) { // visit every PSI element of the file
+          TextExtractor.findTextsAt(element, TextContent.TextDomain.ALL); // result is discarded; only the time spent matters here
+        }
+      })
+      .setup(() -> { // NOTE(review): presumably run before each measured attempt - confirm
+        myFixture.type(' ');
+        PsiDocumentManager.getInstance(getProject()).commitAllDocuments(); // drop file caches
+      })
      .assertTiming();
  }

--- a/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java
+++ b/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java
@@ -24,9 +24,7 @@ import com.intellij.psi.tree.IElementType;
 import com.intellij.psi.util.CachedValueProvider;
 import com.intellij.psi.util.CachedValuesManager;
 import com.intellij.psi.util.PsiUtilCore;
-import com.intellij.psi.xml.XmlDocument;
-import com.intellij.psi.xml.XmlTag;
-import com.intellij.psi.xml.XmlTokenType;
+import com.intellij.psi.xml.*;
 import com.intellij.util.containers.ContainerUtil;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
@@ -51,8 +49,7 @@ public class XmlTextExtractor extends TextExtractor {
  @Override
  protected @Nullable TextContent buildTextContent(@NotNull PsiElement element,
                                                   @NotNull Set<TextContent.TextDomain> allowedDomains) {
-    IElementType type = PsiUtilCore.getElementType(element);
-    if (isText(type) && hasSuitableDialect(element)) {
+    if (isText(element) && hasSuitableDialect(element)) {
      var classifier = tagClassifier(element);
      PsiElement container = SyntaxTraverser.psiApi().parents(element)
        .find(e -> e instanceof XmlDocument || e instanceof XmlTag && classifier.apply((XmlTag)e) != TagKind.Inline);
@@ -63,6 +60,7 @@ public class XmlTextExtractor extends TextExtractor {
      }
    }

+    IElementType type = PsiUtilCore.getElementType(element);
    if (type == XmlTokenType.XML_COMMENT_CHARACTERS && allowedDomains.contains(COMMENTS) && hasSuitableDialect(element)) {
      return builder.build(element, COMMENTS);
    }
@@ -105,7 +103,7 @@ public class XmlTextExtractor extends TextExtractor {
          unknownBefore = true;
        }

-        if (isText(PsiUtilCore.getElementType(each))) {
+        if (isText(each)) {
          group.add(each);
        }
        super.visitElement(each);
@@ -138,7 +136,15 @@ public class XmlTextExtractor extends TextExtractor {
    return full.excludeRange(new TextRange(range.getEndOffset(), full.length())).excludeRange(new TextRange(0, range.getStartOffset()));
  }

-  private static boolean isText(IElementType type) {
+  private static boolean isText(PsiElement leaf) { // tells whether a leaf PSI element counts as XML text content
+    PsiElement parent = leaf.getParent();
+    if (!(parent instanceof XmlText) && // accepted position 1: directly inside an XmlText
+        !(PsiUtilCore.getElementType(parent) == XmlElementType.XML_CDATA && parent.getParent() instanceof XmlText) && // accepted position 2: inside a CDATA node whose parent is an XmlText
+        !(parent instanceof XmlDocument)) { // accepted position 3: directly under the XmlDocument
+      return false; // any other parent: not text, regardless of the leaf's token type
+    }
+
+    IElementType type = PsiUtilCore.getElementType(leaf); // in an accepted position, the token type decides
     return type == XmlTokenType.XML_WHITE_SPACE || type == TokenType.WHITE_SPACE ||
           type == XmlTokenType.XML_CHAR_ENTITY_REF || type == XmlTokenType.XML_DATA_CHARACTERS;
  }