grazie: understand common HTML markup and block tags in Javadoc, Properties, and CDATA (IJPL-148256, IJPL-149288)

GitOrigin-RevId: c901095fd88b0b3cde9c37a751014de363498f79
2025-12-15 02:59:33 +07:00 · 2024-08-06 22:46:25 +02:00
parent d2c3e31a19
commit 5cc805fce0
7 changed files with 142 additions and 61 deletions
--- a/plugins/grazie/java/src/main/kotlin/com/intellij/grazie/ide/language/java/JavaTextExtractor.java
+++ b/plugins/grazie/java/src/main/kotlin/com/intellij/grazie/ide/language/java/JavaTextExtractor.java
@@ -38,21 +38,22 @@ public class JavaTextExtractor extends TextExtractor {
    .removingIndents(" \t").removingLineSuffixes(" \t");

  @Override
-  public TextContent buildTextContent(@NotNull PsiElement root, @NotNull Set<TextContent.TextDomain> allowedDomains) {
+  public @NotNull List<TextContent> buildTextContents(@NotNull PsiElement root, @NotNull Set<TextContent.TextDomain> allowedDomains) {
    if (allowedDomains.contains(DOCUMENTATION)) {
      if (root instanceof PsiDocComment) {
-        return HtmlUtilsKt.removeHtml(javadocBuilder.excluding(e -> e instanceof PsiDocTagImpl).build(root, DOCUMENTATION));
+        return HtmlUtilsKt.excludeHtml(javadocBuilder.excluding(e -> e instanceof PsiDocTagImpl).build(root, DOCUMENTATION));
      }
      if (root instanceof PsiDocTagImpl) {
-        return HtmlUtilsKt.removeHtml(javadocBuilder.build(root, DOCUMENTATION));
+        return HtmlUtilsKt.excludeHtml(javadocBuilder.build(root, DOCUMENTATION));
      }
    }

    if (root instanceof PsiCommentImpl && allowedDomains.contains(COMMENTS)) {
      List<PsiElement> roots = PsiUtilsKt.getNotSoDistantSimilarSiblings(root, e ->
        JAVA_PLAIN_COMMENT_BIT_SET.contains(PsiUtilCore.getElementType(e)));
-      return TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c ->
-        TextContentBuilder.FromPsi.removingIndents(" \t*/").removingLineSuffixes(" \t").build(c, COMMENTS)));
+      return ContainerUtil.createMaybeSingletonList(
+        TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c ->
+          TextContentBuilder.FromPsi.removingIndents(" \t*/").removingLineSuffixes(" \t").build(c, COMMENTS))));
    }

    if (root instanceof PsiLiteralExpression &&
@@ -66,13 +67,13 @@ public class JavaTextExtractor extends TextExtractor {
            ContainerUtil.map(Text.allOccurrences(Pattern.compile("(?<=\n)" + "\\s{" + indent + "}"), content), Exclusion::exclude));
        }
        content = content.excludeRanges(ContainerUtil.map(Text.allOccurrences(Pattern.compile("\\\\\n"), content), Exclusion::exclude));
-        return content.trimWhitespace();
+        return ContainerUtil.createMaybeSingletonList(content.trimWhitespace());
      }

-      return content;
+      return ContainerUtil.createMaybeSingletonList(content);
    }

-    return null;
+    return List.of();
  }

 }
--- a/plugins/grazie/properties/src/main/kotlin/com/intellij/grazie/ide/language/properties/PropertyTextExtractor.java
+++ b/plugins/grazie/properties/src/main/kotlin/com/intellij/grazie/ide/language/properties/PropertyTextExtractor.java
@@ -14,7 +14,6 @@ import com.intellij.psi.PsiElement;
 import com.intellij.psi.util.PsiUtilCore;
 import com.intellij.util.containers.ContainerUtil;
 import org.jetbrains.annotations.NotNull;
-import org.jetbrains.annotations.Nullable;

 import java.util.List;
 import java.util.Set;
@@ -28,13 +27,13 @@ public class PropertyTextExtractor extends TextExtractor {
  private static final Pattern trailingSlash = Pattern.compile("\\\\\n");

  @Override
-  public @Nullable TextContent buildTextContent(@NotNull PsiElement root,
-                                                @NotNull Set<TextContent.TextDomain> allowedDomains) {
+  protected @NotNull List<TextContent> buildTextContents(@NotNull PsiElement root, @NotNull Set<TextContent.TextDomain> allowedDomains) {
    if (root instanceof PsiComment) {
      List<PsiElement> roots = PsiUtilsKt.getNotSoDistantSimilarSiblings(root, e ->
        PropertiesTokenTypes.COMMENTS.contains(PsiUtilCore.getElementType(e)));
-      return TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c ->
-        TextContentBuilder.FromPsi.removingIndents(" \t#!").build(c, COMMENTS)));
+      return ContainerUtil.createMaybeSingletonList(
+        TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c ->
+          TextContentBuilder.FromPsi.removingIndents(" \t#!").build(c, COMMENTS))));
    }
    if (PsiUtilCore.getElementType(root) == PropertiesTokenTypes.VALUE_CHARACTERS) {
      TextContent content = TextContent.builder().build(root, TextContent.TextDomain.PLAIN_TEXT);
@@ -61,8 +60,8 @@ public class PropertyTextExtractor extends TextExtractor {
        }
        content = content.markUnknown(new TextRange(start, end));
      }
-      return HtmlUtilsKt.removeHtml(content);
+      return HtmlUtilsKt.excludeHtml(content);
    }
-    return null;
+    return List.of();
  }
 }
--- a/plugins/grazie/src/main/kotlin/com/intellij/grazie/utils/HtmlUtils.kt
+++ b/plugins/grazie/src/main/kotlin/com/intellij/grazie/utils/HtmlUtils.kt
@@ -4,6 +4,7 @@ package com.intellij.grazie.utils
 import ai.grazie.nlp.utils.takeNonWhitespaces
 import com.intellij.grazie.text.TextContent
 import com.intellij.grazie.text.TextContent.Exclusion
+import com.intellij.grazie.text.TextContent.ExclusionKind
 import com.intellij.openapi.progress.ProgressManager
 import com.intellij.openapi.util.TextRange
 import kotlinx.html.*
@@ -32,9 +33,45 @@ var TD.valign: String

 fun FlowContent.nbsp() = +Entities.nbsp

-private val anyTag = Pattern.compile("</?\\w+[^>]*>")
+private val anyTag = Pattern.compile("</?(\\w+)[^>]*>")
 private val closingTag = Pattern.compile("</\\w+\\s*>")

+@JvmField
+val commonBlockElements: Set<String> =
+  setOf("body", "p", "br", "td", "li", "title", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "table", "ol", "ul")
+
+private val commonMarkupElements = setOf("span", "i", "b", "u", "font", "a", "s", "strong", "sub", "sup")
+
+/**
+ * Remove HTML markup from a text, splitting it at block elements (like `<p>`),
+ * marking common HTML markup tags (like `<i>`) as markup offsets,
+ * and replacing all other tags with unknown fragments.
+ */
+fun excludeHtml(content: TextContent?): List<TextContent> {
+  if (content == null) return emptyList()
+
+  val components = ArrayList<TextContent>()
+  var lastComponentStart = 0
+  var matchEnd = 0
+  val matcher = anyTag.matcher(content)
+  while (matcher.find(matchEnd)) {
+    matchEnd = matcher.end()
+    ProgressManager.checkCanceled()
+
+    val tagName = matcher.group(1)
+    if (tagName in commonBlockElements) {
+      content.subText(TextRange(lastComponentStart, matcher.start()))?.let(components::add)
+      lastComponentStart = matcher.end()
+    }
+  }
+  content.subText(TextRange(lastComponentStart, content.length))?.let(components::add)
+
+  @Suppress("DEPRECATION")
+  return components.mapNotNull { removeHtml(it)?.trimWhitespace() }
+}
+
+/** Remove HTML markup from a text, replacing it with unknown or markup (for some common HTML tags) offsets. */
+@Deprecated("use excludeHtml", ReplaceWith("excludeHtml"))
 fun removeHtml(_content: TextContent?): TextContent? {
  var content: TextContent = _content ?: return null

@@ -52,6 +89,8 @@ fun removeHtml(_content: TextContent?): TextContent? {
    else null

  fun tagClosed(tagName: String) {
+    if (tagName in commonMarkupElements) return
+
    val openingIndex = exclusions.indexOfLast { openingTagName(it.start, it.end) == tagName && content[it.end - 2] != '/' }
    if (openingIndex >= 0) {
      exclusions[openingIndex] = Exclusion.markUnknown(TextRange(exclusions[openingIndex].start, exclusions.last().end))
@@ -59,13 +98,21 @@ fun removeHtml(_content: TextContent?): TextContent? {
    }
  }

-  for (tagRange in Text.allOccurrences(anyTag, content)) {
+  var matchEnd = 0
+  val matcher = anyTag.matcher(content)
+  while (matcher.find(matchEnd)) {
+    matchEnd = matcher.end()
    ProgressManager.checkCanceled()
-    if (closingTag.matcher(content.subSequence(tagRange.startOffset, tagRange.endOffset)).matches()) {
-      exclusions.add(Exclusion.markUnknown(tagRange))
-      tagClosed(content.substring(tagRange.startOffset + 2, tagRange.endOffset - 1).trim())
-    } else if (openingTagName(tagRange.startOffset, tagRange.endOffset) != null) {
-      exclusions.add(Exclusion.markUnknown(tagRange))
+    val matchStart = matcher.start()
+    val tagName = matcher.group(1)
+    if (!tagName[0].isLetterOrDigit()) continue
+
+    val exclusionKind = if (tagName in commonMarkupElements) ExclusionKind.markup else ExclusionKind.unknown
+    if (closingTag.matcher(content.subSequence(matchStart, matchEnd)).matches()) {
+      exclusions.add(Exclusion(matchStart, matchEnd, exclusionKind))
+      tagClosed(content.substring(matchStart + 2, matchEnd - 1).trim())
+    } else {
+      exclusions.add(Exclusion(matchStart, matchEnd, exclusionKind))
    }
  }
  return content.excludeRanges(exclusions)
--- a/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java
+++ b/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java
@@ -20,6 +20,7 @@ import com.intellij.psi.util.PsiTreeUtil;
 import com.intellij.psi.xml.XmlTag;
 import com.intellij.testFramework.fixtures.BasePlatformTestCase;
 import com.intellij.tools.ide.metrics.benchmark.Benchmark;
+import com.intellij.util.containers.ContainerUtil;
 import kotlin.text.StringsKt;
 import one.util.streamex.IntStreamEx;
 import org.intellij.lang.regexp.RegExpLanguage;
@@ -114,11 +115,12 @@ public class TextExtractionTest extends BasePlatformTestCase {
  }

  public void testBrokenPropertyMessageFormat() {
-    assertEquals("a |", unknownOffsets(extractText("a.properties", "a=a {0, choice, 1#1 code fragment|2#{0,number} code fragments", 4)));
+    assertEquals("a|", unknownOffsets(extractText("a.properties", "a=a {0, choice, 1#1 code fragment|2#{0,number} code fragments", 4)));
  }

  public void testExcludePropertyHtml() {
-    assertEquals("Hello |World", unknownOffsets(extractText("a.properties", "a=<html>Hello <p/>World</html>", 8)));
+    List<TextContent> texts = extractTexts("a.properties", "a=<html>Hello <p/><i>World</i></html>", 8, PsiElement.class);
+    assertEquals(List.of("Hello", "World"), ContainerUtil.map(texts, TextContentTest::unknownOffsets));
  }

  public void testMultiLineCommentInProperties() {
@@ -142,10 +144,11 @@ public class TextExtractionTest extends BasePlatformTestCase {
      * @return the offset of {@link #bar} in something
      * @throws Exception when something happens
       */""";
-    TextContent text = extractText("a.java", docText, 6);
-    assertEquals("Hello |,\nhere's an asterisk: *\nand some |.\ntags1 |\ntags2 |\n|is unknown.", unknownOffsets(text));
+    assertEquals(
+      List.of("Hello |,\nhere's an asterisk: *\nand some |.\ntags1 |\ntags2 |one| two", "three| four|\n|is unknown."),
+      ContainerUtil.map(extractTexts("a.java", docText, 6, PsiDocComment.class), TextContentTest::unknownOffsets));

-    text = extractText("a.java", docText, docText.indexOf("the offset"));
+    TextContent text = extractText("a.java", docText, docText.indexOf("the offset"));
    assertEquals("the offset of  in something", text.toString());

    text = extractText("a.java", docText, docText.indexOf("without"));
@@ -274,8 +277,11 @@ public class TextExtractionTest extends BasePlatformTestCase {
    }
    assertEquals("|abc|", unknownOffsets(extractText("a.xml", "<b>abc</b>", 4)));

-    assertEquals("|characters with markup\nand without it|",
-                 unknownOffsets(extractText("a.xml", "<b><![CDATA[\n   characters with markup\n]]>and without it</b>", 22)));
+    {
+      String text = "<b><![CDATA[\n   characters with markup\n]]>and without it</b>";
+      assertEquals("characters with markup", unknownOffsets(extractText("a.xml", text, 22)));
+      assertEquals("and without it|", unknownOffsets(extractText("a.xml", text, 45)));
+    }

    assertEquals("abcd efg", unknownOffsets(extractText("a.xml", "<tag attr=\"abcd efg\"/>", 14)));
    assertEquals("comment", extractText("a.xml", "<!-- comment -->", 10).toString());
@@ -339,7 +345,7 @@ public class TextExtractionTest extends BasePlatformTestCase {
    PsiDocComment comment = PsiTreeUtil.findElementOfClassAtOffset(file, 10, PsiDocComment.class, false);
    TextExtractor extractor = new JavaTextExtractor();
    Benchmark.newBenchmark("TextContent building with HTML removal", () -> {
-      assertEquals(expected, extractor.buildTextContent(comment, TextContent.TextDomain.ALL).toString());
+      assertEquals(expected, assertOneElement(extractor.buildTextContents(comment, TextContent.TextDomain.ALL)).toString());
    }).start();
  }

@@ -362,7 +368,7 @@ public class TextExtractionTest extends BasePlatformTestCase {
    var literal = PsiTreeUtil.findElementOfClassAtOffset(file, 100, PsiLiteralExpression.class, false);
    var extractor = new JavaTextExtractor();
    Benchmark.newBenchmark("TextContent building from a long text fragment", () -> {
-      assertEquals(expected, extractor.buildTextContent(literal, TextContent.TextDomain.ALL).toString());
+      assertEquals(expected, assertOneElement(extractor.buildTextContents(literal, TextContent.TextDomain.ALL)).toString());
    }).start();
  }

@@ -374,7 +380,7 @@ public class TextExtractionTest extends BasePlatformTestCase {
    PsiElement tag = PsiTreeUtil.findElementOfClassAtOffset(file, text.indexOf("something"), PsiDocTag.class, false);
    Benchmark.newBenchmark("TextContent building from complex PSI", () -> {
      for (int i = 0; i < 10; i++) {
-        TextContent content = extractor.buildTextContent(tag, TextContent.TextDomain.ALL);
+        TextContent content = assertOneElement(extractor.buildTextContents(tag, TextContent.TextDomain.ALL));
        assertEquals("something if  is not too expensive", content.toString());
      }
    }).start();
@@ -412,9 +418,17 @@ public class TextExtractionTest extends BasePlatformTestCase {
    return extractText(fileName, fileText, offset, getProject());
  }

-  public static TextContent extractText(String fileName, String fileText, int offset, Project project) {
+  private List<TextContent> extractTexts(String fileName, String text, int offset, Class<? extends PsiElement> psi) {
+    PsiFile file = createFile(fileName, text, getProject());
+    return TextExtractor.findTextsAt(PsiTreeUtil.findElementOfClassAtOffset(file, offset, psi, false), TextContent.TextDomain.ALL);
+  }
+
+  private static PsiFile createFile(String fileName, String fileText, Project project) {
    FileType fileType = FileTypeManager.getInstance().getFileTypeByFileName(fileName);
-    PsiFile file = PsiFileFactory.getInstance(project).createFileFromText(fileName, fileType, fileText);
-    return TextExtractor.findTextAt(file, offset, TextContent.TextDomain.ALL);
+    return PsiFileFactory.getInstance(project).createFileFromText(fileName, fileType, fileText);
+  }
+
+  public static TextContent extractText(String fileName, String fileText, int offset, Project project) {
+    return TextExtractor.findTextAt(createFile(fileName, fileText, project), offset, TextContent.TextDomain.ALL);
  }
 }
--- a/plugins/grazie/src/test/testData/ide/language/java/Docs.java
+++ b/plugins/grazie/src/test/testData/ide/language/java/Docs.java
@@ -17,7 +17,7 @@ class ExampleClassWithNoTypos<T> {
    private String name;

    /**
-     * Creates an empty group.
+     * Creates an empty group. It's a <b>react</b> method.
     *
     * @param name The name of the group. And another sentence.
     */
@@ -60,10 +60,15 @@ class ExampleClassWithNoTypos<T> {
 */
 class ExampleClassWithTypos<T> {

+   /**
+   * There can be many mistakes here. It <GRAMMAR_ERROR descr="IT_VBZ">add</GRAMMAR_ERROR><br>
+    *
+    * <b>It <GRAMMAR_ERROR descr="IT_VBZ">add</GRAMMAR_ERROR></b>
+   */
    private String name;

    /**
-     * Creates an empty group.
+     * Creates an empty group. It's a <GRAMMAR_ERROR descr="A_GOOGLE">react</GRAMMAR_ERROR> method.
     *
     * @param name the <GRAMMAR_ERROR descr="COMMA_WHICH">name which</GRAMMAR_ERROR> group
     */
--- a/plugins/grazie/src/test/testData/ide/language/xml/Example.xml
+++ b/plugins/grazie/src/test/testData/ide/language/xml/Example.xml
@@ -13,4 +13,11 @@
    System.out.println("Hello " + name)
    Hello John
  </code>
+  <description-multiple-paragraphs><![CDATA[
+      There is a possibility of such thing.<p>So that this is possible. And this is <GRAMMAR_ERROR descr="EN_A_VS_AN">an</GRAMMAR_ERROR> mistake.
+  ]]></description-multiple-paragraphs>
+  <description-single-sentence>
+    There is a <GRAMMAR_ERROR descr="POSSIBILTY_POSSIBLE">possibility</GRAMMAR_ERROR> of such thing so that this is possible.
+  </description-single-sentence
+  >
 </shiporder>
--- a/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java
+++ b/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java
@@ -29,7 +29,6 @@ import com.intellij.psi.util.PsiUtilCore;
 import com.intellij.psi.xml.*;
 import com.intellij.util.containers.ContainerUtil;
 import org.jetbrains.annotations.NotNull;
-import org.jetbrains.annotations.Nullable;

 import java.util.*;
 import java.util.function.Function;
@@ -49,35 +48,34 @@ public class XmlTextExtractor extends TextExtractor {
  }

  @Override
-  protected @Nullable TextContent buildTextContent(@NotNull PsiElement element,
-                                                   @NotNull Set<TextContent.TextDomain> allowedDomains) {
+  protected @NotNull List<TextContent> buildTextContents(@NotNull PsiElement element, @NotNull Set<TextContent.TextDomain> allowedDomains) {
    if (isText(element) && hasSuitableDialect(element)) {
      var classifier = tagClassifier(element);
      PsiElement container = SyntaxTraverser.psiApi().parents(element)
        .find(e -> e instanceof XmlDocument || e instanceof XmlTag && classifier.apply((XmlTag)e) != TagKind.Inline);
      if (container != null) {
-        Map<PsiElement, TextContent> contentsInside = CachedValuesManager.getCachedValue(container, () ->
+        Map<PsiElement, List<TextContent>> contentsInside = CachedValuesManager.getCachedValue(container, () ->
          CachedValueProvider.Result.create(calcContents(container), container));
-        return contentsInside.get(element);
+        return contentsInside.getOrDefault(element, List.of());
      }
    }

    IElementType type = PsiUtilCore.getElementType(element);
    if (type == XmlTokenType.XML_COMMENT_CHARACTERS && allowedDomains.contains(COMMENTS) && hasSuitableDialect(element)) {
-      return builder.build(element, COMMENTS);
+      return ContainerUtil.createMaybeSingletonList(builder.build(element, COMMENTS));
    }

    if (type == XmlTokenType.XML_ATTRIBUTE_VALUE_TOKEN && allowedDomains.contains(LITERALS) && hasSuitableDialect(element)) {
      TextContent content = builder.build(element, LITERALS);
      if (content != null && seemsNatural(content)) {
-        return content;
+        return List.of(content);
      }
    }

-    return null;
+    return List.of();
  }

-  private @NotNull Map<PsiElement, TextContent> calcContents(PsiElement container) {
+  private @NotNull Map<PsiElement, List<TextContent>> calcContents(PsiElement container) {
    if (container instanceof XmlTag && isNonText((XmlTag)container)) {
      return Collections.emptyMap();
    }
@@ -88,7 +86,7 @@ public class XmlTextExtractor extends TextExtractor {
    var fullContent = NotNullLazyValue.lazy(() -> TextContent.psiFragment(PLAIN_TEXT, container));

    var visitor = new PsiRecursiveElementWalkingVisitor() {
-      final Map<PsiElement, TextContent> result = new HashMap<>();
+      final Map<PsiElement, List<TextContent>> result = new HashMap<>();
      final List<PsiElement> group = new ArrayList<>();
      final Set<Integer> markupIndices = new HashSet<>();
      final Set<Integer> unknownIndices = new HashSet<>();
@@ -115,7 +113,17 @@ public class XmlTextExtractor extends TextExtractor {
        }

        if (isText(each)) {
-          group.add(each);
+          if (isCdata(each.getParent())) {
+            List<TextContent> contents = HtmlUtilsKt.excludeHtml(
+              extractRange(each.getTextRange().shiftLeft(container.getTextRange().getStartOffset())));
+            if (!contents.isEmpty()) { // isolate CDATA into its own TextContent set for now; maybe glue to the surrounding texts later
+              flushGroup(false);
+              result.put(each, contents);
+              unknownBefore = false;
+            }
+          } else {
+            group.add(each);
+          }
        }
        else if (PsiUtilCore.getElementType(each) == XmlTokenType.XML_CHAR_ENTITY_REF) {
          if (HtmlUtilsKt.isSpaceEntity(each.getText())) {
@@ -127,6 +135,11 @@ public class XmlTextExtractor extends TextExtractor {
        super.visitElement(each);
      }

+      private TextContent extractRange(TextRange range) {
+        TextContent full = fullContent.getValue();
+        return full.excludeRange(new TextRange(range.getEndOffset(), full.length())).excludeRange(new TextRange(0, range.getStartOffset()));
+      }
+
      @Override
      protected void elementFinished(PsiElement element) {
        super.elementFinished(element);
@@ -140,7 +153,7 @@ public class XmlTextExtractor extends TextExtractor {
        List<TextContent> components = new ArrayList<>(group.size());
        for (int i = 0; i < group.size(); i++) {
          PsiElement e = group.get(i);
-          TextContent component = extractRange(fullContent.getValue(), e.getTextRange().shiftLeft(containerStart));
+          TextContent component = extractRange(e.getTextRange().shiftLeft(containerStart));
          component = applyExclusions(i, component, markupIndices, ExclusionKind.markup);
          component = applyExclusions(i, component, unknownIndices, ExclusionKind.unknown);
          components.add(component);
@@ -152,7 +165,7 @@ public class XmlTextExtractor extends TextExtractor {
          content = HtmlUtilsKt.inlineSpaceEntities(content.removeIndents(Set.of(' ', '\t')));
          if (content != null) {
            for (PsiElement e : group) {
-              result.put(e, content);
+              result.put(e, List.of(content));
            }
          }
        }
@@ -178,15 +191,9 @@ public class XmlTextExtractor extends TextExtractor {
    return content.toString().contains(" ");
  }

-  private static TextContent extractRange(TextContent full, TextRange range) {
-    return full.excludeRange(new TextRange(range.getEndOffset(), full.length())).excludeRange(new TextRange(0, range.getStartOffset()));
-  }
-
  private static boolean isText(PsiElement leaf) {
    PsiElement parent = leaf.getParent();
-    if (!(parent instanceof XmlText) &&
-        !(PsiUtilCore.getElementType(parent) == XmlElementType.XML_CDATA && parent.getParent() instanceof XmlText) &&
-        !(parent instanceof XmlDocument)) {
+    if (!(parent instanceof XmlText) && !isCdata(parent) && !(parent instanceof XmlDocument)) {
      return false;
    }

@@ -195,6 +202,10 @@ public class XmlTextExtractor extends TextExtractor {
           type == XmlTokenType.XML_DATA_CHARACTERS;
  }

+  private static boolean isCdata(PsiElement element) {
+    return PsiUtilCore.getElementType(element) == XmlElementType.XML_CDATA;
+  }
+
  private boolean hasSuitableDialect(@NotNull PsiElement element) {
    return myEnabledDialects.contains(element.getContainingFile().getLanguage().getClass());
  }
@@ -216,9 +227,6 @@ public class XmlTextExtractor extends TextExtractor {
      super(HTMLLanguage.class);
    }

-    private static final Set<String> DEFINITELY_BLOCK_TAGS =
-      Set.of("body", "p", "br", "td", "li", "title", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "table");
-
    @Override
    protected Function<XmlTag, TagKind> tagClassifier(@NotNull PsiElement context) {
      if (!Registry.is("grazie.html.concatenate.inline.tag.contents")) {
@@ -230,7 +238,7 @@ public class XmlTextExtractor extends TextExtractor {
      return tag -> {
        String name = tag.getName();
        if (NON_TEXT_TAGS.contains(name)) return TagKind.Unknown;
-        if (DEFINITELY_BLOCK_TAGS.contains(name)) return TagKind.Block;
+        if (HtmlUtilsKt.commonBlockElements.contains(name)) return TagKind.Block;
        if (inlineTags.contains(name)) return TagKind.Inline;
        return TagKind.Unknown;
      };