diff --git a/plugins/grazie/java/src/main/kotlin/com/intellij/grazie/ide/language/java/JavaTextExtractor.java b/plugins/grazie/java/src/main/kotlin/com/intellij/grazie/ide/language/java/JavaTextExtractor.java index 6e0bfa1811a0..cb448d8aa1ff 100644 --- a/plugins/grazie/java/src/main/kotlin/com/intellij/grazie/ide/language/java/JavaTextExtractor.java +++ b/plugins/grazie/java/src/main/kotlin/com/intellij/grazie/ide/language/java/JavaTextExtractor.java @@ -38,21 +38,22 @@ public class JavaTextExtractor extends TextExtractor { .removingIndents(" \t").removingLineSuffixes(" \t"); @Override - public TextContent buildTextContent(@NotNull PsiElement root, @NotNull Set allowedDomains) { + public @NotNull List buildTextContents(@NotNull PsiElement root, @NotNull Set allowedDomains) { if (allowedDomains.contains(DOCUMENTATION)) { if (root instanceof PsiDocComment) { - return HtmlUtilsKt.removeHtml(javadocBuilder.excluding(e -> e instanceof PsiDocTagImpl).build(root, DOCUMENTATION)); + return HtmlUtilsKt.excludeHtml(javadocBuilder.excluding(e -> e instanceof PsiDocTagImpl).build(root, DOCUMENTATION)); } if (root instanceof PsiDocTagImpl) { - return HtmlUtilsKt.removeHtml(javadocBuilder.build(root, DOCUMENTATION)); + return HtmlUtilsKt.excludeHtml(javadocBuilder.build(root, DOCUMENTATION)); } } if (root instanceof PsiCommentImpl && allowedDomains.contains(COMMENTS)) { List roots = PsiUtilsKt.getNotSoDistantSimilarSiblings(root, e -> JAVA_PLAIN_COMMENT_BIT_SET.contains(PsiUtilCore.getElementType(e))); - return TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c -> - TextContentBuilder.FromPsi.removingIndents(" \t*/").removingLineSuffixes(" \t").build(c, COMMENTS))); + return ContainerUtil.createMaybeSingletonList( + TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c -> + TextContentBuilder.FromPsi.removingIndents(" \t*/").removingLineSuffixes(" \t").build(c, COMMENTS)))); } if (root instanceof PsiLiteralExpression && @@ -66,13 +67,13 @@ public class JavaTextExtractor extends TextExtractor { ContainerUtil.map(Text.allOccurrences(Pattern.compile("(?<=\n)" + "\\s{" + indent + "}"), content), Exclusion::exclude)); } content = content.excludeRanges(ContainerUtil.map(Text.allOccurrences(Pattern.compile("\\\\\n"), content), Exclusion::exclude)); - return content.trimWhitespace(); + return ContainerUtil.createMaybeSingletonList(content.trimWhitespace()); } - return content; + return ContainerUtil.createMaybeSingletonList(content); } - return null; + return List.of(); } } diff --git a/plugins/grazie/properties/src/main/kotlin/com/intellij/grazie/ide/language/properties/PropertyTextExtractor.java b/plugins/grazie/properties/src/main/kotlin/com/intellij/grazie/ide/language/properties/PropertyTextExtractor.java index 1221669d3155..7edc46c7b710 100644 --- a/plugins/grazie/properties/src/main/kotlin/com/intellij/grazie/ide/language/properties/PropertyTextExtractor.java +++ b/plugins/grazie/properties/src/main/kotlin/com/intellij/grazie/ide/language/properties/PropertyTextExtractor.java @@ -14,7 +14,6 @@ import com.intellij.psi.PsiElement; import com.intellij.psi.util.PsiUtilCore; import com.intellij.util.containers.ContainerUtil; import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; import java.util.List; import java.util.Set; @@ -28,13 +27,13 @@ public class PropertyTextExtractor extends TextExtractor { private static final Pattern trailingSlash = Pattern.compile("\\\\\n"); @Override - public @Nullable TextContent buildTextContent(@NotNull PsiElement root, - @NotNull Set allowedDomains) { + protected @NotNull List buildTextContents(@NotNull PsiElement root, @NotNull Set allowedDomains) { if (root instanceof PsiComment) { List roots = PsiUtilsKt.getNotSoDistantSimilarSiblings(root, e -> PropertiesTokenTypes.COMMENTS.contains(PsiUtilCore.getElementType(e))); - return TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c -> - TextContentBuilder.FromPsi.removingIndents(" \t#!").build(c, COMMENTS))); + return ContainerUtil.createMaybeSingletonList( + TextContent.joinWithWhitespace('\n', ContainerUtil.mapNotNull(roots, c -> + TextContentBuilder.FromPsi.removingIndents(" \t#!").build(c, COMMENTS)))); } if (PsiUtilCore.getElementType(root) == PropertiesTokenTypes.VALUE_CHARACTERS) { TextContent content = TextContent.builder().build(root, TextContent.TextDomain.PLAIN_TEXT); @@ -61,8 +60,8 @@ public class PropertyTextExtractor extends TextExtractor { } content = content.markUnknown(new TextRange(start, end)); } - return HtmlUtilsKt.removeHtml(content); + return HtmlUtilsKt.excludeHtml(content); } - return null; + return List.of(); } } diff --git a/plugins/grazie/src/main/kotlin/com/intellij/grazie/utils/HtmlUtils.kt b/plugins/grazie/src/main/kotlin/com/intellij/grazie/utils/HtmlUtils.kt index a3da646ced3c..058cf5c9e3a1 100644 --- a/plugins/grazie/src/main/kotlin/com/intellij/grazie/utils/HtmlUtils.kt +++ b/plugins/grazie/src/main/kotlin/com/intellij/grazie/utils/HtmlUtils.kt @@ -4,6 +4,7 @@ package com.intellij.grazie.utils import ai.grazie.nlp.utils.takeNonWhitespaces import com.intellij.grazie.text.TextContent import com.intellij.grazie.text.TextContent.Exclusion +import com.intellij.grazie.text.TextContent.ExclusionKind import com.intellij.openapi.progress.ProgressManager import com.intellij.openapi.util.TextRange import kotlinx.html.* @@ -32,9 +33,45 @@ var TD.valign: String fun FlowContent.nbsp() = +Entities.nbsp -private val anyTag = Pattern.compile("]*>") +private val anyTag = Pattern.compile("]*>") private val closingTag = Pattern.compile("") +@JvmField +val commonBlockElements: Set = + setOf("body", "p", "br", "td", "li", "title", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "table", "ol", "ul") + +private val commonMarkupElements = setOf("span", "i", "b", "u", "font", "a", "s", "strong", "sub", "sup") + +/** + * Remove HTML markup from a text, splitting it at block elements (like {@code

}), + * marking common HTML markup tags (like {@code }) as markup offsets, + * and replacing all other tags with unknown fragments. + */ +fun excludeHtml(content: TextContent?): List { + if (content == null) return emptyList() + + val components = ArrayList() + var lastComponentStart = 0 + var matchEnd = 0 + val matcher = anyTag.matcher(content) + while (matcher.find(matchEnd)) { + matchEnd = matcher.end() + ProgressManager.checkCanceled() + + val tagName = matcher.group(1) + if (tagName in commonBlockElements) { + content.subText(TextRange(lastComponentStart, matcher.start()))?.let(components::add) + lastComponentStart = matcher.end() + } + } + content.subText(TextRange(lastComponentStart, content.length))?.let(components::add) + + @Suppress("DEPRECATION") + return components.mapNotNull { removeHtml(it)?.trimWhitespace() } +} + +/** Remove HTML markup from a text, replacing it with unknown or markup (for some common HTML tags) offsets. */ +@Deprecated("use excludeHtml", ReplaceWith("excludeHtml")) fun removeHtml(_content: TextContent?): TextContent? { var content: TextContent = _content ?: return null @@ -52,6 +89,8 @@ fun removeHtml(_content: TextContent?): TextContent? { else null fun tagClosed(tagName: String) { + if (tagName in commonMarkupElements) return + val openingIndex = exclusions.indexOfLast { openingTagName(it.start, it.end) == tagName && content[it.end - 2] != '/' } if (openingIndex >= 0) { exclusions[openingIndex] = Exclusion.markUnknown(TextRange(exclusions[openingIndex].start, exclusions.last().end)) @@ -59,13 +98,21 @@ fun removeHtml(_content: TextContent?): TextContent? { } } - for (tagRange in Text.allOccurrences(anyTag, content)) { + var matchEnd = 0 + val matcher = anyTag.matcher(content) + while (matcher.find(matchEnd)) { + matchEnd = matcher.end() ProgressManager.checkCanceled() - if (closingTag.matcher(content.subSequence(tagRange.startOffset, tagRange.endOffset)).matches()) { - exclusions.add(Exclusion.markUnknown(tagRange)) - tagClosed(content.substring(tagRange.startOffset + 2, tagRange.endOffset - 1).trim()) - } else if (openingTagName(tagRange.startOffset, tagRange.endOffset) != null) { - exclusions.add(Exclusion.markUnknown(tagRange)) + val matchStart = matcher.start() + val tagName = matcher.group(1) + if (!tagName[0].isLetterOrDigit()) continue + + val exclusionKind = if (tagName in commonMarkupElements) ExclusionKind.markup else ExclusionKind.unknown + if (closingTag.matcher(content.subSequence(matchStart, matchEnd)).matches()) { + exclusions.add(Exclusion(matchStart, matchEnd, exclusionKind)) + tagClosed(content.substring(matchStart + 2, matchEnd - 1).trim()) + } else { + exclusions.add(Exclusion(matchStart, matchEnd, exclusionKind)) } } return content.excludeRanges(exclusions) diff --git a/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java b/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java index a778912b4aa1..05dfc0fe80d6 100644 --- a/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java +++ b/plugins/grazie/src/test/kotlin/com/intellij/grazie/text/TextExtractionTest.java @@ -20,6 +20,7 @@ import com.intellij.psi.util.PsiTreeUtil; import com.intellij.psi.xml.XmlTag; import com.intellij.testFramework.fixtures.BasePlatformTestCase; import com.intellij.tools.ide.metrics.benchmark.Benchmark; +import com.intellij.util.containers.ContainerUtil; import kotlin.text.StringsKt; import one.util.streamex.IntStreamEx; import org.intellij.lang.regexp.RegExpLanguage; @@ -114,11 +115,12 @@ public class TextExtractionTest extends BasePlatformTestCase { } public void testBrokenPropertyMessageFormat() { - assertEquals("a |", unknownOffsets(extractText("a.properties", "a=a {0, choice, 1#1 code fragment|2#{0,number} code fragments", 4))); + assertEquals("a|", unknownOffsets(extractText("a.properties", "a=a {0, choice, 1#1 code fragment|2#{0,number} code fragments", 4))); } public void testExcludePropertyHtml() { - assertEquals("Hello |World", unknownOffsets(extractText("a.properties", "a=Hello

World", 8))); + List texts = extractTexts("a.properties", "a=Hello

World", 8, PsiElement.class); + assertEquals(List.of("Hello", "World"), ContainerUtil.map(texts, TextContentTest::unknownOffsets)); } public void testMultiLineCommentInProperties() { @@ -142,10 +144,11 @@ public class TextExtractionTest extends BasePlatformTestCase { * @return the offset of {@link #bar} in something * @throws Exception when something happens */"""; - TextContent text = extractText("a.java", docText, 6); - assertEquals("Hello |,\nhere's an asterisk: *\nand some |.\ntags1 |\ntags2 |\n|is unknown.", unknownOffsets(text)); + assertEquals( + List.of("Hello |,\nhere's an asterisk: *\nand some |.\ntags1 |\ntags2 |one| two", "three| four|\n|is unknown."), + ContainerUtil.map(extractTexts("a.java", docText, 6, PsiDocComment.class), TextContentTest::unknownOffsets)); - text = extractText("a.java", docText, docText.indexOf("the offset")); + TextContent text = extractText("a.java", docText, docText.indexOf("the offset")); assertEquals("the offset of in something", text.toString()); text = extractText("a.java", docText, docText.indexOf("without")); @@ -274,8 +277,11 @@ public class TextExtractionTest extends BasePlatformTestCase { } assertEquals("|abc|", unknownOffsets(extractText("a.xml", "abc", 4))); - assertEquals("|characters with markup\nand without it|", - unknownOffsets(extractText("a.xml", "and without it", 22))); + { + String text = "and without it"; + assertEquals("characters with markup", unknownOffsets(extractText("a.xml", text, 22))); + assertEquals("and without it|", unknownOffsets(extractText("a.xml", text, 45))); + } assertEquals("abcd efg", unknownOffsets(extractText("a.xml", "", 14))); assertEquals("comment", extractText("a.xml", "", 10).toString()); @@ -339,7 +345,7 @@ public class TextExtractionTest extends BasePlatformTestCase { PsiDocComment comment = PsiTreeUtil.findElementOfClassAtOffset(file, 10, PsiDocComment.class, false); TextExtractor extractor = new JavaTextExtractor(); Benchmark.newBenchmark("TextContent building with HTML removal", () -> { - assertEquals(expected, extractor.buildTextContent(comment, TextContent.TextDomain.ALL).toString()); + assertEquals(expected, assertOneElement(extractor.buildTextContents(comment, TextContent.TextDomain.ALL)).toString()); }).start(); } @@ -362,7 +368,7 @@ public class TextExtractionTest extends BasePlatformTestCase { var literal = PsiTreeUtil.findElementOfClassAtOffset(file, 100, PsiLiteralExpression.class, false); var extractor = new JavaTextExtractor(); Benchmark.newBenchmark("TextContent building from a long text fragment", () -> { - assertEquals(expected, extractor.buildTextContent(literal, TextContent.TextDomain.ALL).toString()); + assertEquals(expected, assertOneElement(extractor.buildTextContents(literal, TextContent.TextDomain.ALL)).toString()); }).start(); } @@ -374,7 +380,7 @@ public class TextExtractionTest extends BasePlatformTestCase { PsiElement tag = PsiTreeUtil.findElementOfClassAtOffset(file, text.indexOf("something"), PsiDocTag.class, false); Benchmark.newBenchmark("TextContent building from complex PSI", () -> { for (int i = 0; i < 10; i++) { - TextContent content = extractor.buildTextContent(tag, TextContent.TextDomain.ALL); + TextContent content = assertOneElement(extractor.buildTextContents(tag, TextContent.TextDomain.ALL)); assertEquals("something if is not too expensive", content.toString()); } }).start(); @@ -412,9 +418,17 @@ public class TextExtractionTest extends BasePlatformTestCase { return extractText(fileName, fileText, offset, getProject()); } - public static TextContent extractText(String fileName, String fileText, int offset, Project project) { + private List extractTexts(String fileName, String text, int offset, Class psi) { + PsiFile file = createFile(fileName, text, getProject()); + return TextExtractor.findTextsAt(PsiTreeUtil.findElementOfClassAtOffset(file, offset, psi, false), TextContent.TextDomain.ALL); + } + + private static PsiFile createFile(String fileName, String fileText, Project project) { FileType fileType = FileTypeManager.getInstance().getFileTypeByFileName(fileName); - PsiFile file = PsiFileFactory.getInstance(project).createFileFromText(fileName, fileType, fileText); - return TextExtractor.findTextAt(file, offset, TextContent.TextDomain.ALL); + return PsiFileFactory.getInstance(project).createFileFromText(fileName, fileType, fileText); + } + + public static TextContent extractText(String fileName, String fileText, int offset, Project project) { + return TextExtractor.findTextAt(createFile(fileName, fileText, project), offset, TextContent.TextDomain.ALL); } } diff --git a/plugins/grazie/src/test/testData/ide/language/java/Docs.java b/plugins/grazie/src/test/testData/ide/language/java/Docs.java index 84604cfb1967..a02206096366 100644 --- a/plugins/grazie/src/test/testData/ide/language/java/Docs.java +++ b/plugins/grazie/src/test/testData/ide/language/java/Docs.java @@ -17,7 +17,7 @@ class ExampleClassWithNoTypos { private String name; /** - * Creates an empty group. + * Creates an empty group. It's a react method. * * @param name The name of the group. And another sentence. */ @@ -60,10 +60,15 @@ class ExampleClassWithNoTypos { */ class ExampleClassWithTypos { + /** + * There can be many mistakes here. It add
+ * + * It add + */ private String name; /** - * Creates an empty group. + * Creates an empty group. It's a react method. * * @param name the name which group */ diff --git a/plugins/grazie/src/test/testData/ide/language/xml/Example.xml b/plugins/grazie/src/test/testData/ide/language/xml/Example.xml index 790e0aadf96a..c04bc51f878a 100644 --- a/plugins/grazie/src/test/testData/ide/language/xml/Example.xml +++ b/plugins/grazie/src/test/testData/ide/language/xml/Example.xml @@ -13,4 +13,11 @@ System.out.println("Hello " + name) Hello John + So that this is possible. And this is an mistake. + ]]> + + There is a possibility of such thing so that this is possible. + diff --git a/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java b/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java index 27b698f665e7..72c27e9afbc1 100644 --- a/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java +++ b/plugins/grazie/xml/main/kotlin/com/intellij/grazie/ide/language/xml/XmlTextExtractor.java @@ -29,7 +29,6 @@ import com.intellij.psi.util.PsiUtilCore; import com.intellij.psi.xml.*; import com.intellij.util.containers.ContainerUtil; import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; import java.util.*; import java.util.function.Function; @@ -49,35 +48,34 @@ public class XmlTextExtractor extends TextExtractor { } @Override - protected @Nullable TextContent buildTextContent(@NotNull PsiElement element, - @NotNull Set allowedDomains) { + protected @NotNull List buildTextContents(@NotNull PsiElement element, @NotNull Set allowedDomains) { if (isText(element) && hasSuitableDialect(element)) { var classifier = tagClassifier(element); PsiElement container = SyntaxTraverser.psiApi().parents(element) .find(e -> e instanceof XmlDocument || e instanceof XmlTag && classifier.apply((XmlTag)e) != TagKind.Inline); if (container != null) { - Map contentsInside = CachedValuesManager.getCachedValue(container, () -> + Map> contentsInside = CachedValuesManager.getCachedValue(container, () -> CachedValueProvider.Result.create(calcContents(container), container)); - return contentsInside.get(element); + return contentsInside.getOrDefault(element, List.of()); } } IElementType type = PsiUtilCore.getElementType(element); if (type == XmlTokenType.XML_COMMENT_CHARACTERS && allowedDomains.contains(COMMENTS) && hasSuitableDialect(element)) { - return builder.build(element, COMMENTS); + return ContainerUtil.createMaybeSingletonList(builder.build(element, COMMENTS)); } if (type == XmlTokenType.XML_ATTRIBUTE_VALUE_TOKEN && allowedDomains.contains(LITERALS) && hasSuitableDialect(element)) { TextContent content = builder.build(element, LITERALS); if (content != null && seemsNatural(content)) { - return content; + return List.of(content); } } - return null; + return List.of(); } - private @NotNull Map calcContents(PsiElement container) { + private @NotNull Map> calcContents(PsiElement container) { if (container instanceof XmlTag && isNonText((XmlTag)container)) { return Collections.emptyMap(); } @@ -88,7 +86,7 @@ public class XmlTextExtractor extends TextExtractor { var fullContent = NotNullLazyValue.lazy(() -> TextContent.psiFragment(PLAIN_TEXT, container)); var visitor = new PsiRecursiveElementWalkingVisitor() { - final Map result = new HashMap<>(); + final Map> result = new HashMap<>(); final List group = new ArrayList<>(); final Set markupIndices = new HashSet<>(); final Set unknownIndices = new HashSet<>(); @@ -115,7 +113,17 @@ public class XmlTextExtractor extends TextExtractor { } if (isText(each)) { - group.add(each); + if (isCdata(each.getParent())) { + List contents = HtmlUtilsKt.excludeHtml( + extractRange(each.getTextRange().shiftLeft(container.getTextRange().getStartOffset()))); + if (!contents.isEmpty()) { // isolate CDATA into its own TextContent set for now; maybe glue to the surrounding texts later + flushGroup(false); + result.put(each, contents); + unknownBefore = false; + } + } else { + group.add(each); + } } else if (PsiUtilCore.getElementType(each) == XmlTokenType.XML_CHAR_ENTITY_REF) { if (HtmlUtilsKt.isSpaceEntity(each.getText())) { @@ -127,6 +135,11 @@ public class XmlTextExtractor extends TextExtractor { super.visitElement(each); } + private TextContent extractRange(TextRange range) { + TextContent full = fullContent.getValue(); + return full.excludeRange(new TextRange(range.getEndOffset(), full.length())).excludeRange(new TextRange(0, range.getStartOffset())); + } + @Override protected void elementFinished(PsiElement element) { super.elementFinished(element); @@ -140,7 +153,7 @@ public class XmlTextExtractor extends TextExtractor { List components = new ArrayList<>(group.size()); for (int i = 0; i < group.size(); i++) { PsiElement e = group.get(i); - TextContent component = extractRange(fullContent.getValue(), e.getTextRange().shiftLeft(containerStart)); + TextContent component = extractRange(e.getTextRange().shiftLeft(containerStart)); component = applyExclusions(i, component, markupIndices, ExclusionKind.markup); component = applyExclusions(i, component, unknownIndices, ExclusionKind.unknown); components.add(component); @@ -152,7 +165,7 @@ public class XmlTextExtractor extends TextExtractor { content = HtmlUtilsKt.inlineSpaceEntities(content.removeIndents(Set.of(' ', '\t'))); if (content != null) { for (PsiElement e : group) { - result.put(e, content); + result.put(e, List.of(content)); } } } @@ -178,15 +191,9 @@ public class XmlTextExtractor extends TextExtractor { return content.toString().contains(" "); } - private static TextContent extractRange(TextContent full, TextRange range) { - return full.excludeRange(new TextRange(range.getEndOffset(), full.length())).excludeRange(new TextRange(0, range.getStartOffset())); - } - private static boolean isText(PsiElement leaf) { PsiElement parent = leaf.getParent(); - if (!(parent instanceof XmlText) && - !(PsiUtilCore.getElementType(parent) == XmlElementType.XML_CDATA && parent.getParent() instanceof XmlText) && - !(parent instanceof XmlDocument)) { + if (!(parent instanceof XmlText) && !isCdata(parent) && !(parent instanceof XmlDocument)) { return false; } @@ -195,6 +202,10 @@ public class XmlTextExtractor extends TextExtractor { type == XmlTokenType.XML_DATA_CHARACTERS; } + private static boolean isCdata(PsiElement element) { + return PsiUtilCore.getElementType(element) == XmlElementType.XML_CDATA; + } + private boolean hasSuitableDialect(@NotNull PsiElement element) { return myEnabledDialects.contains(element.getContainingFile().getLanguage().getClass()); } @@ -216,9 +227,6 @@ public class XmlTextExtractor extends TextExtractor { super(HTMLLanguage.class); } - private static final Set DEFINITELY_BLOCK_TAGS = - Set.of("body", "p", "br", "td", "li", "title", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "table"); - @Override protected Function tagClassifier(@NotNull PsiElement context) { if (!Registry.is("grazie.html.concatenate.inline.tag.contents")) { @@ -230,7 +238,7 @@ public class XmlTextExtractor extends TextExtractor { return tag -> { String name = tag.getName(); if (NON_TEXT_TAGS.contains(name)) return TagKind.Unknown; - if (DEFINITELY_BLOCK_TAGS.contains(name)) return TagKind.Block; + if (HtmlUtilsKt.commonBlockElements.contains(name)) return TagKind.Block; if (inlineTags.contains(name)) return TagKind.Inline; return TagKind.Unknown; };