master-inspectopedia-collector

Cleaner checks for br element

Merge branch 'master' into master-inspectopedia-collector

Collecting CWE ids


Merge-request: IJ-MR-126396
Merged-by: Egor Malyshev <egor.malyshev@jetbrains.com>

GitOrigin-RevId: 6bec313f9bf5a255bd81e922ed2e0b34fc6e4304
This commit is contained in:
Egor Malyshev
2024-02-19 14:34:45 +00:00
committed by intellij-monorepo-bot
parent 0b5363a00e
commit 63d7aa8192
3 changed files with 136 additions and 126 deletions

View File

@@ -8,8 +8,7 @@ import com.fasterxml.jackson.databind.SerializationFeature;
import com.fasterxml.jackson.databind.json.JsonMapper;
import com.intellij.codeInspection.InspectionEP;
import com.intellij.codeInspection.InspectionProfileEntry;
import com.intellij.codeInspection.ex.InspectionToolWrapper;
import com.intellij.codeInspection.ex.ScopeToolState;
import com.intellij.codeInspection.ex.*;
import com.intellij.codeInspection.options.*;
import com.intellij.ide.plugins.PluginManager;
import com.intellij.inspectopedia.extractor.data.Inspection;
@@ -18,6 +17,7 @@ import com.intellij.inspectopedia.extractor.data.Plugin;
import com.intellij.inspectopedia.extractor.data.Plugins;
import com.intellij.inspectopedia.extractor.utils.HtmlUtils;
import com.intellij.openapi.application.ApplicationInfo;
import com.intellij.openapi.application.ApplicationManager;
import com.intellij.openapi.application.ApplicationStarter;
import com.intellij.openapi.diagnostic.Logger;
import com.intellij.openapi.project.Project;
@@ -97,6 +97,11 @@ final class InspectopediaExtractor implements ApplicationStarter {
availablePlugins.put(IDE_NAME, new Plugin(IDE_NAME, IDE_NAME, IDE_VERSION));
final InspectionMetaInformationService
service = ApplicationManager.getApplication().getService(InspectionMetaInformationService.class);
final MetaInformationState inspectionsExtraState = service == null ? null : (MetaInformationState)service.getState(null);
for (final ScopeToolState scopeToolState : scopeToolStates) {
final InspectionToolWrapper<?, ?> wrapper = scopeToolState.getTool();
@@ -118,13 +123,16 @@ final class InspectopediaExtractor implements ApplicationStarter {
catch (Throwable t) {
LOG.info("Cannot create options panel " + wrapper.getShortName(), t);
}
final MetaInformation metaInformation = inspectionsExtraState == null ? null : inspectionsExtraState.getInspections().get(wrapper.getID());
final List<Integer> cweIds = metaInformation == null ? null : metaInformation.getCweIds();
final String language = wrapper.getLanguage();
final String briefDescription = HtmlUtils.cleanupHtml(description[0], language);
final String extendedDescription = description.length > 1 ? HtmlUtils.cleanupHtml(description[1], language) : null;
final Inspection inspection = new Inspection(wrapper.getShortName(), wrapper.getDisplayName(), wrapper.getDefaultLevel().getName(),
language, briefDescription,
extendedDescription, Arrays.asList(wrapper.getGroupPath()), wrapper.applyToDialects(),
wrapper.isCleanupTool(), wrapper.isEnabledByDefault(), panelInfo);
wrapper.isCleanupTool(), wrapper.isEnabledByDefault(), panelInfo, cweIds);
availablePlugins.get(pluginId).addInspection(inspection);
}

View File

@@ -27,6 +27,7 @@ public class Inspection implements Comparable<Inspection> {
public String extendedDescription = "";
public boolean hasOptionsPanel = false;
public List<OptionsPanelInfo> options = null;
public List<Integer> cweIds = null;
public Inspection(String id,
String name,
@@ -38,7 +39,8 @@ public class Inspection implements Comparable<Inspection> {
boolean appliesToDialects,
boolean partOfCodeCleanup,
boolean enabledByDefault,
List<OptionsPanelInfo> options) {
List<OptionsPanelInfo> options,
List<Integer> cweIds) {
this.id = id;
this.name = name;
this.severity = severity;
@@ -51,6 +53,7 @@ public class Inspection implements Comparable<Inspection> {
this.isEnabledDefault = enabledByDefault;
this.hasOptionsPanel = options != null;
this.options = options;
this.cweIds = cweIds;
}
public Inspection() {

View File

@@ -17,130 +17,129 @@ import java.util.Iterator;
import java.util.List;
public final class HtmlUtils {
public static final Safelist SAFELIST = new Safelist();
public static final Safelist SAFELIST = new Safelist();
static {
SAFELIST.addTags("a", "b", "code", "i", "li", "list", "p", "s", "u");
static {
SAFELIST.addTags("a", "b", "code", "i", "li", "list", "p", "s", "u");
}
// Tag renames applied by cleanupHtml: every element matched by the first value (a selector)
// gets its tag name replaced with the second value.
private static final List<Pair<String, String>> RENAME_MAP = List.of(
Pair.create("ul", "list"),
Pair.create("th", "td"),
Pair.create("c", "code"),
Pair.create("strong", "b"),
Pair.create("small", "font"),
Pair.create("span", "control"),
Pair.create("blockquote", "tip"),
Pair.create("em", "i")
);
// Selectors whose matches cleanupHtml removes from the document together with their content.
// NOTE: despite the name this is a list of selectors, not a map.
private static final List<String> REMOVE_MAP = List.of(
"hr",
"br",
// <code> elements whose text is empty or whitespace only
"code:matches(^\\s*$)"
);
// Selectors whose matches cleanupHtml unwraps: the element itself is dropped and its children
// are moved up into its parent. Also a list of selectors, not a map.
private static final List<String> UNWRAP_MAP = List.of(
"tbody",
"pre",
// any element that is a direct child of a <code style="block">
"code[style=block] > *"
);
/**
 * Rewrites an HTML fragment into the reduced markup used for extracted inspection descriptions.
 * <p>
 * Steps, in order: rename tags per {@code RENAME_MAP}; turn {@code ol} into
 * {@code <list style="decimal">}; mark {@code <pre><code>} blocks with {@code style="block"} and a
 * {@code lang} attribute; unwrap everything matched by {@code UNWRAP_MAP}; unwrap non-anchor
 * children of {@code code}; split the children of every element that contains a {@code br} into
 * {@code p} paragraphs; remove everything matched by {@code REMOVE_MAP}; flatten nested
 * paragraphs and drop whitespace-only ones.
 *
 * @param source                HTML to convert
 * @param languageForCodeBlocks value for the {@code lang} attribute of code blocks;
 *                              {@code "Text"} is used when {@code null}
 * @return inner HTML of the resulting document body, serialized with XML syntax and without
 * pretty-printing
 */
@NotNull
public static String cleanupHtml(@NotNull String source, @Nullable String languageForCodeBlocks) {
  final Document document = Jsoup.parse(source);

  RENAME_MAP.forEach(map -> document.select(map.first).tagName(map.second));

  final Elements ol = document.select("ol");
  ol.tagName("list");
  ol.attr("style", "decimal");

  // Code blocks must be tagged BEFORE the unwrap pass. UNWRAP_MAP unwraps every <pre>, so selecting
  // "pre > code" afterwards matches nothing: style/lang would never be set and the
  // "code[style=block] > *" unwrap rule would have nothing to act on.
  final Elements codeBlock = document.select("pre > code");
  codeBlock.attr("style", "block");
  codeBlock.attr("lang", languageForCodeBlocks == null ? "Text" : languageForCodeBlocks);

  UNWRAP_MAP.forEach(map -> document.select(map).unwrap());

  // Inside <code> only links keep their markup; any other child element is flattened.
  document.select("code > *").stream()
    .filter(element -> !element.tagName().equals("a"))
    .forEach(Node::unwrap);

  // For every element that directly contains a <br>: wrap each run of inline children in its own <p>,
  // using block children and <br> elements as the run separators.
  document.select("br").stream().map(Element::parent)
    .distinct()
    .forEach(parent -> {
      // Pair.first == true: run of inline nodes to wrap in a new <p>;
      // Pair.first == false: a single block node that stays a direct child of parent.
      final List<Pair<Boolean, List<Node>>> groups = new ArrayList<>();
      final List<Node> inlineElements = new ArrayList<>();
      final Iterator<Node> childNodes = parent.childNodes().iterator();
      // Groups are collected first and the DOM is only mutated afterwards.
      while (childNodes.hasNext()) {
        final Node childNode = childNodes.next();
        final boolean block = isBlockElement(childNode);
        if (!block) {
          inlineElements.add(childNode);
        }
        // A block child or the end of the child list closes the current inline run.
        if ((block || !childNodes.hasNext()) && !inlineElements.isEmpty()) {
          groups.add(Pair.create(true, List.copyOf(inlineElements)));
          inlineElements.clear();
        }
        // <br> only separates runs; it is not re-appended and is removed by REMOVE_MAP below.
        if (block && !isBr(childNode)) {
          groups.add(Pair.create(false, List.of(childNode)));
        }
      }

      // Re-append every group in order, so the original relative order of the children is kept.
      for (Pair<Boolean, List<Node>> group : groups) {
        final boolean shouldWrap = group.getFirst();
        final List<Node> nodes = group.getSecond();
        final Element elementForNodes = shouldWrap ? document.createElement("p") : parent;
        nodes.forEach(n -> {
          n.remove();
          elementForNodes.appendChild(n);
        });
        if (shouldWrap) {
          parent.appendChild(elementForNodes);
        }
      }
    });

  REMOVE_MAP.forEach(map -> document.select(map).remove());

  // One pass may not be enough for deeply nested <p>, so repeat until no <p> contains another <p>.
  Elements paragraphsWithParagraphs;
  do {
    paragraphsWithParagraphs = document.select("p:has(p)");
    paragraphsWithParagraphs.unwrap();
  }
  while (!paragraphsWithParagraphs.isEmpty());

  // Flattening can leave paragraphs that hold nothing but whitespace; drop them.
  final Elements emptyParagraphs = document.select("p:matches(^\\s*$)");
  emptyParagraphs.remove();

  // NOTE(review): per the jsoup API, Cleaner.clean(Document) returns a NEW document and leaves its
  // argument untouched, so this call does not affect the returned HTML. It is kept as-is on purpose:
  // using the cleaned copy with the current SAFELIST (tags only, no attributes) would strip tags and
  // attributes that this method itself produces. Confirm the intent before changing it.
  final Cleaner cleaner = new Cleaner(SAFELIST);
  cleaner.clean(document);

  document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
  document.outputSettings().prettyPrint(false);
  return document.body().html();
}
private static boolean isBlockElement(@NotNull Node node) {
if (!(node instanceof Element element)) {
return false;
}
// Tag renames applied by cleanupHtml: every element matched by the first value (a selector)
// gets its tag name replaced with the second value.
private static final List<Pair<String, String>> RENAME_MAP = List.of(
Pair.create("ul", "list"),
Pair.create("th", "td"),
Pair.create("c", "code"),
Pair.create("strong", "b"),
Pair.create("small", "font"),
Pair.create("span", "control"),
Pair.create("blockquote", "tip"),
Pair.create("em", "i")
);
return "list".equals(element.tagName()) ||
("code".equals(element.tagName()) && "block".equals(element.attr("style"))) ||
isBr(node);
}
// Selectors whose matches cleanupHtml removes from the document together with their content.
// NOTE: despite the name this is a list of selectors, not a map.
private static final List<String> REMOVE_MAP = List.of(
"hr",
"br",
// <code> elements whose text is empty or whitespace only
"code:matches(^\\s*$)"
);
// Selectors whose matches cleanupHtml unwraps: the element itself is dropped and its children
// are moved up into its parent. Also a list of selectors, not a map.
private static final List<String> UNWRAP_MAP = List.of(
"tbody",
"pre",
// any element that is a direct child of a <code style="block">
"code[style=block] > *"
);
/**
 * Rewrites an HTML fragment into the reduced markup used for extracted inspection descriptions:
 * tags are renamed per {@code RENAME_MAP}, {@code ol} becomes {@code <list style="decimal">},
 * {@code <pre><code>} blocks are marked with {@code style="block"} and a {@code lang} attribute,
 * selectors in {@code UNWRAP_MAP} are unwrapped, children of elements containing {@code br} are
 * regrouped into {@code p} paragraphs, selectors in {@code REMOVE_MAP} are removed, and nested or
 * whitespace-only paragraphs are eliminated.
 *
 * @param source                HTML to convert
 * @param languageForCodeBlocks value for the {@code lang} attribute of code blocks;
 *                              {@code "Text"} is used when {@code null}
 * @return inner HTML of the resulting document body, serialized with XML syntax and without
 * pretty-printing
 */
@NotNull
public static String cleanupHtml(@NotNull String source, @Nullable String languageForCodeBlocks) {
    final Document document = Jsoup.parse(source);

    RENAME_MAP.forEach(map -> document.select(map.first).tagName(map.second));

    final Elements ol = document.select("ol");
    ol.tagName("list");
    ol.attr("style", "decimal");

    // Mark code blocks while their <pre> parents still exist. UNWRAP_MAP unwraps every <pre>, so
    // running this selection after the unwrap pass would never match anything.
    final Elements codeBlock = document.select("pre > code");
    codeBlock.attr("style", "block");
    codeBlock.attr("lang", languageForCodeBlocks == null ? "Text" : languageForCodeBlocks);

    UNWRAP_MAP.forEach(map -> document.select(map).unwrap());

    // Inside <code> only links keep their markup; any other child element is flattened.
    document.select("code > *").stream()
            .filter(element -> !element.tagName().equals("a"))
            .forEach(Node::unwrap);

    // For every element that directly contains a <br>: wrap each run of inline children in a <p>,
    // with block children and <br> elements acting as separators between runs.
    document.select("br").stream().map(Element::parent)
            .distinct()
            .forEach(parent -> {
                // first == true: inline run to wrap in <p>; first == false: single block child kept as is.
                final List<Pair<Boolean, List<Node>>> groups = new ArrayList<>();
                final List<Node> inlineElements = new ArrayList<>();
                final Iterator<Node> childNodes = parent.childNodes().iterator();
                // Collect the groups first; the DOM is only modified in the loop below.
                while (childNodes.hasNext()) {
                    final Node childNode = childNodes.next();
                    final boolean block = isBlockElement(childNode);
                    if (!block) {
                        inlineElements.add(childNode);
                    }
                    // A block child or the last child closes the current inline run.
                    if ((block || !childNodes.hasNext()) && !inlineElements.isEmpty()) {
                        groups.add(Pair.create(true, List.copyOf(inlineElements)));
                        inlineElements.clear();
                    }
                    // <br> is only a separator; it stays in place and is removed by REMOVE_MAP below.
                    if (block && !isBr(childNode)) {
                        groups.add(Pair.create(false, List.of(childNode)));
                    }
                }

                // Re-append all groups in order so the relative order of the children is preserved.
                for (Pair<Boolean, List<Node>> group : groups) {
                    final boolean shouldWrap = group.getFirst();
                    final List<Node> nodes = group.getSecond();
                    final Element elementForNodes = shouldWrap ? document.createElement("p") : parent;
                    nodes.forEach(n -> {
                        n.remove();
                        elementForNodes.appendChild(n);
                    });
                    if (shouldWrap) {
                        parent.appendChild(elementForNodes);
                    }
                }
            });

    REMOVE_MAP.forEach(map -> document.select(map).remove());

    // Repeat until no <p> contains another <p>; a single pass can miss deeper nesting.
    Elements paragraphsWithParagraphs;
    do {
        paragraphsWithParagraphs = document.select("p:has(p)");
        paragraphsWithParagraphs.unwrap();
    } while (!paragraphsWithParagraphs.isEmpty());

    // Paragraphs left with nothing but whitespace are removed.
    final Elements emptyParagraphs = document.select("p:matches(^\\s*$)");
    emptyParagraphs.remove();

    // NOTE(review): per the jsoup API, Cleaner.clean(Document) returns a NEW document and does not
    // modify its argument, so this call has no effect on the returned HTML. Left unchanged because
    // applying the current SAFELIST would strip tags and attributes produced above; confirm intent.
    final Cleaner cleaner = new Cleaner(SAFELIST);
    cleaner.clean(document);

    document.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
    document.outputSettings().prettyPrint(false);
    return document.body().html();
}
/**
 * Tells whether a node acts as a separator when children are regrouped into paragraphs.
 *
 * @param node node to classify
 * @return {@code true} for a {@code list} element, a {@code code} element whose {@code style}
 * attribute equals {@code block}, or a {@code br} element; {@code false} for everything else,
 * including nodes that are not elements
 */
private static boolean isBlockElement(@NotNull Node node) {
    if (node instanceof Element element) {
        return switch (element.tagName()) {
            // "br" is listed here directly; it is the same check isBr performs on an element.
            case "list", "br" -> true;
            case "code" -> element.attr("style").equals("block");
            default -> false;
        };
    }
    return false;
}
/**
 * Tells whether the node is a {@code br} element.
 *
 * @param node node to check
 * @return {@code true} only when the node is an element whose tag name is {@code br}
 */
private static boolean isBr(@NotNull Node node) {
    return node instanceof Element element && "br".equals(element.tagName());
}
/**
 * Tells whether the node is a {@code br} element.
 *
 * @param node node to check
 * @return {@code true} only when the node is an element whose tag name is {@code br}
 */
private static boolean isBr(@NotNull Node node) {
  if (node instanceof Element element) {
    return "br".equals(element.tagName());
  }
  // Text nodes, comments and other non-element nodes are never line breaks.
  return false;
}
}