mirror of
https://gitflic.ru/project/openide/openide.git
synced 2026-03-22 15:19:59 +07:00
regex: fix handling of surrogate pairs in escaped unicode characters
GitOrigin-RevId: fac7582466fec09d569153bd0ee865821c82c41c
This commit is contained in:
committed by
intellij-monorepo-bot
parent
fb49c35cbb
commit
489e253dc5
@@ -21,6 +21,7 @@ import com.intellij.lang.PsiBuilder;
|
||||
import com.intellij.lang.PsiParser;
|
||||
import com.intellij.psi.tree.IElementType;
|
||||
import com.intellij.psi.tree.TokenSet;
|
||||
import org.intellij.lang.regexp.psi.impl.RegExpCharImpl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
|
||||
@@ -526,6 +527,22 @@ public class RegExpParser implements PsiParser, LightPsiParser {
|
||||
checkMatches(builder, RegExpTT.RBRACE, "'}' expected");
|
||||
marker.done(RegExpElementTypes.NAMED_CHARACTER);
|
||||
}
|
||||
else if (builder.getTokenType() == RegExpTT.UNICODE_CHAR) {
|
||||
final String text1 = builder.getTokenText();
|
||||
assert text1 != null;
|
||||
final int value1 = RegExpCharImpl.unescapeChar(text1);
|
||||
builder.advanceLexer();
|
||||
// merge surrogate pairs into single regexp char
|
||||
if (!Character.isSupplementaryCodePoint(value1) && Character.isHighSurrogate((char)value1)) {
|
||||
final String text2 = builder.getTokenText();
|
||||
assert text2 != null;
|
||||
final int value2 = RegExpCharImpl.unescapeChar(text2);
|
||||
if (!Character.isSupplementaryCodePoint(value2) && Character.isLowSurrogate((char)value2)) {
|
||||
builder.advanceLexer();
|
||||
}
|
||||
}
|
||||
marker.done(RegExpElementTypes.CHAR);
|
||||
}
|
||||
else {
|
||||
builder.advanceLexer();
|
||||
marker.done(RegExpElementTypes.CHAR);
|
||||
|
||||
@@ -57,12 +57,26 @@ public class RegExpCharImpl extends RegExpElementImpl implements RegExpChar {
|
||||
|
||||
@Override
|
||||
public int getValue() {
|
||||
final String s = getUnescapedText();
|
||||
if (s.equals("\\") && getType() == Type.CHAR) return '\\';
|
||||
return unescapeChar(s);
|
||||
final ASTNode node = getNode();
|
||||
final IElementType type = node.getFirstChildNode().getElementType();
|
||||
if (type == RegExpTT.BAD_OCT_VALUE ||
|
||||
type == RegExpTT.BAD_HEX_VALUE ||
|
||||
type == RegExpTT.BAD_CHARACTER ||
|
||||
type == StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN) {
|
||||
return -1;
|
||||
}
|
||||
final String text = getUnescapedText();
|
||||
if (text.length() == 1 && type == RegExpTT.CHARACTER) {
|
||||
return text.codePointAt(0);
|
||||
}
|
||||
else if (type == RegExpTT.UNICODE_CHAR) {
|
||||
final int i = text.indexOf('\\', 1);
|
||||
if (i >= 0) return Character.toCodePoint((char)unescapeChar(text.substring(0, i)), (char)unescapeChar(text.substring(i)));
|
||||
}
|
||||
return unescapeChar(text);
|
||||
}
|
||||
|
||||
private static int unescapeChar(String s) {
|
||||
public static int unescapeChar(String s) {
|
||||
final int length = s.length();
|
||||
assert length > 0;
|
||||
|
||||
|
||||
@@ -91,39 +91,16 @@ public final class RegExpAnnotator extends RegExpElementVisitor implements Annot
|
||||
if (to == null) {
|
||||
return;
|
||||
}
|
||||
int fromCodePoint = from.getValue();
|
||||
int toCodePoint = to.getValue();
|
||||
final int fromCodePoint = from.getValue();
|
||||
final int toCodePoint = to.getValue();
|
||||
if (fromCodePoint == -1 || toCodePoint == -1) {
|
||||
return;
|
||||
}
|
||||
int errorStart = range.getTextOffset();
|
||||
int errorEnd = errorStart + range.getTextLength();
|
||||
// \ud800\udc00-\udbff\udfff
|
||||
if (!Character.isSupplementaryCodePoint(fromCodePoint) && Character.isLowSurrogate((char)fromCodePoint)) {
|
||||
final PsiElement prevSibling = range.getPrevSibling();
|
||||
if (prevSibling instanceof RegExpChar) {
|
||||
final int prevSiblingValue = ((RegExpChar)prevSibling).getValue();
|
||||
if (!Character.isSupplementaryCodePoint(prevSiblingValue) && Character.isHighSurrogate((char)prevSiblingValue)) {
|
||||
fromCodePoint = Character.toCodePoint((char)prevSiblingValue, (char)fromCodePoint);
|
||||
errorStart -= prevSibling.getTextLength();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!Character.isSupplementaryCodePoint(toCodePoint) && Character.isHighSurrogate((char)toCodePoint)) {
|
||||
final PsiElement nextSibling = range.getNextSibling();
|
||||
if (nextSibling instanceof RegExpChar) {
|
||||
final int nextSiblingValue = ((RegExpChar)nextSibling).getValue();
|
||||
if (!Character.isSupplementaryCodePoint(nextSiblingValue) && Character.isLowSurrogate((char)nextSiblingValue)) {
|
||||
toCodePoint = Character.toCodePoint((char)toCodePoint, (char)nextSiblingValue);
|
||||
errorEnd += nextSibling.getTextLength();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (toCodePoint < fromCodePoint) {
|
||||
myHolder.createErrorAnnotation(new TextRange(errorStart, errorEnd), "Illegal character range (to < from)");
|
||||
myHolder.newAnnotation(HighlightSeverity.ERROR, "Illegal character range (to < from)").range(range).create();
|
||||
}
|
||||
else if (toCodePoint == fromCodePoint) {
|
||||
myHolder.createWarningAnnotation(new TextRange(errorStart, errorEnd), "Redundant character range");
|
||||
myHolder.newAnnotation(HighlightSeverity.WARNING, "Redundant character range").range(range).create();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -3,14 +3,12 @@ REGEXP_FILE
|
||||
RegExpBranchImpl: <[\ud800\udc00-\udbff\udfff]>
|
||||
RegExpClassImpl: <[\ud800\udc00-\udbff\udfff]>
|
||||
PsiElement(CLASS_BEGIN)('[')
|
||||
RegExpCharImpl: <\ud800>
|
||||
PsiElement(UNICODE_CHAR)('\ud800')
|
||||
RegExpCharRangeImpl: <\udc00-\udbff>
|
||||
RegExpCharImpl: <\udc00>
|
||||
RegExpCharRangeImpl: <\ud800\udc00-\udbff\udfff>
|
||||
RegExpCharImpl: <\ud800\udc00>
|
||||
PsiElement(UNICODE_CHAR)('\ud800')
|
||||
PsiElement(UNICODE_CHAR)('\udc00')
|
||||
PsiElement(MINUS)('-')
|
||||
RegExpCharImpl: <\udbff>
|
||||
RegExpCharImpl: <\udbff\udfff>
|
||||
PsiElement(UNICODE_CHAR)('\udbff')
|
||||
RegExpCharImpl: <\udfff>
|
||||
PsiElement(UNICODE_CHAR)('\udfff')
|
||||
PsiElement(UNICODE_CHAR)('\udfff')
|
||||
PsiElement(CLASS_END)(']')
|
||||
@@ -3,14 +3,12 @@ REGEXP_FILE
|
||||
RegExpBranchImpl: <[\udbff\udfff-\ud800\udc00]>
|
||||
RegExpClassImpl: <[\udbff\udfff-\ud800\udc00]>
|
||||
PsiElement(CLASS_BEGIN)('[')
|
||||
RegExpCharImpl: <\udbff>
|
||||
PsiElement(UNICODE_CHAR)('\udbff')
|
||||
RegExpCharRangeImpl: <\udfff-\ud800>
|
||||
RegExpCharImpl: <\udfff>
|
||||
RegExpCharRangeImpl: <\udbff\udfff-\ud800\udc00>
|
||||
RegExpCharImpl: <\udbff\udfff>
|
||||
PsiElement(UNICODE_CHAR)('\udbff')
|
||||
PsiElement(UNICODE_CHAR)('\udfff')
|
||||
PsiElement(MINUS)('-')
|
||||
RegExpCharImpl: <\ud800>
|
||||
RegExpCharImpl: <\ud800\udc00>
|
||||
PsiElement(UNICODE_CHAR)('\ud800')
|
||||
RegExpCharImpl: <\udc00>
|
||||
PsiElement(UNICODE_CHAR)('\udc00')
|
||||
PsiElement(UNICODE_CHAR)('\udc00')
|
||||
PsiElement(CLASS_END)(']')
|
||||
@@ -3,14 +3,12 @@ REGEXP_FILE
|
||||
RegExpBranchImpl: <[\ud800\udc00-\udbff\udfff]>
|
||||
RegExpClassImpl: <[\ud800\udc00-\udbff\udfff]>
|
||||
PsiElement(CLASS_BEGIN)('[')
|
||||
RegExpCharImpl: <\ud800>
|
||||
PsiElement(UNICODE_CHAR)('\ud800')
|
||||
RegExpCharRangeImpl: <\udc00-\udbff>
|
||||
RegExpCharImpl: <\udc00>
|
||||
RegExpCharRangeImpl: <\ud800\udc00-\udbff\udfff>
|
||||
RegExpCharImpl: <\ud800\udc00>
|
||||
PsiElement(UNICODE_CHAR)('\ud800')
|
||||
PsiElement(UNICODE_CHAR)('\udc00')
|
||||
PsiElement(MINUS)('-')
|
||||
RegExpCharImpl: <\udbff>
|
||||
RegExpCharImpl: <\udbff\udfff>
|
||||
PsiElement(UNICODE_CHAR)('\udbff')
|
||||
RegExpCharImpl: <\udfff>
|
||||
PsiElement(UNICODE_CHAR)('\udfff')
|
||||
PsiElement(UNICODE_CHAR)('\udfff')
|
||||
PsiElement(CLASS_END)(']')
|
||||
Reference in New Issue
Block a user