regex: fix handling of surrogate pairs in escaped unicode characters

GitOrigin-RevId: fac7582466fec09d569153bd0ee865821c82c41c
This commit is contained in:
Bas Leijdekkers
2020-02-26 18:45:36 +01:00
committed by intellij-monorepo-bot
parent fb49c35cbb
commit 489e253dc5
6 changed files with 54 additions and 52 deletions

View File

@@ -21,6 +21,7 @@ import com.intellij.lang.PsiBuilder;
import com.intellij.lang.PsiParser;
import com.intellij.psi.tree.IElementType;
import com.intellij.psi.tree.TokenSet;
import org.intellij.lang.regexp.psi.impl.RegExpCharImpl;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
@@ -526,6 +527,22 @@ public class RegExpParser implements PsiParser, LightPsiParser {
checkMatches(builder, RegExpTT.RBRACE, "'}' expected");
marker.done(RegExpElementTypes.NAMED_CHARACTER);
}
else if (builder.getTokenType() == RegExpTT.UNICODE_CHAR) {
final String text1 = builder.getTokenText();
assert text1 != null;
final int value1 = RegExpCharImpl.unescapeChar(text1);
builder.advanceLexer();
// merge surrogate pairs into single regexp char
if (!Character.isSupplementaryCodePoint(value1) && Character.isHighSurrogate((char)value1)) {
final String text2 = builder.getTokenText();
assert text2 != null;
final int value2 = RegExpCharImpl.unescapeChar(text2);
if (!Character.isSupplementaryCodePoint(value2) && Character.isLowSurrogate((char)value2)) {
builder.advanceLexer();
}
}
marker.done(RegExpElementTypes.CHAR);
}
else {
builder.advanceLexer();
marker.done(RegExpElementTypes.CHAR);

View File

@@ -57,12 +57,26 @@ public class RegExpCharImpl extends RegExpElementImpl implements RegExpChar {
@Override
public int getValue() {
final String s = getUnescapedText();
if (s.equals("\\") && getType() == Type.CHAR) return '\\';
return unescapeChar(s);
final ASTNode node = getNode();
final IElementType type = node.getFirstChildNode().getElementType();
if (type == RegExpTT.BAD_OCT_VALUE ||
type == RegExpTT.BAD_HEX_VALUE ||
type == RegExpTT.BAD_CHARACTER ||
type == StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN) {
return -1;
}
final String text = getUnescapedText();
if (text.length() == 1 && type == RegExpTT.CHARACTER) {
return text.codePointAt(0);
}
else if (type == RegExpTT.UNICODE_CHAR) {
final int i = text.indexOf('\\', 1);
if (i >= 0) return Character.toCodePoint((char)unescapeChar(text.substring(0, i)), (char)unescapeChar(text.substring(i)));
}
return unescapeChar(text);
}
private static int unescapeChar(String s) {
public static int unescapeChar(String s) {
final int length = s.length();
assert length > 0;

View File

@@ -91,39 +91,16 @@ public final class RegExpAnnotator extends RegExpElementVisitor implements Annot
if (to == null) {
return;
}
int fromCodePoint = from.getValue();
int toCodePoint = to.getValue();
final int fromCodePoint = from.getValue();
final int toCodePoint = to.getValue();
if (fromCodePoint == -1 || toCodePoint == -1) {
return;
}
int errorStart = range.getTextOffset();
int errorEnd = errorStart + range.getTextLength();
// \ud800\udc00-\udbff\udfff
if (!Character.isSupplementaryCodePoint(fromCodePoint) && Character.isLowSurrogate((char)fromCodePoint)) {
final PsiElement prevSibling = range.getPrevSibling();
if (prevSibling instanceof RegExpChar) {
final int prevSiblingValue = ((RegExpChar)prevSibling).getValue();
if (!Character.isSupplementaryCodePoint(prevSiblingValue) && Character.isHighSurrogate((char)prevSiblingValue)) {
fromCodePoint = Character.toCodePoint((char)prevSiblingValue, (char)fromCodePoint);
errorStart -= prevSibling.getTextLength();
}
}
}
if (!Character.isSupplementaryCodePoint(toCodePoint) && Character.isHighSurrogate((char)toCodePoint)) {
final PsiElement nextSibling = range.getNextSibling();
if (nextSibling instanceof RegExpChar) {
final int nextSiblingValue = ((RegExpChar)nextSibling).getValue();
if (!Character.isSupplementaryCodePoint(nextSiblingValue) && Character.isLowSurrogate((char)nextSiblingValue)) {
toCodePoint = Character.toCodePoint((char)toCodePoint, (char)nextSiblingValue);
errorEnd += nextSibling.getTextLength();
}
}
}
if (toCodePoint < fromCodePoint) {
myHolder.createErrorAnnotation(new TextRange(errorStart, errorEnd), "Illegal character range (to < from)");
myHolder.newAnnotation(HighlightSeverity.ERROR, "Illegal character range (to < from)").range(range).create();
}
else if (toCodePoint == fromCodePoint) {
myHolder.createWarningAnnotation(new TextRange(errorStart, errorEnd), "Redundant character range");
myHolder.newAnnotation(HighlightSeverity.WARNING, "Redundant character range").range(range).create();
}
}

View File

@@ -3,14 +3,12 @@ REGEXP_FILE
RegExpBranchImpl: <[\ud800\udc00-\udbff\udfff]>
RegExpClassImpl: <[\ud800\udc00-\udbff\udfff]>
PsiElement(CLASS_BEGIN)('[')
RegExpCharImpl: <\ud800>
PsiElement(UNICODE_CHAR)('\ud800')
RegExpCharRangeImpl: <\udc00-\udbff>
RegExpCharImpl: <\udc00>
RegExpCharRangeImpl: <\ud800\udc00-\udbff\udfff>
RegExpCharImpl: <\ud800\udc00>
PsiElement(UNICODE_CHAR)('\ud800')
PsiElement(UNICODE_CHAR)('\udc00')
PsiElement(MINUS)('-')
RegExpCharImpl: <\udbff>
RegExpCharImpl: <\udbff\udfff>
PsiElement(UNICODE_CHAR)('\udbff')
RegExpCharImpl: <\udfff>
PsiElement(UNICODE_CHAR)('\udfff')
PsiElement(UNICODE_CHAR)('\udfff')
PsiElement(CLASS_END)(']')

View File

@@ -3,14 +3,12 @@ REGEXP_FILE
RegExpBranchImpl: <[\udbff\udfff-\ud800\udc00]>
RegExpClassImpl: <[\udbff\udfff-\ud800\udc00]>
PsiElement(CLASS_BEGIN)('[')
RegExpCharImpl: <\udbff>
PsiElement(UNICODE_CHAR)('\udbff')
RegExpCharRangeImpl: <\udfff-\ud800>
RegExpCharImpl: <\udfff>
RegExpCharRangeImpl: <\udbff\udfff-\ud800\udc00>
RegExpCharImpl: <\udbff\udfff>
PsiElement(UNICODE_CHAR)('\udbff')
PsiElement(UNICODE_CHAR)('\udfff')
PsiElement(MINUS)('-')
RegExpCharImpl: <\ud800>
RegExpCharImpl: <\ud800\udc00>
PsiElement(UNICODE_CHAR)('\ud800')
RegExpCharImpl: <\udc00>
PsiElement(UNICODE_CHAR)('\udc00')
PsiElement(UNICODE_CHAR)('\udc00')
PsiElement(CLASS_END)(']')

View File

@@ -3,14 +3,12 @@ REGEXP_FILE
RegExpBranchImpl: <[\ud800\udc00-\udbff\udfff]>
RegExpClassImpl: <[\ud800\udc00-\udbff\udfff]>
PsiElement(CLASS_BEGIN)('[')
RegExpCharImpl: <\ud800>
PsiElement(UNICODE_CHAR)('\ud800')
RegExpCharRangeImpl: <\udc00-\udbff>
RegExpCharImpl: <\udc00>
RegExpCharRangeImpl: <\ud800\udc00-\udbff\udfff>
RegExpCharImpl: <\ud800\udc00>
PsiElement(UNICODE_CHAR)('\ud800')
PsiElement(UNICODE_CHAR)('\udc00')
PsiElement(MINUS)('-')
RegExpCharImpl: <\udbff>
RegExpCharImpl: <\udbff\udfff>
PsiElement(UNICODE_CHAR)('\udbff')
RegExpCharImpl: <\udfff>
PsiElement(UNICODE_CHAR)('\udfff')
PsiElement(UNICODE_CHAR)('\udfff')
PsiElement(CLASS_END)(']')