regex: fix handling of surrogate pairs in escaped unicode characters

GitOrigin-RevId: fac7582466fec09d569153bd0ee865821c82c41c
2026-03-22 15:19:59 +07:00 · 2020-02-26 18:45:36 +01:00
parent fb49c35cbb
commit 489e253dc5
6 changed files with 54 additions and 52 deletions
--- a/RegExpSupport/src/org/intellij/lang/regexp/RegExpParser.java
+++ b/RegExpSupport/src/org/intellij/lang/regexp/RegExpParser.java
@@ -21,6 +21,7 @@ import com.intellij.lang.PsiBuilder;
 import com.intellij.lang.PsiParser;
 import com.intellij.psi.tree.IElementType;
 import com.intellij.psi.tree.TokenSet;
+import org.intellij.lang.regexp.psi.impl.RegExpCharImpl;
 import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;

@@ -526,6 +527,22 @@ public class RegExpParser implements PsiParser, LightPsiParser {
      checkMatches(builder, RegExpTT.RBRACE, "'}' expected");
      marker.done(RegExpElementTypes.NAMED_CHARACTER);
    }
+    else if (builder.getTokenType() == RegExpTT.UNICODE_CHAR) {
+      final String text1 = builder.getTokenText();
+      assert text1 != null;
+      final int value1 = RegExpCharImpl.unescapeChar(text1);
+      builder.advanceLexer();
+      // merge surrogate pairs into single regexp char
+      if (!Character.isSupplementaryCodePoint(value1) && Character.isHighSurrogate((char)value1)) {
+        // no token text at end of input (pattern ends with a lone high surrogate): nothing to merge
+        final String text2 = builder.getTokenText();
+        final int value2 = text2 == null ? -1 : RegExpCharImpl.unescapeChar(text2);
+        if (value2 >= 0 && !Character.isSupplementaryCodePoint(value2) && Character.isLowSurrogate((char)value2)) {
+          builder.advanceLexer();
+        }
+      }
+      marker.done(RegExpElementTypes.CHAR);
+    }
    else {
      builder.advanceLexer();
      marker.done(RegExpElementTypes.CHAR);
--- a/RegExpSupport/src/org/intellij/lang/regexp/psi/impl/RegExpCharImpl.java
+++ b/RegExpSupport/src/org/intellij/lang/regexp/psi/impl/RegExpCharImpl.java
@@ -57,12 +57,26 @@ public class RegExpCharImpl extends RegExpElementImpl implements RegExpChar {

    @Override
    public int getValue() {
-      final String s = getUnescapedText();
-      if (s.equals("\\") && getType() == Type.CHAR) return '\\';
-      return unescapeChar(s);
+        final ASTNode node = getNode();
+        final IElementType type = node.getFirstChildNode().getElementType(); // first child's token type selects the decoding path
+        if (type == RegExpTT.BAD_OCT_VALUE ||
+            type == RegExpTT.BAD_HEX_VALUE ||
+            type == RegExpTT.BAD_CHARACTER ||
+            type == StringEscapesTokenTypes.INVALID_UNICODE_ESCAPE_TOKEN) {
+            return -1; // bad/invalid token: no code point, -1 is the "no value" result
+        }
+        final String text = getUnescapedText();
+        if (text.length() == 1 && type == RegExpTT.CHARACTER) {
+            return text.codePointAt(0); // single plain character is returned as-is, without unescaping
+        }
+        else if (type == RegExpTT.UNICODE_CHAR) {
+            final int i = text.indexOf('\\', 1); // start of a second escape, if any (surrogate pair merged into one char by RegExpParser)
+            if (i >= 0) return Character.toCodePoint((char)unescapeChar(text.substring(0, i)), (char)unescapeChar(text.substring(i)));
+        }
+        return unescapeChar(text); // all remaining cases are delegated to unescapeChar
    }

-    private static int unescapeChar(String s) {
+    public static int unescapeChar(String s) {
        final int length = s.length();
        assert length > 0;

--- a/RegExpSupport/src/org/intellij/lang/regexp/validation/RegExpAnnotator.java
+++ b/RegExpSupport/src/org/intellij/lang/regexp/validation/RegExpAnnotator.java
@@ -91,39 +91,16 @@ public final class RegExpAnnotator extends RegExpElementVisitor implements Annot
    if (to == null) {
      return;
    }
-    int fromCodePoint = from.getValue();
-    int toCodePoint = to.getValue();
+    final int fromCodePoint = from.getValue();
+    final int toCodePoint = to.getValue();
    if (fromCodePoint == -1 || toCodePoint == -1) {
      return;
    }
-    int errorStart = range.getTextOffset();
-    int errorEnd = errorStart + range.getTextLength();
-    // \ud800\udc00-\udbff\udfff
-    if (!Character.isSupplementaryCodePoint(fromCodePoint) && Character.isLowSurrogate((char)fromCodePoint)) {
-      final PsiElement prevSibling = range.getPrevSibling();
-      if (prevSibling instanceof RegExpChar) {
-        final int prevSiblingValue = ((RegExpChar)prevSibling).getValue();
-        if (!Character.isSupplementaryCodePoint(prevSiblingValue) && Character.isHighSurrogate((char)prevSiblingValue)) {
-          fromCodePoint = Character.toCodePoint((char)prevSiblingValue, (char)fromCodePoint);
-          errorStart -= prevSibling.getTextLength();
-        }
-      }
-    }
-    if (!Character.isSupplementaryCodePoint(toCodePoint) && Character.isHighSurrogate((char)toCodePoint)) {
-      final PsiElement nextSibling = range.getNextSibling();
-      if (nextSibling instanceof RegExpChar) {
-        final int nextSiblingValue = ((RegExpChar)nextSibling).getValue();
-        if (!Character.isSupplementaryCodePoint(nextSiblingValue) && Character.isLowSurrogate((char)nextSiblingValue)) {
-          toCodePoint = Character.toCodePoint((char)toCodePoint, (char)nextSiblingValue);
-          errorEnd += nextSibling.getTextLength();
-        }
-      }
-    }
    if (toCodePoint < fromCodePoint) {
-      myHolder.createErrorAnnotation(new TextRange(errorStart, errorEnd), "Illegal character range (to < from)");
+      myHolder.newAnnotation(HighlightSeverity.ERROR, "Illegal character range (to < from)").range(range).create();
    }
    else if (toCodePoint == fromCodePoint) {
-      myHolder.createWarningAnnotation(new TextRange(errorStart, errorEnd), "Redundant character range");
+      myHolder.newAnnotation(HighlightSeverity.WARNING, "Redundant character range").range(range).create();
    }
  }

--- a/RegExpSupport/testData/psi/Charclasses64.txt
+++ b/RegExpSupport/testData/psi/Charclasses64.txt
@@ -3,14 +3,12 @@ REGEXP_FILE
    RegExpBranchImpl: <[\ud800\udc00-\udbff\udfff]>
      RegExpClassImpl: <[\ud800\udc00-\udbff\udfff]>
        PsiElement(CLASS_BEGIN)('[')
-        RegExpCharImpl: <\ud800>
-          PsiElement(UNICODE_CHAR)('\ud800')
-        RegExpCharRangeImpl: <\udc00-\udbff>
-          RegExpCharImpl: <\udc00>
+        RegExpCharRangeImpl: <\ud800\udc00-\udbff\udfff>
+          RegExpCharImpl: <\ud800\udc00>
+            PsiElement(UNICODE_CHAR)('\ud800')
            PsiElement(UNICODE_CHAR)('\udc00')
          PsiElement(MINUS)('-')
-          RegExpCharImpl: <\udbff>
+          RegExpCharImpl: <\udbff\udfff>
            PsiElement(UNICODE_CHAR)('\udbff')
-        RegExpCharImpl: <\udfff>
-          PsiElement(UNICODE_CHAR)('\udfff')
+            PsiElement(UNICODE_CHAR)('\udfff')
        PsiElement(CLASS_END)(']')
--- a/RegExpSupport/testData/psi/Escapes16.txt
+++ b/RegExpSupport/testData/psi/Escapes16.txt
@@ -3,14 +3,12 @@ REGEXP_FILE
    RegExpBranchImpl: <[\udbff\udfff-\ud800\udc00]>
      RegExpClassImpl: <[\udbff\udfff-\ud800\udc00]>
        PsiElement(CLASS_BEGIN)('[')
-        RegExpCharImpl: <\udbff>
-          PsiElement(UNICODE_CHAR)('\udbff')
-        RegExpCharRangeImpl: <\udfff-\ud800>
-          RegExpCharImpl: <\udfff>
+        RegExpCharRangeImpl: <\udbff\udfff-\ud800\udc00>
+          RegExpCharImpl: <\udbff\udfff>
+            PsiElement(UNICODE_CHAR)('\udbff')
            PsiElement(UNICODE_CHAR)('\udfff')
          PsiElement(MINUS)('-')
-          RegExpCharImpl: <\ud800>
+          RegExpCharImpl: <\ud800\udc00>
            PsiElement(UNICODE_CHAR)('\ud800')
-        RegExpCharImpl: <\udc00>
-          PsiElement(UNICODE_CHAR)('\udc00')
+            PsiElement(UNICODE_CHAR)('\udc00')
        PsiElement(CLASS_END)(']')
--- a/RegExpSupport/testData/psi/Escapes17.txt
+++ b/RegExpSupport/testData/psi/Escapes17.txt
@@ -3,14 +3,12 @@ REGEXP_FILE
    RegExpBranchImpl: <[\ud800\udc00-\udbff\udfff]>
      RegExpClassImpl: <[\ud800\udc00-\udbff\udfff]>
        PsiElement(CLASS_BEGIN)('[')
-        RegExpCharImpl: <\ud800>
-          PsiElement(UNICODE_CHAR)('\ud800')
-        RegExpCharRangeImpl: <\udc00-\udbff>
-          RegExpCharImpl: <\udc00>
+        RegExpCharRangeImpl: <\ud800\udc00-\udbff\udfff>
+          RegExpCharImpl: <\ud800\udc00>
+            PsiElement(UNICODE_CHAR)('\ud800')
            PsiElement(UNICODE_CHAR)('\udc00')
          PsiElement(MINUS)('-')
-          RegExpCharImpl: <\udbff>
+          RegExpCharImpl: <\udbff\udfff>
            PsiElement(UNICODE_CHAR)('\udbff')
-        RegExpCharImpl: <\udfff>
-          PsiElement(UNICODE_CHAR)('\udfff')
+            PsiElement(UNICODE_CHAR)('\udfff')
        PsiElement(CLASS_END)(']')