[Grazie][IDEA-243259] Do not split words by right single quotation mark

GitOrigin-RevId: 7736d969c8dafc8ce180a9ebf9e8fe4536d2dba3
This commit is contained in:
Ekaterina Berezhko
2024-03-29 10:52:08 +02:00
committed by intellij-monorepo-bot
parent c87ed9a7b7
commit 2d3d4c8470
3 changed files with 8 additions and 3 deletions

View File

@@ -102,7 +102,8 @@ public class IdentifierSplitter extends BaseSplitter {
type == Character.OTHER_LETTER ||
type == Character.MODIFIER_LETTER ||
type == Character.NON_SPACING_MARK ||
- type == Character.OTHER_PUNCTUATION
+ type == Character.OTHER_PUNCTUATION ||
+ ch == '\u2019' // right single quotation mark
) {
//letter
if (s < 0) {

View File

@@ -21,13 +21,15 @@ public class TextSplitter extends BaseSplitter {
private static final String letter = "(\\p{L}\\p{Mn}*)";
private static final String xmlEntity = "(&.+?;)";
+ private static final String rightSingleQuotationMark = "\\u2019";
// using possessive quantifiers ++ and *+ to avoid SOE on large inputs
// see https://blog.sonarsource.com/crafting-regexes-to-avoid-stack-overflows/
private static final Pattern EXTENDED_WORD_AND_SPECIAL = Pattern.compile(
xmlEntity + "|" +
"(#|0x\\d*)?" + // an optional prefix
letter + "++" + // some letters
- "('" + letter + ")?" + // if there's an apostrophe, it should be followed by a letter
+ "(['" + rightSingleQuotationMark + "]" + letter + ")?" + // if there's an apostrophe, it should be followed by a letter
"(_|" + letter + ")*+" // more letters and underscores
);
@Override