Files
openide/python/python-parser/src/com/jetbrains/python/lexer/PythonIndentingProcessor.java
Vladimir Koshelev 29f0eb6c77 [python] extract python parser to a separate module
Merge-request: IJ-MR-116296
Merged-by: Vladimir Koshelev <Vladimir.Koshelev@jetbrains.com>

GitOrigin-RevId: e7559fb3215d757e6273543e4aa27d52df755e63
2023-10-09 11:56:10 +00:00

507 lines
16 KiB
Java

// Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
package com.jetbrains.python.lexer;
import com.intellij.lexer.FlexAdapter;
import com.intellij.lexer.FlexLexer;
import com.intellij.lexer.MergingLexerAdapter;
import com.intellij.psi.tree.IElementType;
import com.intellij.psi.tree.TokenSet;
import com.intellij.util.containers.Stack;
import com.jetbrains.python.PyTokenTypes;
import com.jetbrains.python.PythonDialectsTokenSetProvider;
import com.jetbrains.python.psi.PyStringLiteralUtil;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.util.ArrayList;
import java.util.List;
public class PythonIndentingProcessor extends MergingLexerAdapter {
// Indentation widths of the suites that are currently open; setStartState() resets it to a single 0
// and processIndent() pushes a new width whenever a line is indented deeper than the top entry.
@SuppressWarnings("SSBasedInspection")
protected final IntArrayList myIndentStack = new IntArrayList();
// Brace nesting depth maintained by adjustBraceLevel(); line breaks are only significant when it is 0.
protected int myBraceLevel;
// Set by checkSignificantTokens() once the current line contains a token that is neither
// whitespace/line break nor a comment; cleared by processLineBreak().
protected boolean myLineHasSignificantTokens;
// Indent most recently measured by processIndent(); recorded on queued comment tokens.
protected int myLastNewLineIndent = -1;
// Indent derived in advance() from the last consumed LINE_BREAK token (plus 8 per TAB token);
// adjustBraceLevel() compares it with the indent stack during recovery.
private int myCurrentNewLineIndent = 0;
// Tokens waiting to be reported; while non-empty, its head is the current token of this lexer.
protected List<PendingToken> myTokenQueue = new ArrayList<>();
// Queue index remembered by processSpecialTokens() while it collects comments at the start of a line;
// -1 outside of that phase. closeDanglingSuitesWithComments() uses it as the initial insertion index.
private int myLineBreakBeforeFirstCommentIndex = -1;
// When set, advance() runs processSpecialTokens() right after removing the next queued token.
protected boolean myProcessSpecialTokensPending = false;
// f-strings the base lexer is currently inside of, innermost on top; maintained by checkFString().
private final Stack<FString> myFStringStack = new Stack<>();
// Debug switch: when true, start() and advance() print the token stream to System.out.
private static final boolean DUMP_TOKENS = false;
// Tokens that make adjustBraceLevel() abandon unbalanced braces / an open f-string fragment.
private final TokenSet RECOVERY_TOKENS = PythonDialectsTokenSetProvider.getInstance().getUnbalancedBracesRecoveryTokens();
/**
 * Creates a processor that reads its base tokens from the given Flex lexer.
 *
 * @param lexer  lexer that is wrapped into a {@link FlexAdapter} and handed to the superclass
 * @param tokens token set passed through to the {@link MergingLexerAdapter} constructor
 */
public PythonIndentingProcessor(FlexLexer lexer, TokenSet tokens) {
super(new FlexAdapter(lexer), tokens);
}
/**
 * A token stored in the processor's queue: an element type together with its start and end offsets.
 * The offsets are fixed at construction time; the type may be replaced later via {@link #setType}.
 */
protected static class PendingToken {
  private IElementType myType;
  private final int myStart;
  private final int myEnd;

  /**
   * @param type  element type of the token
   * @param start start offset of the token
   * @param end   end offset of the token
   */
  public PendingToken(IElementType type, int start, int end) {
    this.myType = type;
    this.myStart = start;
    this.myEnd = end;
  }

  /** Returns the current element type of this token. */
  public IElementType getType() {
    return myType;
  }

  /** Returns the start offset given at construction. */
  public int getStart() {
    return myStart;
  }

  /** Returns the end offset given at construction. */
  public int getEnd() {
    return myEnd;
  }

  /** Replaces the element type of this token, leaving its offsets untouched. */
  public void setType(IElementType type) {
    this.myType = type;
  }

  /** Renders the token as {@code type:start-end}. */
  @Override
  public String toString() {
    return new StringBuilder()
      .append(myType)
      .append(':')
      .append(myStart)
      .append('-')
      .append(myEnd)
      .toString();
  }
}
/**
 * A queued comment token that additionally remembers an indent value supplied by its creator
 * (processSpecialTokens() passes the value of myLastNewLineIndent at the time the comment is queued).
 */
private static class PendingCommentToken extends PendingToken {
private final int myIndent;
PendingCommentToken(IElementType type, int start, int end, int indent) {
super(type, start, end);
myIndent = indent;
}
/** Returns the indent value recorded when this comment token was created. */
public int getIndent() {
return myIndent;
}
}
/**
 * Returns the type of the superclass lexer's current token, bypassing this processor's token queue.
 */
@Nullable
protected IElementType getBaseTokenType() {
return super.getTokenType();
}
/**
 * Returns the start offset of the superclass lexer's current token, bypassing this processor's token queue.
 */
protected int getBaseTokenStart() {
return super.getTokenStart();
}
/**
 * Returns the end offset of the superclass lexer's current token, bypassing this processor's token queue.
 */
protected int getBaseTokenEnd() {
return super.getTokenEnd();
}
/**
 * Returns the text of the base lexer's current token, i.e. the slice of the lexed buffer
 * between {@link #getBaseTokenStart()} and {@link #getBaseTokenEnd()}.
 */
@NotNull
protected String getBaseTokenText() {
  final CharSequence buffer = getBufferSequence();
  final int from = getBaseTokenStart();
  final int to = getBaseTokenEnd();
  final CharSequence tokenChars = buffer.subSequence(from, to);
  return tokenChars.toString();
}
/**
 * Tells whether the base lexer's current token has exactly the given type (identity comparison).
 */
private boolean isBaseAt(IElementType tokenType) {
return getBaseTokenType() == tokenType;
}
/**
 * Returns the type of the current token: the head of the pending queue when there is one,
 * otherwise the base lexer's current token type.
 */
@Override
public IElementType getTokenType() {
  if (myTokenQueue.isEmpty()) {
    return super.getTokenType();
  }
  final PendingToken head = myTokenQueue.get(0);
  return head.getType();
}
/**
 * Returns the start offset of the current token: taken from the head of the pending queue
 * when there is one, otherwise from the base lexer.
 */
@Override
public int getTokenStart() {
  if (myTokenQueue.isEmpty()) {
    return super.getTokenStart();
  }
  final PendingToken head = myTokenQueue.get(0);
  return head.getStart();
}
/**
 * Returns the end offset of the current token: taken from the head of the pending queue
 * when there is one, otherwise from the base lexer.
 */
@Override
public int getTokenEnd() {
  if (myTokenQueue.isEmpty()) {
    return super.getTokenEnd();
  }
  final PendingToken head = myTokenQueue.get(0);
  return head.getEnd();
}
/**
 * Moves to the next token.
 * <p>
 * Before leaving the current token, the indent of the line that follows it is tracked in
 * {@code myCurrentNewLineIndent}: a LINE_BREAK token sets it to the width of the whitespace after the
 * token's last line terminator, and a TAB token adds 8 to it. Then either the head of the pending queue
 * is dropped (running {@link #processSpecialTokens()} if that was requested), or the base lexer is
 * advanced and its new token is post-processed. Finally the brace level is updated for the new current token.
 */
@Override
public void advance() {
  final IElementType leavingType = getTokenType();
  if (leavingType == PyTokenTypes.LINE_BREAK) {
    myCurrentNewLineIndent = countTrailingIndent(getTokenText());
  }
  else if (leavingType == PyTokenTypes.TAB) {
    myCurrentNewLineIndent += 8;
  }

  if (!myTokenQueue.isEmpty()) {
    myTokenQueue.remove(0);
    if (myProcessSpecialTokensPending) {
      myProcessSpecialTokensPending = false;
      processSpecialTokens();
    }
  }
  else {
    advanceBase();
    processSpecialTokens();
  }
  adjustBraceLevel();

  if (DUMP_TOKENS) {
    if (getTokenType() != null) {
      System.out.print(getTokenStart() + "-" + getTokenEnd() + ":" + getTokenType());
      if (getTokenType() == PyTokenTypes.LINE_BREAK) {
        System.out.println("{" + myBraceLevel + "}");
      }
      else {
        System.out.print(" ");
      }
    }
  }
}

/**
 * Measures the whitespace that follows the last line terminator in the given line-break token text
 * (a space counts as 1, a tab as 8).
 * <p>
 * A LINE_BREAK token may span several physical lines and may start with whitespace that trails the
 * previous line, so the scan runs backwards and stops at the first {@code '\n'} / {@code '\r'} it meets.
 * Counting past that point would add whitespace belonging to earlier (blank) lines to the indent.
 * If the text contains no line terminator, all of its spaces and tabs are counted.
 */
private static int countTrailingIndent(@NotNull String text) {
  int width = 0;
  for (int i = text.length() - 1; i >= 0; i--) {
    final char c = text.charAt(i);
    if (c == '\n' || c == '\r') {
      break;
    }
    if (c == ' ') {
      width++;
    }
    else if (c == '\t') {
      width += 8;
    }
  }
  return width;
}
/**
 * Advances the superclass lexer by one token and updates the per-line "significant token" flag
 * and the f-string tracking state for the token it lands on.
 */
protected void advanceBase() {
super.advance();
checkSignificantTokens();
checkFString();
}
/**
 * Keeps {@code myFStringStack} in sync with the base lexer's current token:
 * <ul>
 *   <li>FSTRING_START pushes a new entry whose quotes are the token text after the string prefix;</li>
 *   <li>FSTRING_END pops entries until one with the same quotes as the token text has been removed
 *       (or the stack is exhausted);</li>
 *   <li>FSTRING_FRAGMENT_START pushes an EXPRESSION part onto the innermost f-string;</li>
 *   <li>FSTRING_FRAGMENT_END pops the innermost f-string's topmost part;</li>
 *   <li>FSTRING_FRAGMENT_FORMAT_START and FSTRING_FRAGMENT_TYPE_CONVERSION replace the topmost part
 *       with TYPE_CONVERSION_OR_FORMAT.</li>
 * </ul>
 * Any other token leaves the stack unchanged.
 */
private void checkFString() {
  final String text = getBaseTokenText();
  final IElementType type = getBaseTokenType();

  if (type == PyTokenTypes.FSTRING_START) {
    final String quotes = text.substring(PyStringLiteralUtil.getPrefixLength(text));
    assert !quotes.isEmpty();
    myFStringStack.push(new FString(quotes, new Stack<>()));
    return;
  }

  if (type == PyTokenTypes.FSTRING_END) {
    boolean matched = false;
    while (!matched && !myFStringStack.isEmpty()) {
      matched = myFStringStack.pop().quotes().equals(text);
    }
    return;
  }

  if (type == PyTokenTypes.FSTRING_FRAGMENT_START) {
    assert !myFStringStack.isEmpty();
    myFStringStack.peek().fragments().push(FStringFragmentPart.EXPRESSION);
    return;
  }

  if (type == PyTokenTypes.FSTRING_FRAGMENT_END) {
    assert !myFStringStack.isEmpty();
    final Stack<FStringFragmentPart> parts = myFStringStack.peek().fragments();
    assert !parts.isEmpty();
    parts.pop();
    return;
  }

  if (type == PyTokenTypes.FSTRING_FRAGMENT_FORMAT_START || type == PyTokenTypes.FSTRING_FRAGMENT_TYPE_CONVERSION) {
    assert !myFStringStack.isEmpty();
    final Stack<FStringFragmentPart> parts = myFStringStack.peek().fragments();
    assert !parts.isEmpty();
    parts.pop();
    parts.push(FStringFragmentPart.TYPE_CONVERSION_OR_FORMAT);
  }
}
/**
 * Appends a new pending token with the given type and offsets to the end of the token queue.
 */
protected void pushToken(IElementType type, int start, int end) {
myTokenQueue.add(new PendingToken(type, start, end));
}
/**
 * Starts lexing the given buffer range. The steps run in a fixed order: {@link #checkStartState} is
 * called first, then the superclass is started, and only after that is this processor's own state
 * initialized from the first base token.
 */
@Override
public void start(@NotNull CharSequence buffer, int startOffset, int endOffset, int initialState) {
checkStartState(startOffset, initialState);
super.start(buffer, startOffset, endOffset, initialState);
setStartState();
}
/**
 * Hook called by {@link #start} before the superclass lexer is started. This implementation ignores
 * both arguments and only prints a banner when DUMP_TOKENS is enabled.
 *
 * @param startOffset  start offset passed to {@link #start}
 * @param initialState initial state passed to {@link #start}
 */
protected void checkStartState(int startOffset, int initialState) {
if (DUMP_TOKENS) {
System.out.println("\n--- LEXER START---");
}
}
/**
 * Initializes the processor's state for a freshly started base lexer.
 * <p>
 * All bookkeeping left over from a previous run is discarded first: a lexer instance that is started
 * again after stopping mid-stream (or inside an unterminated f-string) would otherwise report stale
 * queued tokens and treat line breaks as if it were still inside that f-string. After the reset, the
 * first base token is examined the same way {@link #advanceBase()} would do it, and leading whitespace
 * at the very beginning of the buffer is processed as indentation.
 */
private void setStartState() {
  // Forget everything a previous run may have left behind; this must happen before
  // adjustBraceLevel(), which reads both the token queue and the f-string stack.
  myTokenQueue.clear();
  myFStringStack.clear();
  myLineBreakBeforeFirstCommentIndex = -1;
  myProcessSpecialTokensPending = false;
  myLastNewLineIndent = -1;
  myCurrentNewLineIndent = 0;

  myIndentStack.clear();
  myIndentStack.push(0);
  myBraceLevel = 0;
  adjustBraceLevel();
  myLineHasSignificantTokens = false;
  checkSignificantTokens();
  checkFString();
  if (isBaseAt(PyTokenTypes.SPACE)) {
    processIndent(0, PyTokenTypes.SPACE);
  }
}
/**
 * Updates the brace nesting level for the current token and performs error recovery.
 * <p>
 * An opening brace increments the level and a closing brace decrements it. If a recovery token shows
 * up while braces are unbalanced or while inside an f-string fragment, the unfinished construct is
 * abandoned: the brace level is reset to 0 (and the f-string stack is cleared when inside a fragment),
 * and zero-length STATEMENT_BREAK, DEDENT (one for each indent-stack entry deeper than the current
 * line's indent, never removing the bottom entry) and LINE_BREAK tokens are queued at the token's start.
 */
private void adjustBraceLevel() {
  final boolean insideFStringFragment = !myFStringStack.isEmpty() && !myFStringStack.peek().fragments.isEmpty();
  final IElementType type = getTokenType();

  if (PyTokenTypes.OPEN_BRACES.contains(type)) {
    myBraceLevel++;
    return;
  }
  if (PyTokenTypes.CLOSE_BRACES.contains(type)) {
    myBraceLevel--;
    return;
  }

  final boolean unbalanced = myBraceLevel != 0 || insideFStringFragment;
  if (!unbalanced || !RECOVERY_TOKENS.contains(type)) {
    return;
  }

  myBraceLevel = 0;
  if (insideFStringFragment) {
    myFStringStack.clear();
  }

  // Capture the offset before queueing anything: once the queue is non-empty,
  // getTokenStart() reports the head of the queue instead of the base token.
  final int offset = getTokenStart();
  pushToken(PyTokenTypes.STATEMENT_BREAK, offset, offset);
  while (myIndentStack.size() > 1 && myCurrentNewLineIndent < myIndentStack.topInt()) {
    myIndentStack.popInt();
    pushToken(PyTokenTypes.DEDENT, offset, offset);
  }
  pushToken(PyTokenTypes.LINE_BREAK, offset, offset);
}
/**
 * Marks the current line as containing significant tokens when the base lexer's current token
 * is neither whitespace / line break nor a comment. The flag is never cleared here.
 */
protected void checkSignificantTokens() {
  final IElementType type = getBaseTokenType();
  final boolean insignificant = PyTokenTypes.WHITESPACE_OR_LINEBREAK.contains(type) || type == getCommentTokenType();
  if (insignificant) {
    return;
  }
  myLineHasSignificantTokens = true;
}
/**
 * Post-processes the base lexer's current token when it is a line break, a backslash or a space,
 * turning it (and possibly the tokens that follow) into queued tokens. Other token types are left alone.
 * <p>
 * After a line break, a run of comment-only lines is collected into the queue as
 * {@link PendingCommentToken}s so that dedents can later be placed relative to those comments.
 */
protected void processSpecialTokens() {
int tokenStart = getBaseTokenStart();
if (isBaseAt(PyTokenTypes.LINE_BREAK)) {
processLineBreak(tokenStart);
if (isBaseAt(getCommentTokenType())) {
// Remember the index of the last queued token (judging by the field name, the line break that
// precedes the first comment); closeDanglingSuitesWithComments() starts inserting from there.
myLineBreakBeforeFirstCommentIndex = myTokenQueue.size() - 1;
while (isBaseAt(getCommentTokenType())) {
// comment at start of line; maybe we need to generate dedent before the comments
final int commentEnd = getBaseTokenEnd();
// The comment records myLastNewLineIndent, i.e. the indent processIndent() measured last.
myTokenQueue.add(new PendingCommentToken(getBaseTokenType(), getBaseTokenStart(), commentEnd, myLastNewLineIndent));
advanceBase();
if (isBaseAt(PyTokenTypes.LINE_BREAK)) {
processLineBreak(getBaseTokenStart());
}
// Treat EOF as an indent of size 0
else if (getBaseTokenType() == null) {
closeDanglingSuitesWithComments(0, commentEnd);
}
else {
// Something other than a line break follows the comment: stop collecting comments.
break;
}
}
// Comment collection is over; switch the insertion index back to "append at the end".
myLineBreakBeforeFirstCommentIndex = -1;
}
}
else if (isBaseAt(PyTokenTypes.BACKSLASH)) {
processBackslash(tokenStart);
}
else if (isBaseAt(PyTokenTypes.SPACE)) {
processSpace();
}
}
/**
 * Consumes a run of SPACE tokens from the base lexer. If the run is followed by a line break or a
 * backslash, the run is folded into that token's processing (which starts at the run's start offset);
 * otherwise a single SPACE token covering the whole run is queued.
 */
private void processSpace() {
  final int runStart = getBaseTokenStart();
  int runEnd = getBaseTokenEnd();
  while (isBaseAt(PyTokenTypes.SPACE)) {
    runEnd = getBaseTokenEnd();
    advanceBase();
  }

  final IElementType follower = getBaseTokenType();
  if (follower == PyTokenTypes.LINE_BREAK) {
    processLineBreak(runStart);
    return;
  }
  if (follower == PyTokenTypes.BACKSLASH) {
    processBackslash(runStart);
    return;
  }
  myTokenQueue.add(new PendingToken(PyTokenTypes.SPACE, runStart, runEnd));
}
/**
 * Queues the backslash the base lexer is positioned at, followed by any whitespace tokens after it.
 * When a line break comes next, the queued backslash is retyped to SPACE and the line break is
 * queued as an insignificant one that does not swallow further line breaks. In every case
 * {@link #processSpecialTokens()} is requested to run again on the next {@link #advance()}.
 *
 * @param tokenStart start offset to use for the queued backslash token
 */
private void processBackslash(int tokenStart) {
  final PendingToken backslash = new PendingToken(getBaseTokenType(), tokenStart, getBaseTokenEnd());
  myTokenQueue.add(backslash);
  advanceBase();

  for (IElementType type = getBaseTokenType(); PyTokenTypes.WHITESPACE.contains(type); type = getBaseTokenType()) {
    myTokenQueue.add(new PendingToken(type, getBaseTokenStart(), getBaseTokenEnd()));
    advanceBase();
  }

  if (isBaseAt(PyTokenTypes.LINE_BREAK)) {
    backslash.setType(PyTokenTypes.SPACE);
    processInsignificantLineBreak(getBaseTokenStart(), true);
  }
  myProcessSpecialTokensPending = true;
}
/**
 * Handles the line break the base lexer is positioned at.
 * <p>
 * Inside braces, or inside an f-string part where line breaks carry no meaning, the line break is
 * queued as insignificant. Otherwise it ends the statement: a zero-length STATEMENT_BREAK is queued
 * if the line had significant tokens, the per-line flag is cleared, and the indentation of the
 * following line is processed.
 *
 * @param startPos start offset for the queued tokens
 */
protected void processLineBreak(int startPos) {
  if (myBraceLevel != 0 || !isOutsideFStringOrInsideItsLineBreakSensitiveTextPart()) {
    processInsignificantLineBreak(startPos, false);
    return;
  }

  if (myLineHasSignificantTokens) {
    pushToken(PyTokenTypes.STATEMENT_BREAK, startPos, startPos);
  }
  myLineHasSignificantTokens = false;
  advanceBase();
  processIndent(startPos, PyTokenTypes.LINE_BREAK);
}
/**
 * Tells whether a line break at the current position should be treated as significant as far as
 * f-strings are concerned: either no f-string is open, or the innermost one has a single-character
 * quote and the lexer is in its plain text or in a type-conversion/format part of a fragment.
 */
private boolean isOutsideFStringOrInsideItsLineBreakSensitiveTextPart() {
  if (myFStringStack.isEmpty()) {
    return true;
  }
  final FString innermost = myFStringStack.peek();
  // Quotes longer than one character (triple quotes) allow line breaks in any plain-text part.
  final boolean singleCharQuotes = innermost.quotes().length() == 1;
  if (!singleCharQuotes) {
    return false;
  }
  final Stack<FStringFragmentPart> parts = innermost.fragments();
  return parts.isEmpty() || parts.peek() == FStringFragmentPart.TYPE_CONVERSION_OR_FORMAT;
}
/**
 * Queues a single LINE_BREAK token for a line break that does not end a statement, merging the
 * SPACE and TAB tokens that follow it into the same token. Unless {@code breakStatementOnLineBreak}
 * is set, further line breaks are merged as well. Afterwards {@link #processSpecialTokens()} is
 * requested to run again on the next {@link #advance()}.
 *
 * @param startPos                  start offset of the queued LINE_BREAK token
 * @param breakStatementOnLineBreak when {@code true}, merging stops at the next line break
 */
protected void processInsignificantLineBreak(int startPos,
                                             boolean breakStatementOnLineBreak) {
  int mergedEnd = getBaseTokenEnd();
  advanceBase();
  while (true) {
    final IElementType type = getBaseTokenType();
    final boolean indentWhitespace = type == PyTokenTypes.SPACE || type == PyTokenTypes.TAB;
    final boolean mergeableLineBreak = !breakStatementOnLineBreak && type == PyTokenTypes.LINE_BREAK;
    if (!indentWhitespace && !mergeableLineBreak) {
      break;
    }
    mergedEnd = getBaseTokenEnd();
    advanceBase();
  }
  myTokenQueue.add(new PendingToken(PyTokenTypes.LINE_BREAK, startPos, mergedEnd));
  myProcessSpecialTokensPending = true;
}
/**
 * Measures the indentation of the upcoming line and queues the whitespace token together with the
 * INDENT / DEDENT tokens that the change of indentation calls for.
 * <p>
 * The measured indent is stored in {@code myLastNewLineIndent}. If the line starts with a comment,
 * the indent on top of the stack is used instead, so no INDENT or DEDENT is produced for it.
 *
 * @param whiteSpaceStart     start offset of the queued whitespace token
 * @param whitespaceTokenType element type to give to the queued whitespace token
 */
protected void processIndent(int whiteSpaceStart, IElementType whitespaceTokenType) {
  final int previousIndent = myIndentStack.topInt();
  final int measuredIndent = getNextLineIndent();
  myLastNewLineIndent = measuredIndent;

  // A line consisting only of an end-of-line comment and whitespace must not change the nesting.
  final int indent = getBaseTokenType() == getCommentTokenType() ? previousIndent : measuredIndent;
  final int whiteSpaceEnd = getBaseTokenType() == null ? super.getBufferEnd() : getBaseTokenStart();
  final PendingToken whitespace = new PendingToken(whitespaceTokenType, whiteSpaceStart, whiteSpaceEnd);

  if (indent < previousIndent) {
    closeDanglingSuitesWithComments(indent, whiteSpaceStart);
    myTokenQueue.add(whitespace);
    return;
  }
  if (indent == previousIndent) {
    myTokenQueue.add(whitespace);
    return;
  }

  myIndentStack.push(indent);
  myTokenQueue.add(whitespace);
  final int insertIndex = skipPrecedingCommentsWithIndent(indent, myTokenQueue.size() - 1);
  final int indentOffset;
  if (insertIndex == myTokenQueue.size()) {
    indentOffset = whiteSpaceEnd;
  }
  else {
    indentOffset = myTokenQueue.get(insertIndex).getStart();
  }
  myTokenQueue.add(insertIndex, new PendingToken(PyTokenTypes.INDENT, indentOffset, indentOffset));
}
/**
 * Pops every indent-stack entry that is deeper than {@code indent} and queues one zero-length DEDENT
 * token per popped entry, positioning the DEDENTs relative to comment tokens already in the queue.
 *
 * @param indent          indentation to unwind the indent stack to
 * @param whiteSpaceStart offset used for the DEDENT / INCONSISTENT_DEDENT tokens that are appended
 *                        at the end of the queue
 */
private void closeDanglingSuitesWithComments(int indent, int whiteSpaceStart) {
int lastIndent = myIndentStack.topInt();
// Start inserting at the index remembered by processSpecialTokens() while comments are being
// collected; otherwise append at the end of the queue.
int insertIndex = myLineBreakBeforeFirstCommentIndex == -1 ? myTokenQueue.size() : myLineBreakBeforeFirstCommentIndex;
int lastSuiteIndent;
while (indent < lastIndent) {
lastSuiteIndent = myIndentStack.popInt();
lastIndent = myIndentStack.topInt();
int dedentOffset = whiteSpaceStart;
if (indent > lastIndent) {
// The target indent lies strictly between two stack entries: report it and put the DEDENT
// after the INCONSISTENT_DEDENT marker at the end of the queue.
myTokenQueue.add(new PendingToken(PyTokenTypes.INCONSISTENT_DEDENT, whiteSpaceStart, whiteSpaceStart));
insertIndex = myTokenQueue.size();
}
else {
// Move the insertion point past queued comments whose recorded indent is not smaller than
// the indent of the suite being closed.
insertIndex = skipPrecedingCommentsWithSameIndentOnSuiteClose(lastSuiteIndent, insertIndex);
}
if (insertIndex != myTokenQueue.size()) {
// Inserting in the middle of the queue: place the DEDENT at the start of the token it precedes.
dedentOffset = myTokenQueue.get(insertIndex).getStart();
}
myTokenQueue.add(insertIndex, new PendingToken(PyTokenTypes.DEDENT, dedentOffset, dedentOffset));
// The next DEDENT (if any) goes after the one just inserted.
insertIndex++;
}
}
/**
 * Walks backwards through the queue from {@code index}, stepping over comment tokens whose recorded
 * indent equals {@code indent} (and over a line break that separates two comment tokens), and returns
 * the queue index in front of that run. {@link #processIndent} uses the result as the position at
 * which to insert an INDENT token, so that such comments end up after the INDENT.
 *
 * @param indent indent the comments must have been recorded with
 * @param index  queue index to start walking back from
 * @return the index in front of the skipped comments, or the queue size when no comment was skipped
 */
protected int skipPrecedingCommentsWithIndent(int indent, int index) {
  int position = index;
  boolean sawComment = false;
  while (position > 0) {
    final PendingToken previous = myTokenQueue.get(position - 1);
    if (!(previous instanceof PendingCommentToken comment) || comment.getIndent() != indent) {
      break;
    }
    sawComment = true;
    position--;

    final boolean lineBreakBetweenComments = position > 1
                                             && myTokenQueue.get(position - 1).getType() == PyTokenTypes.LINE_BREAK
                                             && myTokenQueue.get(position - 2) instanceof PendingCommentToken;
    if (lineBreakBetweenComments) {
      position--;
    }
  }
  return sawComment ? position : myTokenQueue.size();
}
/**
 * Scans the queue forward from {@code anchorIndex} and returns the index just past the last comment
 * token seen before the first comment whose recorded indent is smaller than {@code indent}.
 * Non-comment tokens are stepped over without affecting the result.
 *
 * @param indent      indent of the suite being closed
 * @param anchorIndex queue index to start scanning from
 * @return {@code anchorIndex} when no qualifying comment is found, otherwise the index after the last one
 */
protected int skipPrecedingCommentsWithSameIndentOnSuiteClose(int indent, int anchorIndex) {
  int insertionPoint = anchorIndex;
  for (int i = anchorIndex; i < myTokenQueue.size(); i++) {
    if (!(myTokenQueue.get(i) instanceof PendingCommentToken comment)) {
      continue;
    }
    if (comment.getIndent() < indent) {
      break;
    }
    insertionPoint = i + 1;
  }
  return insertionPoint;
}
/**
 * Consumes whitespace and line-break tokens from the base lexer and returns the indentation width
 * of the line on which the next other token is found. A space adds 1, a tab advances to the next
 * multiple of 8, and a line break resets the width to 0.
 *
 * @return the measured width, or 0 when the end of input is reached
 */
protected int getNextLineIndent() {
  int width = 0;
  IElementType type = getBaseTokenType();
  while (type != null && PyTokenTypes.WHITESPACE_OR_LINEBREAK.contains(type)) {
    if (type == PyTokenTypes.TAB) {
      width = (width / 8 + 1) * 8;
    }
    else if (type == PyTokenTypes.SPACE) {
      width++;
    }
    else if (type == PyTokenTypes.LINE_BREAK) {
      width = 0;
    }
    advanceBase();
    type = getBaseTokenType();
  }
  return type == null ? 0 : width;
}
/**
 * Appends a copy of the base lexer's current token (type and offsets) to the token queue.
 */
private void pushCurrentToken() {
myTokenQueue.add(new PendingToken(getBaseTokenType(), getBaseTokenStart(), getBaseTokenEnd()));
}
/**
 * Returns the element type this processor treats as a comment; {@link PyTokenTypes#END_OF_LINE_COMMENT} here.
 */
protected IElementType getCommentTokenType() {
return PyTokenTypes.END_OF_LINE_COMMENT;
}
/**
 * State of one open f-string: {@code quotes} is the text of its opening quotes (the FSTRING_START
 * token text after the prefix), and {@code fragments} is the stack of fragment parts currently open in it.
 */
private record FString(@NotNull String quotes, @NotNull Stack<FStringFragmentPart> fragments) {
}
/**
 * The part of an f-string fragment the lexer is currently in, as tracked by checkFString().
 */
private enum FStringFragmentPart {
// Pushed when FSTRING_FRAGMENT_START is seen.
EXPRESSION,
// Replaces the topmost part when FSTRING_FRAGMENT_FORMAT_START or FSTRING_FRAGMENT_TYPE_CONVERSION is seen.
TYPE_CONVERSION_OR_FORMAT,
}
}