Files
openide/python/helpers/pycharm_generator_utils/docstring_parsing.py
Mikhail Golubev 91c41bc40a Don't initialize docstring parser when only copying skeletons from the cache
Parser initialization is moved out of constants.py modules.
generator3.py imports modules_redeclarator.py only when it's needed.

Profiling results show that it takes about 1/4 of the total running time
in scenarios when generator3 only copies existing binary stubs.

GitOrigin-RevId: 8fa303e8449cde0ddd7bcb1241a17bb4ef91ad84
2019-08-05 20:04:17 +03:00

213 lines
8.2 KiB
Python

import re
import sys
from pycharm_generator_utils.constants import STR_TYPES
from pycharm_generator_utils.util_methods import sanitize_value
from six import PY2
if PY2:
from pycharm_generator_utils.pyparsing_py2 import *
else:
# noinspection PyUnresolvedReferences
from pycharm_generator_utils.pyparsing_py3 import *
# grammar to parse parameter lists
# // snatched from parsePythonValue.py, from pyparsing samples, copyright 2006 by Paul McGuire but under BSD license.
# we don't suppress lots of punctuation because we want it back when we reconstruct the lists
# One Literal matcher per character of "()[]{}:". lparen and rparen are not referenced again in
# this part of the file: tupleStr below uses Suppress("(") / Suppress(")") instead.
lparen, rparen, lbrack, rbrack, lbrace, rbrace, colon = map(Literal, "()[]{}:")
# Optional sign followed by digits, merged by Combine into a single token.
integer = Combine(Optional(oneOf("+ -")) + Word(nums)).setName("integer")
# Optional sign, digits, a mandatory ".", optional fraction digits and an optional exponent,
# merged by Combine into a single token.
real = Combine(Optional(oneOf("+ -")) + Word(nums) + "." +
Optional(Word(nums)) +
Optional(oneOf("e E") + Optional(oneOf("+ -")) + Word(nums))).setName("real")
# Forward declarations: the container rules and listItem refer to each other, so the containers
# are declared first and receive their bodies through "<<" once listItem exists.
tupleStr = Forward()
listStr = Forward()
dictStr = Forward()
boolLiteral = oneOf("True False")
noneLiteral = Literal("None")
# Any single literal value. "real" is listed before "integer" (alternatives joined with "|" are
# tried left to right), and of the three containers only the list is wrapped in a Group.
# quotedString and unicodeString are presumably supplied by the pyparsing star import above.
listItem = real | integer | quotedString | unicodeString | boolLiteral | noneLiteral | \
Group(listStr) | tupleStr | dictStr
# Tuple: the parentheses are suppressed; a trailing "," literal is accepted even when the tuple
# has no items.
tupleStr << (Suppress("(") + Optional(delimitedList(listItem)) +
Optional(Literal(",")) + Suppress(")")).setResultsName("tuple")
# List: the brackets are kept in the results; a trailing "," is accepted only after at least
# one item, because it sits inside the same Optional as the item list.
listStr << (lbrack + Optional(delimitedList(listItem) +
Optional(Literal(","))) + rbrack).setResultsName("list")
# One "key : value" pair, grouped so that each entry stays together in the results.
dictEntry = Group(listItem + colon + listItem)
# Dict: the braces are kept; trailing "," handling is the same as for lists.
dictStr << (lbrace + Optional(delimitedList(dictEntry) + Optional(Literal(","))) + rbrace).setResultsName("dict")
# \\ end of the snatched part
# our output format is s-expressions:
# (simple name optional_value) is name or name=value
# (nested (simple ...) (simple ...)) is (name, name,...)
# (opt ...) is [, ...] or suchlike.
# Node-type markers. The grammar below injects one of them as the first element of every parsed
# group via replaceWith(...); transform_seq / transform_optional_seq read it back as token[0].
T_SIMPLE = 'Simple'
T_NESTED = 'Nested'
T_OPTIONAL = 'Opt'
T_RETURN = "Ret"
# Canonical text that the "ellipsis" rule substitutes for whatever run of dots it matched.
TRIPLE_DOT = '...'
# Punctuation that has to be present in the input but is dropped from the parse results.
COMMA = Suppress(",")
APOS = Suppress("'")
QUOTE = Suppress('"')
# An optional run of whitespace, matched explicitly and dropped from the results.
SP = Suppress(Optional(White()))
# Starts with a letter or "_"; the remaining characters may also be digits, "-" or ".".
ident = Word(alphas + "_", alphanums + "_-.").setName("ident") # we accept things like "foo-or-bar"
decorated_ident = ident + Optional(Suppress(SP + Literal(":") + SP + ident)) # accept "foo: bar", ignore "bar"
# Several decorated identifiers separated by single literal spaces, merged into one token.
spaced_ident = Combine(
decorated_ident + ZeroOrMore(Literal(' ') + decorated_ident)) # we accept 'list or tuple' or 'C struct'
# allow quoted names, because __setattr__, etc docs use it
# (the surrounding apostrophes / double quotes are suppressed, only the name is kept)
paramname = spaced_ident | \
APOS + spaced_ident + APOS | \
QUOTE + spaced_ident + QUOTE
# Unlike tupleStr, this variant keeps the parentheses as Literal tokens and asks delimitedList
# to combine its items (combine=True); it is only used for default values in "initializer".
parenthesized_tuple = (Literal("(") + Optional(delimitedList(listItem, combine=True)) +
Optional(Literal(",")) + Literal(")")).setResultsName("(tuple)")
# "= value" with optional whitespace around the suppressed "="; the value is a parenthesized
# tuple, any listItem or an identifier, merged by Combine into a single token.
initializer = (SP + Suppress("=") + SP + Combine(parenthesized_tuple | listItem | ident)).setName(
"=init") # accept foo=defaultfoo
# One parameter as a group: the T_SIMPLE marker, the name (an optional "*" / "**" prefix is
# combined into it) and, when present, the default value. The group therefore has 2 or 3 items.
param = Group(Empty().setParseAction(replaceWith(T_SIMPLE)) + Combine(Optional(oneOf("* **")) + paramname) + Optional(
initializer))
# An ellipsis as a group of the T_SIMPLE marker and TRIPLE_DOT: two or more dots are matched
# and replaced by the canonical TRIPLE_DOT text.
ellipsis = Group(
Empty().setParseAction(replaceWith(T_SIMPLE)) + \
(Literal("..") +
ZeroOrMore(Literal('.'))).setParseAction(replaceWith(TRIPLE_DOT)) # we want to accept both 'foo,..' and 'foo, ...'
)
# A slot is either a plain parameter or a parenthesized group of slots. It is declared as a
# Forward because nestedParamSeq (via simpleParamSeq) refers back to it.
paramSlot = Forward()
# Zero or more comma-terminated slots, then optionally a last slot with an optional trailing comma.
simpleParamSeq = ZeroOrMore(paramSlot + COMMA) + Optional(paramSlot + Optional(COMMA))
# Parenthesized sequence as a group. The parse action on the opening "(" is what supplies the
# T_NESTED marker that transform_seq expects as the first element; the closing ")" is dropped.
# An ellipsis may appear after the leading slots, optionally followed by more slots.
nestedParamSeq = Group(
Suppress('(').setParseAction(replaceWith(T_NESTED)) + \
simpleParamSeq + Optional(ellipsis + Optional(COMMA) + Optional(simpleParamSeq)) + \
Suppress(')')
) # we accept "(a1, ... an)"
paramSlot << (param | nestedParamSeq)
# Declared as a Forward because paramSeq and optionalPart refer to each other, which allows
# optional parts to nest, e.g. "[a [, b]]".
optionalPart = Forward()
paramSeq = simpleParamSeq + Optional(optionalPart) # this is our approximate target
# Either a bracketed group tagged with T_OPTIONAL (an optional leading comma, a paramSeq, an
# optional ellipsis; both brackets are dropped) or a bare ellipsis group.
optionalPart << (
Group(
Suppress('[').setParseAction(replaceWith(T_OPTIONAL)) + Optional(COMMA) +
paramSeq + Optional(ellipsis) +
Suppress(']')
)
| ellipsis
)
# Return annotation as a group of the T_RETURN marker and an identifier. The introducer,
# either "->" or ": return", and the whitespace around it are suppressed.
return_type = Group(
Empty().setParseAction(replaceWith(T_RETURN)) +
Suppress(SP + (Literal("->") | (Literal(":") + SP + Literal("return"))) + SP) +
ident
)
# this is our ideal target, with balancing paren and a multiline rest of doc.
# The rest of the docstring is swallowed by a DOTALL regex and dropped from the results.
# The inline "(?s)" flag has to be the first thing in the pattern: a global flag placed anywhere
# else is deprecated since Python 3.6 and rejected by re.compile() from Python 3.11 on, which
# would make this module fail at import time.
paramSeqAndRest = paramSeq + Suppress(')') + Optional(return_type) + Suppress(Optional(Regex("(?s).*")))
def transform_seq(results, toplevel=True):
    """
    Transforms a tree of ParseResults into a param spec: a list of parameter strings.

    Every item of results is expected to be a group whose first element is one of the node-type
    markers T_SIMPLE, T_NESTED, T_OPTIONAL or T_RETURN.

    @param results sequence of parsed groups to transform.
    @param toplevel True for the outermost parameter list, False when called recursively for
    the contents of a parenthesized (T_NESTED) group.
    @return normally a list of strings; a nested group that yields several names contributes a
    nested list. Two shortcuts drop whatever has been collected so far: a default value inside
    a nested group returns the one-item list ["p_" + <first name>], and an ellipsis that is not
    turned into "*more" returns the plain string "<prefix>_tuple".
    @raise Exception if a group starts with an unknown marker.
    """
    is_clr = sys.platform == "cli"
    ret = []  # add here token to join
    for token in results:
        token_type = token[0]
        # The markers are ordinary strings, so they are compared by value; identity ("is")
        # would only work as long as the very same string objects come back from the parser.
        if token_type == T_SIMPLE:
            token_name = token[1]
            if len(token) == 3:  # name with value
                if toplevel:
                    ret.append(sanitize_ident(token_name, is_clr) + "=" + sanitize_value(token[2]))
                else:
                    # smth like "a, (b1=1, b2=2)", make it "a, p_b"
                    # NOTE: for each item of tuple, return the same name of its 1st item.
                    return ["p_" + results[0][1]]
            elif token_name == TRIPLE_DOT:
                if toplevel and not has_item_starting_with(ret, "*"):
                    ret.append("*more")
                else:
                    # we're in a "foo, (bar1, bar2, ...)"; make it "foo, bar_tuple"
                    # NOTE: this returns a str, not a list; the T_NESTED branch below appends it
                    # whole because its length is never 1.
                    return extract_alpha_prefix(results[0][1]) + "_tuple"
            else:  # just name
                ret.append(sanitize_ident(token_name, is_clr))
        elif token_type == T_NESTED:
            inner = transform_seq(token[1:], False)
            if len(inner) != 1:
                ret.append(inner)
            else:
                ret.append(inner[0])  # [foo] -> foo
        elif token_type == T_OPTIONAL:
            ret.extend(transform_optional_seq(token))
        elif token_type == T_RETURN:
            pass  # this is handled elsewhere
        else:
            raise Exception("This cannot be a token type: " + repr(token_type))
    return ret
def transform_optional_seq(results):
    """
    Produces a list of strings that describes the optional part of parameters.

    @param results must start from T_OPTIONAL; the remaining items are parsed groups.
    @return list of parameter strings: "name=value" for parameters that carry a default,
    "name=None" for bare names, plus the items of any nested optional group. As soon as an
    ellipsis is met, the list collected so far is dropped and a one-item list
    ["*" + <alpha prefix of the first item's name>] is returned instead.
    """
    # The markers are ordinary strings, so they are compared by value rather than by identity.
    assert results[0] == T_OPTIONAL, "transform_optional_seq expects a T_OPTIONAL node, sees " + \
                                     repr(results[0])
    is_clr = sys.platform == "cli"
    ret = []
    for token in results[1:]:
        token_type = token[0]
        if token_type == T_SIMPLE:
            token_name = token[1]
            if len(token) == 3:  # name with value; little sense, but can happen in a deeply nested optional
                ret.append(sanitize_ident(token_name, is_clr) + "=" + sanitize_value(token[2]))
            elif token_name == TRIPLE_DOT:
                # we're in a "foo, [bar, ...]"; make it "foo, *bar"
                # we must return a seq; [1] is first simple, [1][1] is its name
                return ["*" + extract_alpha_prefix(results[1][1])]
            else:  # just name
                ret.append(sanitize_ident(token_name, is_clr) + "=None")
        elif token_type == T_OPTIONAL:
            ret.extend(transform_optional_seq(token))
        # Groups with any other marker are skipped silently.
        # maybe handle T_NESTED if such cases ever occur in real life
        # it can't be nested in a sane case, really
    return ret
def has_item_starting_with(p_seq, p_start):
    """
    Tells whether p_seq contains a string that begins with p_start.

    Items that are not instances of STR_TYPES (e.g. nested lists) are ignored.

    @param p_seq iterable of items to inspect.
    @param p_start prefix to look for.
    @return True if at least one string item starts with p_start, False otherwise.
    """
    return any(
        isinstance(candidate, STR_TYPES) and candidate.startswith(p_start)
        for candidate in p_seq
    )
def sanitize_ident(x, is_clr=False):
    """
    Takes an identifier and returns it sanitized.

    @param x identifier text as found in a docstring; may contain "-", " " or ".".
    @param is_clr when true, a two-word name such as "int x" is reduced to its second word,
    which is then sanitized on its own (with is_clr off).
    @return "p_" + x for the reserved names listed below; otherwise x with every "-", " "
    and "." replaced by "_".
    """
    # NOTE: every entry needs its own comma. Adjacent string literals are concatenated silently,
    # which used to turn "unicode" "None" into the single, useless entry "unicodeNone".
    if x in ("class", "object", "def", "list", "tuple", "int", "float", "str", "unicode", "None"):
        return "p_" + x
    if is_clr:
        # it tends to have names like "int x", turn it to just x
        xs = x.split(" ")
        if len(xs) == 2:
            return sanitize_ident(xs[1])
    # for things like "list-or-tuple" or "list or tuple"
    return x.replace("-", "_").replace(" ", "_").replace(".", "_")
# 'foo_123' -> $1 = 'foo_'
NUM_IDENT_PATTERN = re.compile("([A-Za-z_]+)[0-9]?[A-Za-z_]*")


def extract_alpha_prefix(p_string, default_prefix="some"):
    """
    Returns 'foo' for things like 'foo1' or 'foo2'; if prefix cannot be found, the default is returned.

    @param p_string name to take the leading run of letters and underscores from.
    @param default_prefix value returned when p_string does not start with a letter or "_".
    @return the leading alphabetic part of p_string, or default_prefix.
    """
    found = NUM_IDENT_PATTERN.match(p_string)
    if not found:
        return default_prefix
    # The pattern has a single capturing group, so lastindex always points at it.
    leading = found.group(found.lastindex)
    if leading:
        return leading
    return default_prefix