Files
openide/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py
Natalia.Murycheva 0656b7db65 DS-6230 Statistics Computations improvements
Improved the performance of computing visualization data for numeric pandas columns.
Added a limit on the number of rows for which statistics are computed (the describe function takes too long on data frames of that size or larger).

GitOrigin-RevId: 507ccfe63fb77b860febf94fe9584e0d7025a10f
2024-02-21 18:36:42 +00:00

260 lines
9.0 KiB
Python

# Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
import numpy as np
import pandas as pd
import typing
# Separator placed between consecutive dtype names in the string built by get_column_types().
TABLE_TYPE_NEXT_VALUE_SEPARATOR = '__pydev_table_column_type_val__'
# 'display.max_colwidth' value returned by __get_tables_display_options() when running on Python 2.
MAX_COLWIDTH_PYTHON_2 = 100000
def get_type(table):
    # type: (str) -> str
    """Return the string representation of the runtime type of ``table``."""
    table_type = type(table)
    return str(table_type)
# noinspection PyUnresolvedReferences
def get_shape(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    """Return the size of the first dimension of ``table`` (its row count) as a string."""
    row_count = table.shape[0]
    return str(row_count)
# noinspection PyUnresolvedReferences
def get_head(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    """Return the ``repr`` of the notebook-style HTML rendering of the head of ``table``.

    The table is first converted to a DataFrame; no limit is put on the
    number of rendered columns (``max_cols=None``).
    """
    frame = __convert_to_df(table)
    head_html = frame.head().to_html(notebook=True, max_cols=None)
    return repr(head_html)
# noinspection PyUnresolvedReferences
def get_column_types(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    """Return the index dtype followed by every column dtype, as one string.

    The dtype names are separated by TABLE_TYPE_NEXT_VALUE_SEPARATOR; the
    separator after the index dtype is always emitted, even when the frame
    has no columns.
    """
    frame = __convert_to_df(table)
    separator = TABLE_TYPE_NEXT_VALUE_SEPARATOR
    index_type = str(frame.index.dtype)
    column_types = separator.join(str(dtype) for dtype in frame.dtypes)
    return index_type + separator + column_types
# used by pydevd
# noinspection PyUnresolvedReferences
def get_data(table, start_index=None, end_index=None):
    # type: (Union[pd.DataFrame, pd.Series], int, int) -> str
    """Return the ``repr`` of the notebook-style HTML rendering of ``table``.

    Slicing by ``start_index``/``end_index`` and the handling of the pandas
    display options are delegated to ``_compute_sliced_data``.
    """
    def render_html(data, max_cols):
        frame = __convert_to_df(data)
        return repr(frame.to_html(notebook=True, max_cols=max_cols))

    return _compute_sliced_data(table, render_html, start_index, end_index)
# used by DSTableCommands
# noinspection PyUnresolvedReferences
def display_data(table, start_index, end_index):
    # type: (Union[pd.DataFrame, pd.Series], int, int) -> None
    """Show ``table`` (or the requested slice of it) through IPython's ``display``.

    Slicing and the handling of the pandas display options are delegated to
    ``_compute_sliced_data``; nothing is returned.
    """
    def show_frame(data, max_cols):
        # IPython is imported only when the callback actually runs.
        from IPython.display import display
        frame = __convert_to_df(data)
        display(frame)

    _compute_sliced_data(table, show_frame, start_index, end_index)
def __get_data_slice(table, start, end):
    """Convert ``table`` to a DataFrame and return its rows at positions ``start:end``."""
    frame = __convert_to_df(table)
    row_span = slice(start, end)
    return frame.iloc[row_span]
def _compute_sliced_data(table, fun, start_index=None, end_index=None):
    # type: (Union[pd.DataFrame, pd.Series], function, int, int) -> str
    """Apply ``fun`` to ``table`` (optionally sliced) under temporary pandas display options.

    ``display.max_columns`` and ``display.max_colwidth`` are replaced with the
    values from ``__get_tables_display_options()`` while ``fun`` runs, and the
    previous values are restored afterwards, including when slicing or
    ``fun`` raises.

    :param table: the data to process.
    :param fun: callable invoked as ``fun(table, max_cols)``; its result is returned.
    :param start_index: first row position of the slice; slicing happens only
        when both indices are not None.
    :param end_index: end row position (exclusive) of the slice.
    :return: whatever ``fun`` returns.
    """
    max_cols, max_colwidth = __get_tables_display_options()

    saved_max_cols = pd.get_option('display.max_columns')
    saved_max_colwidth = pd.get_option('display.max_colwidth')

    try:
        pd.set_option('display.max_columns', max_cols)
        pd.set_option('display.max_colwidth', max_colwidth)

        if start_index is not None and end_index is not None:
            table = __get_data_slice(table, start_index, end_index)

        return fun(table, max_cols)
    finally:
        # These options are process-wide: always put the user's values back,
        # otherwise a failure in `fun` would leave them overridden.
        pd.set_option('display.max_columns', saved_max_cols)
        pd.set_option('display.max_colwidth', saved_max_colwidth)
def get_column_descriptions(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    """Return the rendered ``describe`` statistics of ``table``.

    An empty string is returned when the statistics could not be computed
    (``__get_describe`` returned None).
    """
    described_result = __get_describe(table)
    if described_result is None:
        return ""
    return get_data(described_result, None, None)
def get_value_counts(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    """Return the rendered per-column counts computed by ``__get_counts``."""
    return get_data(__get_counts(table), None, None)
def __get_describe(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series, None]
    """Compute ``describe`` statistics for ``table``.

    The 5%, 25%, 50%, 75% and 95% percentiles are requested and complex
    dtypes are passed as ``exclude``.

    :param table: a pandas Series or DataFrame (subclasses are accepted).
    :return: the ``describe`` result; for a DataFrame it is reindexed to the
        column order of ``table``. None is returned when ``describe`` raises
        TypeError, OverflowError or ValueError.
    """
    try:
        described_ = table.describe(percentiles=[.05, .25, .5, .75, .95],
                                    exclude=[np.complex64, np.complex128])
    except (TypeError, OverflowError, ValueError):
        return None

    # isinstance (not an exact type check) so that Series subclasses do not
    # fall into the DataFrame branch, where `table.columns` would fail.
    if isinstance(table, pd.Series):
        return described_
    return described_.reindex(columns=table.columns, copy=False)
def __get_counts(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> pd.DataFrame
    """Return a one-row DataFrame holding the ``count()`` result of every column of ``table``."""
    frame = __convert_to_df(table)
    per_column_counts = frame.count()
    return per_column_counts.to_frame().T
class ColumnVisualisationType:
    """String tags naming the kind of visualisation data computed for a column."""

    # Returned by analyze_column for boolean and numeric columns.
    HISTOGRAM = 'histogram'
    # Returned by analyze_categorical_column together with the number of distinct values.
    UNIQUE = 'unique'
    # Returned by analyze_categorical_column together with per-value percentages.
    PERCENTAGE = 'percentage'
class ColumnVisualisationUtils:
    """Constants shared by the column visualisation helpers of this module."""

    # Number of histogram bins; also the distinct-value threshold used by analyze_numeric_column.
    NUM_BINS = 20
    # Upper bound on the distinct values listed individually by analyze_categorical_column.
    MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS = 3
    # Distinct-values share (in percent) up to which percentages are reported.
    UNIQUE_VALUES_PERCENT = 50

    # Separator between the serialized results of consecutive columns.
    TABLE_OCCURRENCES_COUNT_NEXT_COLUMN_SEPARATOR = '__pydev_table_occurrences_count_next_column__'
    # Separator between consecutive key/value pairs of one column.
    TABLE_OCCURRENCES_COUNT_NEXT_VALUE_SEPARATOR = '__pydev_table_occurrences_count_next_value__'
    # Separator between a key and its value inside one pair.
    TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR = '__pydev_table_occurrences_count_dict__'
    # Key under which the aggregated "other values" entry is stored.
    TABLE_OCCURRENCES_COUNT_OTHER = '__pydev_table_other__'
def get_value_occurrences_count(table):
    """Return the serialized visualisation data of every column of ``table``.

    Each column is analysed with ``analyze_column`` and rendered as the string
    form of a one-entry dict ``{visualisation type: result}``; the per-column
    strings are joined with TABLE_OCCURRENCES_COUNT_NEXT_COLUMN_SEPARATOR.
    """
    frame = __convert_to_df(table)

    def serialize_column(col_name):
        vis_type, result = analyze_column(frame[col_name])
        return str({vis_type: result})

    serialized_columns = [serialize_column(col_name) for col_name in frame.columns]
    column_separator = ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_NEXT_COLUMN_SEPARATOR
    return column_separator.join(serialized_columns)
def analyze_column(column):
    # type: (pd.Series) -> tuple
    """Pick the visualisation for ``column`` from its dtype and compute its data.

    :param column: a single column of the table.
    :return: a ``(visualisation type, serialized result)`` pair. Boolean and
        numeric columns use ColumnVisualisationType.HISTOGRAM; object, string,
        datetime, timedelta, complex and all-NA columns are handled by
        ``analyze_categorical_column``. ``(None, "{}")`` is returned for a
        dtype that matches none of these branches.
    """
    col_type = column.dtype
    if col_type == bool:
        return ColumnVisualisationType.HISTOGRAM, analyze_boolean_column(column)
    if col_type.kind in ('O', 'S', 'U', 'M', 'm', 'c') or column.isna().all():
        return analyze_categorical_column(column)
    if col_type.kind in ('i', 'f', 'u'):
        return ColumnVisualisationType.HISTOGRAM, analyze_numeric_column(column)
    # Callers unpack the result into two values, so a dtype that matches no
    # branch (e.g. pandas' nullable ``boolean`` extension dtype) must still
    # yield a pair. Use the same "no visualisation" pair that
    # analyze_categorical_column returns for data it cannot process.
    return None, "{}"
def analyze_boolean_column(column):
    """Return the serialized occurrence counts of the values of a boolean column.

    The counts are ordered by value (``sort_index``) before serialization.
    """
    counts_by_value = column.value_counts().sort_index()
    pairs = counts_by_value.to_dict().items()
    return add_custom_key_value_separator(pairs)
def analyze_categorical_column(column):
# Processing of unhashable types (lists, dicts, etc.).
# In Polars these types are NESTED and can be processed separately, but in Pandas they are Objects
if len(column) == 0 or not isinstance(column.iloc[0], typing.Hashable):
return None, "{}"
value_counts = column.value_counts(dropna=False)
all_values = len(column)
vis_type = ColumnVisualisationType.PERCENTAGE
if len(value_counts) <= 3 or len(value_counts) / all_values * 100 <= ColumnVisualisationUtils.UNIQUE_VALUES_PERCENT:
# If column contains <= 3 unique values no `Other` category is shown, but all of these values and their percentages
num_unique_values_to_show_in_vis = ColumnVisualisationUtils.MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS - (0 if len(value_counts) == 3 else 1)
top_values = value_counts.iloc[:num_unique_values_to_show_in_vis].apply(lambda count: round(count / all_values * 100, 1)).to_dict()
if len(value_counts) == 3:
top_values[ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_OTHER] = -1
else:
others_count = value_counts.iloc[num_unique_values_to_show_in_vis:].sum()
top_values[ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_OTHER] = round(others_count / all_values * 100, 1)
result = add_custom_key_value_separator(top_values.items())
else:
vis_type = ColumnVisualisationType.UNIQUE
top_values = len(value_counts)
result = top_values
return vis_type, result
def analyze_numeric_column(column):
    # type: (pd.Series) -> str
    """Return the serialized histogram data of a numeric column.

    When the column has at most NUM_BINS distinct values (for float columns
    the number of elements is used instead), the occurrence count of every
    value is reported. Otherwise the non-NA values are split into NUM_BINS
    equal-width bins and the count of each bin is reported under a
    ``"<left edge> \u2014 <right edge>"`` label.

    :param column: a column whose dtype kind is 'i', 'u' or 'f'.
    :return: the key/value pairs serialized by ``add_custom_key_value_separator``.
    """
    # todo: add tests for that
    if column.dtype.kind in ('i', 'u'):
        unique_values = _count_unique_integers(column)
    else:
        # for float type we don't compute number of unique values because it's an
        # expensive operation, just take number of elements in a column
        unique_values = column.size

    if unique_values <= ColumnVisualisationUtils.NUM_BINS:
        res = column.value_counts().sort_index().to_dict()
    else:
        # NOTE(review): only signed integers get integer labels; unsigned
        # columns are labelled like floats - confirm this is intended.
        format_function = int if column.dtype.kind == 'i' else lambda x: round(x, 1)
        counts, bin_edges = np.histogram(column.dropna(), bins=ColumnVisualisationUtils.NUM_BINS)
        # so the long dash will be correctly viewed both on Mac and Windows
        bin_labels = ['{} \u2014 {}'.format(format_function(bin_edges[i]), format_function(bin_edges[i+1])) for i in range(ColumnVisualisationUtils.NUM_BINS)]
        res = dict(zip(bin_labels, counts))
    return add_custom_key_value_separator(res.items())


def _count_unique_integers(column):
    # type: (pd.Series) -> int
    """Return the number of distinct non-missing values of an integer column."""
    # np.bincount allocates one counter per value up to the maximum and
    # rejects negative input, so it is used only for small non-negative values.
    max_bincount_value = 1 << 20
    if isinstance(column.dtype, np.dtype) and column.size > 0:
        if column.min() >= 0 and column.max() <= max_bincount_value:
            # The explicit cast lets unsigned 64-bit data through; the values
            # are known to fit because of the bound checked above.
            bins = np.bincount(np.asarray(column, dtype=np.intp))
            return int(np.count_nonzero(bins))
    # Negative, very large, extension-typed or empty data.
    return column.nunique()
def add_custom_key_value_separator(pairs_list):
    """Serialize ``(key, value)`` pairs into a single string.

    Key and value of a pair are separated by
    TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR, consecutive pairs by
    TABLE_OCCURRENCES_COUNT_NEXT_VALUE_SEPARATOR.
    """
    pair_separator = ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_NEXT_VALUE_SEPARATOR
    key_value_separator = ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR
    serialized_pairs = []
    for key, value in pairs_list:
        serialized_pairs.append('{}{}{}'.format(key, key_value_separator, value))
    return pair_separator.join(serialized_pairs)
# noinspection PyUnresolvedReferences
def __convert_to_df(table):
    # type: (Union[pd.DataFrame, pd.Series, pd.Categorical]) -> pd.DataFrame
    """Return ``table`` as a DataFrame.

    Series and Categorical objects (including instances of their subclasses)
    are converted; any other object is returned unchanged.
    """
    # isinstance rather than an exact type comparison, so that subclasses are
    # converted as well instead of being returned as they are.
    if isinstance(table, pd.Series):
        return __series_to_df(table)
    if isinstance(table, pd.Categorical):
        return __categorical_to_df(table)
    return table
# pandas.Series support
def __get_column_name(table):
    # type: (pd.Series) -> str
    """Return the name of the series, or ``'<unnamed>'`` when its name is None."""
    series_name = table.name
    # noinspection PyTypeChecker
    return '<unnamed>' if series_name is None else series_name
def __series_to_df(table):
    # type: (pd.Series) -> pd.DataFrame
    """Convert a series to a one-column DataFrame named by ``__get_column_name``."""
    column_name = __get_column_name(table)
    return table.to_frame(name=column_name)
# numpy.array support
def __array_to_df(table):
    # type: (np.ndarray) -> pd.DataFrame
    """Build a DataFrame from the given array."""
    frame = pd.DataFrame(table)
    return frame
def __categorical_to_df(table):
    # type: (pd.Categorical) -> pd.DataFrame
    """Build a DataFrame from the given Categorical."""
    frame = pd.DataFrame(table)
    return frame
# In old versions of pandas max_colwidth accepted only Int-s
def __get_tables_display_options():
    # type: () -> Tuple[None, Union[int, None]]
    """Return the ``(max_columns, max_colwidth)`` pair to use while rendering tables.

    ``max_columns`` is always None. ``max_colwidth`` is MAX_COLWIDTH_PYTHON_2
    on Python 2 and None on Python 3 and later.
    """
    import sys

    running_python_2 = sys.version_info < (3, 0)
    max_colwidth = MAX_COLWIDTH_PYTHON_2 if running_python_2 else None
    return None, max_colwidth