# Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
import numpy as np
import pandas as pd
import typing

TABLE_TYPE_NEXT_VALUE_SEPARATOR = '__pydev_table_column_type_val__'
MAX_COLWIDTH_PYTHON_2 = 100000
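# TABLE_TYPE_NEXT_VALUE_SEPARATOR is the sentinel used by get_column_types() to join
# dtype strings into a single payload; MAX_COLWIDTH_PYTHON_2 is the column-width cap
# applied on Python 2 (see __get_tables_display_options()).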


def get_type(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    return str(type(table))


# noinspection PyUnresolvedReferences
def get_shape(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    return str(table.shape[0])


# noinspection PyUnresolvedReferences
def get_head(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    return repr(__convert_to_df(table).head().to_html(notebook=True, max_cols=None))


# noinspection PyUnresolvedReferences
def get_column_types(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    table = __convert_to_df(table)
    return str(table.index.dtype) + TABLE_TYPE_NEXT_VALUE_SEPARATOR + \
        TABLE_TYPE_NEXT_VALUE_SEPARATOR.join([str(t) for t in table.dtypes])
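# Example (illustrative): for a hypothetical frame pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
# get_column_types() typically returns
# 'int64__pydev_table_column_type_val__int64__pydev_table_column_type_val__object'
# (index dtype first, then one dtype per column).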


# used by pydevd
# noinspection PyUnresolvedReferences
def get_data(table, start_index=None, end_index=None):
    # type: (Union[pd.DataFrame, pd.Series], int, int) -> str

    def convert_data_to_html(data, max_cols):
        return repr(__convert_to_df(data).to_html(notebook=True, max_cols=max_cols))

    return _compute_sliced_data(table, convert_data_to_html, start_index, end_index)
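# Example (illustrative): get_data(df, 0, 100) renders rows 0..99 of a hypothetical
# DataFrame df as an HTML table wrapped in repr() so it can be transferred as a single
# string; with start_index=None and end_index=None the whole table is rendered.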


# used by DSTableCommands
# noinspection PyUnresolvedReferences
def display_data(table, start_index, end_index):
    # type: (Union[pd.DataFrame, pd.Series], int, int) -> None
    def ipython_display(data, max_cols):
        from IPython.display import display
        display(__convert_to_df(data))

    _compute_sliced_data(table, ipython_display, start_index, end_index)


def __get_data_slice(table, start, end):
    return __convert_to_df(table).iloc[start:end]


def _compute_sliced_data(table, fun, start_index=None, end_index=None):
    # type: (Union[pd.DataFrame, pd.Series], function, int, int) -> str

    max_cols, max_colwidth = __get_tables_display_options()

    _jb_max_cols = pd.get_option('display.max_columns')
    _jb_max_colwidth = pd.get_option('display.max_colwidth')

    pd.set_option('display.max_columns', max_cols)
    pd.set_option('display.max_colwidth', max_colwidth)

    if start_index is not None and end_index is not None:
        table = __get_data_slice(table, start_index, end_index)

    data = fun(table, max_cols)

    pd.set_option('display.max_columns', _jb_max_cols)
    pd.set_option('display.max_colwidth', _jb_max_colwidth)

    return data
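# Note (descriptive): _compute_sliced_data temporarily widens pandas'
# display.max_columns / display.max_colwidth options, applies the rendering callback
# to the (optionally sliced) table, and then restores the previous option values so
# the debugger's display settings do not leak into the user's session.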


def get_column_descriptions(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    described_result = __get_describe(table)

    if described_result is not None:
        return get_data(described_result, None, None)
    else:
        return ""


def get_value_counts(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> str
    counts_result = __get_counts(table)

    return get_data(counts_result, None, None)


def __get_describe(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series, None]
    try:
        described_ = table.describe(percentiles=[.05, .25, .5, .75, .95],
                                    exclude=[np.complex64, np.complex128])
    except (TypeError, OverflowError, ValueError):
        return

    if type(table) is pd.Series:
        return described_
    else:
        return described_.reindex(columns=table.columns, copy=False)
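# Note (descriptive): __get_describe returns None when describe() cannot process the
# data (TypeError/OverflowError/ValueError); for DataFrames the result is reindexed to
# the original column order so the statistics align with the columns of the source table.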


def __get_counts(table):
    # type: (Union[pd.DataFrame, pd.Series]) -> pd.DataFrame
    return __convert_to_df(table).count().to_frame().transpose()


class ColumnVisualisationType:
    HISTOGRAM = "histogram"
    UNIQUE = "unique"
    PERCENTAGE = "percentage"


class ColumnVisualisationUtils:
    NUM_BINS = 20
    MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS = 3
    UNIQUE_VALUES_PERCENT = 50

    TABLE_OCCURRENCES_COUNT_NEXT_COLUMN_SEPARATOR = '__pydev_table_occurrences_count_next_column__'
    TABLE_OCCURRENCES_COUNT_NEXT_VALUE_SEPARATOR = '__pydev_table_occurrences_count_next_value__'
    TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR = '__pydev_table_occurrences_count_dict__'
    TABLE_OCCURRENCES_COUNT_OTHER = '__pydev_table_other__'
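# Note (descriptive): the separator constants above are sentinel strings used to flatten
# per-column statistics into one transferable string; TABLE_OCCURRENCES_COUNT_OTHER labels
# the aggregated "Other" bucket, and analyze_categorical_column stores -1 under it when no
# separate Other category should be displayed.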


def get_value_occurrences_count(table):
    df = __convert_to_df(table)
    bin_counts = []

    for col_name in df.columns:
        column_visualisation_type, result = analyze_column(df[col_name])

        bin_counts.append(str({column_visualisation_type: result}))
    return ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_NEXT_COLUMN_SEPARATOR.join(bin_counts)
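# Example (illustrative): for a single boolean column holding [True, True, False] the
# per-column entry would look like
# "{'histogram': 'False__pydev_table_occurrences_count_dict__1__pydev_table_occurrences_count_next_value__True__pydev_table_occurrences_count_dict__2'}"
# and entries for multiple columns are then joined with
# TABLE_OCCURRENCES_COUNT_NEXT_COLUMN_SEPARATOR.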


def analyze_column(column):
    col_type = column.dtype

    if col_type == bool:
        return ColumnVisualisationType.HISTOGRAM, analyze_boolean_column(column)
    elif col_type.kind in ['O', 'S', 'U', 'M', 'm', 'c'] or column.isna().all():
        return analyze_categorical_column(column)
    elif col_type.kind in ['i', 'f', 'u']:
        return ColumnVisualisationType.HISTOGRAM, analyze_numeric_column(column)
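# Note (descriptive): the dtype.kind codes checked above are 'O' (object), 'S'/'U'
# (byte/unicode strings), 'M'/'m' (datetime/timedelta), 'c' (complex), 'i'/'u'
# (signed/unsigned integers) and 'f' (floats); columns that are entirely NaN are
# treated as categorical regardless of dtype.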


def analyze_boolean_column(column):
    res = column.value_counts().sort_index().to_dict()
    return add_custom_key_value_separator(res.items())


def analyze_categorical_column(column):
    # Processing of unhashable types (lists, dicts, etc.).
    # In Polars these types are NESTED and can be processed separately, but in Pandas they are Objects.
    if len(column) == 0 or not isinstance(column.iloc[0], typing.Hashable):
        return None, "{}"

    value_counts = column.value_counts(dropna=False)
    all_values = len(column)
    vis_type = ColumnVisualisationType.PERCENTAGE
    if len(value_counts) <= 3 or len(value_counts) / all_values * 100 <= ColumnVisualisationUtils.UNIQUE_VALUES_PERCENT:
        # If the column contains <= 3 unique values, no `Other` category is shown;
        # instead, all of those values and their percentages are reported.
        num_unique_values_to_show_in_vis = ColumnVisualisationUtils.MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS - (0 if len(value_counts) == 3 else 1)

        top_values = value_counts.iloc[:num_unique_values_to_show_in_vis].apply(lambda count: round(count / all_values * 100, 1)).to_dict()
        if len(value_counts) == 3:
            top_values[ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_OTHER] = -1
        else:
            others_count = value_counts.iloc[num_unique_values_to_show_in_vis:].sum()
            top_values[ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_OTHER] = round(others_count / all_values * 100, 1)
        result = add_custom_key_value_separator(top_values.items())
    else:
        vis_type = ColumnVisualisationType.UNIQUE
        top_values = len(value_counts)
        result = top_values
    return vis_type, result
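# Example (illustrative): for a column with values
# ['a', 'a', 'a', 'b', 'c', 'c', 'c', 'c', 'd', 'd'] there are 4 unique values covering
# 40% of the rows, so the PERCENTAGE branch reports the two most frequent values plus an
# aggregated bucket, roughly c -> 40.0, a -> 30.0, __pydev_table_other__ -> 30.0; a column
# where more than UNIQUE_VALUES_PERCENT percent of the values are unique is summarised
# as (UNIQUE, <number of unique values>) instead.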


def analyze_numeric_column(column):
    # todo: add tests for that
    if column.dtype.kind in ['i', 'u']:
        bins = np.bincount(column)
        unique_values = np.count_nonzero(bins)
    else:
        # For float types we don't compute the number of unique values because it's an
        # expensive operation; we just take the number of elements in the column.
        unique_values = column.size
    if unique_values <= ColumnVisualisationUtils.NUM_BINS:
        res = column.value_counts().sort_index().to_dict()
    else:
        format_function = int if column.dtype.kind == 'i' else lambda x: round(x, 1)
        counts, bin_edges = np.histogram(column.dropna(), bins=ColumnVisualisationUtils.NUM_BINS)

        # Use the Unicode escape \u2014 so the long dash renders correctly on both Mac and Windows.
        bin_labels = ['{} \u2014 {}'.format(format_function(bin_edges[i]), format_function(bin_edges[i + 1])) for i in range(ColumnVisualisationUtils.NUM_BINS)]
        bin_count_dict = {label: count for label, count in zip(bin_labels, counts)}
        res = bin_count_dict
    return add_custom_key_value_separator(res.items())
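# Example (illustrative): an integer column with more than NUM_BINS distinct values,
# e.g. pd.Series(range(100)), is summarised as 20 histogram buckets labelled roughly
# '0 \u2014 4' for the first bucket and '94 \u2014 99' for the last, each mapped to the
# number of values that fall into it; columns with few distinct values fall back to
# exact value counts.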


def add_custom_key_value_separator(pairs_list):
    return ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_NEXT_VALUE_SEPARATOR.join(
        ['{}{}{}'.format(key, ColumnVisualisationUtils.TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR, value) for key, value in pairs_list]
    )


# noinspection PyUnresolvedReferences
def __convert_to_df(table):
    # type: (Union[pd.DataFrame, pd.Series, pd.Categorical]) -> pd.DataFrame
    if type(table) is pd.Series:
        return __series_to_df(table)
    if type(table) is pd.Categorical:
        return __categorical_to_df(table)
    return table


# pandas.Series support
def __get_column_name(table):
    # type: (pd.Series) -> str
    if table.name is not None:
        # noinspection PyTypeChecker
        return table.name
    return '<unnamed>'


def __series_to_df(table):
    # type: (pd.Series) -> pd.DataFrame
    return table.to_frame(name=__get_column_name(table))


# numpy.array support
def __array_to_df(table):
    # type: (np.ndarray) -> pd.DataFrame
    return pd.DataFrame(table)


def __categorical_to_df(table):
    # type: (pd.Categorical) -> pd.DataFrame
    return pd.DataFrame(table)


# In old versions of pandas, max_colwidth accepted only ints.
def __get_tables_display_options():
    # type: () -> Tuple[None, Union[int, None]]
    import sys
    if sys.version_info < (3, 0):
        return None, MAX_COLWIDTH_PYTHON_2
    return None, None
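

# A minimal usage sketch (illustrative only), assuming pandas and numpy are available;
# the DataFrame, its column names and its data below are hypothetical and exist purely
# to exercise the public helpers in this module.
if __name__ == '__main__':
    _demo_df = pd.DataFrame({
        'age': np.random.randint(18, 65, size=100),   # numeric column -> histogram
        'group': ['a', 'b', 'c', 'd'] * 25,           # categorical column -> percentage/unique
        'flag': [True, False] * 50,                   # boolean column -> histogram
    })
    print(get_type(_demo_df))                     # e.g. "<class 'pandas.core.frame.DataFrame'>"
    print(get_shape(_demo_df))                    # number of rows, e.g. '100'
    print(get_column_types(_demo_df))             # dtypes joined with TABLE_TYPE_NEXT_VALUE_SEPARATOR
    print(get_value_occurrences_count(_demo_df))  # per-column visualisation payload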