DS-6230 Statistics Computations improvements

Improved the performance of computing visualization data for numeric columns in pandas.
Added a limit on the number of rows for which statistics are computed (the describe function takes too long for data frames of that size or larger).

GitOrigin-RevId: 507ccfe63fb77b860febf94fe9584e0d7025a10f
This commit is contained in:
Natalia.Murycheva
2024-02-16 16:45:38 +01:00
committed by intellij-monorepo-bot
parent 475c105d35
commit 0656b7db65

View File

@@ -121,6 +121,7 @@ class ColumnVisualisationType:
UNIQUE = "unique"
PERCENTAGE = "percentage"
class ColumnVisualisationUtils:
NUM_BINS = 20
MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS = 3
@@ -131,6 +132,7 @@ class ColumnVisualisationUtils:
TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR = '__pydev_table_occurrences_count_dict__'
TABLE_OCCURRENCES_COUNT_OTHER = '__pydev_table_other__'
def get_value_occurrences_count(table):
df = __convert_to_df(table)
bin_counts = []
@@ -186,7 +188,14 @@ def analyze_categorical_column(column):
def analyze_numeric_column(column):
unique_values = column.nunique()
# todo: add tests for that
if column.dtype.kind in ['i', 'u']:
bins = np.bincount(column)
unique_values = np.count_nonzero(bins)
else:
# for float type we don't compute number of unique values because it's an
# expensive operation, just take number of elements in a column
unique_values = column.size
if unique_values <= ColumnVisualisationUtils.NUM_BINS:
res = column.value_counts().sort_index().to_dict()
else: