mirror of
https://gitflic.ru/project/openide/openide.git
synced 2025-12-15 11:53:49 +07:00
DS-6230 Statistics Computations improvements
Improved the performance of computing visualization data for pandas columns of numeric types. Added a limit on the number of rows for which statistics are computed, because the describe function takes too long on data frames of that size or larger. GitOrigin-RevId: 507ccfe63fb77b860febf94fe9584e0d7025a10f
This commit is contained in:
committed by
intellij-monorepo-bot
parent
475c105d35
commit
0656b7db65
@@ -121,6 +121,7 @@ class ColumnVisualisationType:
|
||||
UNIQUE = "unique"
|
||||
PERCENTAGE = "percentage"
|
||||
|
||||
|
||||
class ColumnVisualisationUtils:
|
||||
NUM_BINS = 20
|
||||
MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS = 3
|
||||
@@ -131,6 +132,7 @@ class ColumnVisualisationUtils:
|
||||
TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR = '__pydev_table_occurrences_count_dict__'
|
||||
TABLE_OCCURRENCES_COUNT_OTHER = '__pydev_table_other__'
|
||||
|
||||
|
||||
def get_value_occurrences_count(table):
|
||||
df = __convert_to_df(table)
|
||||
bin_counts = []
|
||||
@@ -186,7 +188,14 @@ def analyze_categorical_column(column):
|
||||
|
||||
|
||||
def analyze_numeric_column(column):
|
||||
unique_values = column.nunique()
|
||||
# todo: add tests for that
|
||||
if column.dtype.kind in ['i', 'u']:
|
||||
bins = np.bincount(column)
|
||||
unique_values = np.count_nonzero(bins)
|
||||
else:
|
||||
# for float type we don't compute number of unique values because it's an
|
||||
# expensive operation, just take number of elements in a column
|
||||
unique_values = column.size
|
||||
if unique_values <= ColumnVisualisationUtils.NUM_BINS:
|
||||
res = column.value_counts().sort_index().to_dict()
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user