DS-6230 Statistics Computations improvements

Improved the performance of computing visualization data for numeric pandas columns. Added a limit on the number of rows for which statistics are computed (the describe function takes too long on data frames at or above that size). GitOrigin-RevId: 507ccfe63fb77b860febf94fe9584e0d7025a10f
2025-12-15 11:53:49 +07:00 · 2024-02-16 16:45:38 +01:00
parent 475c105d35
commit 0656b7db65
1 changed files with 10 additions and 1 deletions
--- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py
+++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py
@@ -121,6 +121,7 @@ class ColumnVisualisationType:
    UNIQUE = "unique"
    PERCENTAGE = "percentage"

+
 class ColumnVisualisationUtils:
    NUM_BINS = 20
    MAX_UNIQUE_VALUES_TO_SHOW_IN_VIS = 3
@@ -131,6 +132,7 @@ class ColumnVisualisationUtils:
    TABLE_OCCURRENCES_COUNT_DICT_SEPARATOR = '__pydev_table_occurrences_count_dict__'
    TABLE_OCCURRENCES_COUNT_OTHER = '__pydev_table_other__'

+
 def get_value_occurrences_count(table):
    df = __convert_to_df(table)
    bin_counts = []
@@ -186,7 +188,14 @@ def analyze_categorical_column(column):


 def analyze_numeric_column(column):
-    unique_values = column.nunique()
+    # todo: add tests for that
+    if column.dtype.kind in ['i', 'u']:
+        bins = np.bincount(column)
+        unique_values = np.count_nonzero(bins)
+    else:
+        # for float type we don't compute number of unique values because it's an
+        # expensive operation, just take number of elements in a column
+        unique_values = column.size
    if unique_values <= ColumnVisualisationUtils.NUM_BINS:
        res = column.value_counts().sort_index().to_dict()
    else: