[pycharm] PY-71967 Add impl (except sorting in outputs -- to fix)

GitOrigin-RevId: ac079eefe71d7b32daa5e95eaef9c986ab842294
2025-12-16 22:51:17 +07:00 · 2024-06-24 01:22:20 +02:00
parent 58af8101d9
commit 2fc8ea1630
9 changed files with 123 additions and 3 deletions
--- a/notebooks/visualization/src/org/jetbrains/plugins/notebooks/visualization/outputs/statistic/NotebookOutputKeyType.kt
+++ b/notebooks/visualization/src/org/jetbrains/plugins/notebooks/visualization/outputs/statistic/NotebookOutputKeyType.kt
@@ -21,6 +21,7 @@ enum class NotebookOutputKeyType {
  POLARS_SERIES,
  PYSPARK_TABLE,
  R_MARKDOWN,
+  HF_DATASET,
  SVG,
  SWING_COMPONENT,
  TEST,
--- a/python/helpers/pydev/_pydevd_bundle/pydevd_tables.py
+++ b/python/helpers/pydev/_pydevd_bundle/pydevd_tables.py
@@ -84,5 +84,7 @@ def __get_table_provider(output):
            type_qualified_name.endswith('DataFrame')
            or type_qualified_name.endswith('Series')):
        import _pydevd_bundle.tables.pydevd_polars as table_provider
+    elif type_qualified_name == 'datasets.arrow_dataset.Dataset':
+        import _pydevd_bundle.tables.pydevd_dataset as table_provider

    return table_provider
--- a/python/helpers/pydev/_pydevd_bundle/pydevd_thrift.py
+++ b/python/helpers/pydev/_pydevd_bundle/pydevd_thrift.py
@@ -420,6 +420,10 @@ def sparse_tensor_to_thrift_struct(tensor, name, roffset, coffset, rows, cols, f
        pass


+def dataset_to_thrift_struct(dataset, name, roffset, coffset, rows, cols, format):
+    """Build the thrift struct for an object that exposes ``to_pandas()``.
+
+    The object is converted with ``to_pandas()`` and the result is handed to
+    ``dataframe_to_thrift_struct`` together with the remaining arguments, unchanged.
+    """
+    frame = dataset.to_pandas()
+    return dataframe_to_thrift_struct(frame, name, roffset, coffset, rows, cols, format)
+
+
 def array_to_meta_thrift_struct(array, name, format):
    type = array.dtype.kind
    slice = name
@@ -621,6 +625,7 @@ TYPE_TO_THRIFT_STRUCT_CONVERTERS = {
    "Tensor": tensor_to_thrift_struct,
    "DataFrame": dataframe_to_thrift_struct,
    "Series": dataframe_to_thrift_struct,
+    "Dataset": dataset_to_thrift_struct,
    "GeoDataFrame": dataframe_to_thrift_struct,
    "GeoSeries": dataframe_to_thrift_struct
 }
--- a/python/helpers/pydev/_pydevd_bundle/pydevd_vars.py
+++ b/python/helpers/pydev/_pydevd_bundle/pydevd_vars.py
@@ -774,6 +774,9 @@ def dataframe_to_xml(df, name, roffset, coffset, rows, cols, format):
    xml += array_data_to_xml(rows, cols, formatted_row_elements, format)
    return xml

+def dataset_to_xml(dataset, name, roffset, coffset, rows, cols, format):
+    """Serialise an object that exposes ``to_pandas()`` to XML.
+
+    The object is converted with ``to_pandas()`` and the result is handed to
+    ``dataframe_to_xml`` together with the remaining arguments, unchanged.
+    """
+    frame = dataset.to_pandas()
+    return dataframe_to_xml(frame, name, roffset, coffset, rows, cols, format)
+

 def array_data_to_xml(rows, cols, get_row, format):
    xml = "<arraydata rows=\"%s\" cols=\"%s\"/>\n" % (rows, cols)
@@ -820,7 +823,8 @@ TYPE_TO_XML_CONVERTERS = {
    "EagerTensor": tensor_to_xml,
    "ResourceVariable": tensor_to_xml,
    "SparseTensor": sparse_tensor_to_xml,
-    "Tensor": tensor_to_xml
+    "Tensor": tensor_to_xml,
+    "Dataset": dataset_to_xml
 }


--- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py
+++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py
@@ -0,0 +1,102 @@
+#  Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
+import numpy as np
+import pandas as pd
+import typing
+
+TABLE_TYPE_NEXT_VALUE_SEPARATOR = '__pydev_table_column_type_val__'
+MAX_COLWIDTH_PYTHON_2 = 100000
+BATCH_SIZE = 10000
+
+
+def get_type(table):
+    """Return the string form of the table's runtime type, i.e. ``str(type(table))``."""
+    table_type = type(table)
+    return str(table_type)
+
+
+# noinspection PyUnresolvedReferences
+def get_shape(table):
+    """Return the first element of ``table.shape`` rendered as a string.
+
+    NOTE(review): for a ``datasets`` Dataset this is presumably the row count -- confirm.
+    """
+    first_dimension = table.shape[0]
+    return str(first_dimension)
+
+
+# noinspection PyUnresolvedReferences
+def get_head(table):
+    """Return the ``repr`` of the HTML rendering of the table's first rows.
+
+    Only the rows that are actually shown are converted to pandas, through the same
+    ``__get_data_slice`` helper that serves paged requests. Converting the whole dataset
+    first is wasted work for large datasets, and for an empty dataset it fails because
+    ``pd.concat`` receives no frames to concatenate.
+
+    :param table: dataset supporting ``len()``, ``select()`` and ``to_pandas()``.
+    :return: ``repr`` of the HTML string produced by ``DataFrame.to_html``.
+    """
+    head_rows = 5  # default row count of DataFrame.head()
+    head_frame = __get_data_slice(table, 0, min(head_rows, len(table)))
+    return repr(head_frame.to_html(notebook=True, max_cols=None))
+
+
+# noinspection PyUnresolvedReferences
+def get_column_types(table):
+    """Return the index dtype followed by every column dtype as one separator-joined string.
+
+    The table is first materialised as a single DataFrame by concatenating the frames
+    produced by ``__convert_to_df``. The separator always follows the index dtype, even
+    when the frame has no columns.
+    """
+    frame = pd.concat(list(__convert_to_df(table)), ignore_index=True)
+    column_dtypes = TABLE_TYPE_NEXT_VALUE_SEPARATOR.join(str(dtype) for dtype in frame.dtypes)
+    return str(frame.index.dtype) + TABLE_TYPE_NEXT_VALUE_SEPARATOR + column_dtypes
+
+
+# used by pydevd
+# noinspection PyUnresolvedReferences
+def get_data(table, start_index=None, end_index=None):
+    """Return the ``repr`` of the HTML rendering of the table or of a row range.
+
+    Slicing and the temporary pandas display options are handled by
+    ``_compute_sliced_data``; this function only supplies the HTML renderer.
+    """
+    def render_html(frame, column_limit):
+        html = frame.to_html(notebook=True, max_cols=column_limit)
+        return repr(html)
+
+    return _compute_sliced_data(table, render_html, start_index, end_index)
+
+
+# used by DSTableCommands
+# noinspection PyUnresolvedReferences
+def display_data(table, start_index, end_index):
+    """Show the table, or the requested row range, through ``IPython.display.display``.
+
+    Nothing is returned; slicing and the temporary pandas display options are handled
+    by ``_compute_sliced_data``.
+    """
+    def show_in_ipython(frame, column_limit):
+        # IPython is imported only when the callback actually runs.
+        import IPython.display
+        IPython.display.display(frame)
+
+    _compute_sliced_data(table, show_in_ipython, start_index, end_index)
+
+
+def __get_data_slice(table, start, end):
+    """Convert rows ``start`` (inclusive) to ``end`` (exclusive) of the table to pandas."""
+    row_indices = range(start, end)
+    selected_rows = table.select(row_indices)
+    return selected_rows.to_pandas()
+
+
+def _compute_sliced_data(table, fun, start_index=None, end_index=None):
+    """Apply ``fun`` to the table converted to pandas, under temporary display options.
+
+    :param table: dataset to convert.
+    :param fun: callable invoked as ``fun(frame, max_cols)``; its result is returned.
+    :param start_index: first row of the slice; a slice is taken only when both
+        ``start_index`` and ``end_index`` are given, otherwise the whole table is converted.
+    :param end_index: end of the slice (exclusive, as in ``range``).
+    :return: whatever ``fun`` returns.
+
+    ``display.max_columns`` and ``display.max_colwidth`` are overridden for the duration
+    of the call and restored in a ``finally`` block, so an exception raised by the
+    conversion or by ``fun`` does not leave the user's pandas options modified.
+    """
+    max_cols, max_colwidth = __get_tables_display_options()
+
+    _jb_max_cols = pd.get_option('display.max_columns')
+    _jb_max_colwidth = pd.get_option('display.max_colwidth')
+
+    try:
+        pd.set_option('display.max_columns', max_cols)
+        pd.set_option('display.max_colwidth', max_colwidth)
+
+        if start_index is not None and end_index is not None:
+            table = __get_data_slice(table, start_index, end_index)
+        else:
+            table = pd.concat(list(__convert_to_df(table)), ignore_index=True)
+
+        return fun(table, max_cols)
+    finally:
+        # Always put the user's own settings back, including on failure.
+        pd.set_option('display.max_columns', _jb_max_cols)
+        pd.set_option('display.max_colwidth', _jb_max_colwidth)
+
+
+# In old versions of pandas max_colwidth accepted only Int-s
+def __get_tables_display_options():
+    """Return the ``(max_columns, max_colwidth)`` pair to use while rendering tables.
+
+    ``max_columns`` is always ``None``. ``max_colwidth`` is ``MAX_COLWIDTH_PYTHON_2`` on
+    Python 2 and ``None`` otherwise.
+    """
+    import sys
+    running_python_2 = sys.version_info < (3, 0)
+    max_colwidth = MAX_COLWIDTH_PYTHON_2 if running_python_2 else None
+    return None, max_colwidth
+
+
+# noinspection PyUnresolvedReferences
+def __convert_to_df(table):
+    """Return the result of ``__dataset_to_df`` for a ``datasets.arrow_dataset.Dataset``.
+
+    Any other object is returned unchanged, and so is every object when ``datasets``
+    cannot be imported. The type check is exact (``type(table) is ...``), so subclasses
+    are returned unchanged as well.
+    """
+    converted = table
+    try:
+        import datasets
+        if type(table) is datasets.arrow_dataset.Dataset:
+            converted = __dataset_to_df(table)
+    except ImportError:
+        # Fall back to the table itself.
+        pass
+    return converted
+
+
+def __dataset_to_df(dataset):
+    """Return an iterable of pandas DataFrames that together cover the dataset.
+
+    A non-empty dataset is converted in batches of at most ``BATCH_SIZE`` rows. For an
+    empty dataset the batch size would be 0 and there are no rows to batch, which leaves
+    the callers' ``pd.concat()`` with nothing to concatenate; a single frame from a plain
+    ``to_pandas()`` call is returned instead.
+
+    ``ImportError`` is no longer swallowed here: returning ``None`` only postponed the
+    failure to the callers, which pass the result straight to ``list()``.
+
+    :param dataset: object supporting ``len()`` and ``to_pandas(batched=..., batch_size=...)``.
+    :return: iterable of DataFrames (a one-element list for an empty dataset).
+    """
+    row_count = len(dataset)
+    if row_count == 0:
+        return [dataset.to_pandas()]
+    return dataset.to_pandas(batched=True, batch_size=min(row_count, BATCH_SIZE))
--- a/python/pydevSrc/src/com/jetbrains/python/debugger/PyDebugValue.java
+++ b/python/pydevSrc/src/com/jetbrains/python/debugger/PyDebugValue.java
@@ -42,7 +42,8 @@ public class PyDebugValue extends XNamedValue {
    DATA_FRAME, DATA_FRAME,
    SERIES, SERIES,
    "GeoDataFrame", DATA_FRAME,
-    "GeoSeries", SERIES
+    "GeoSeries", SERIES,
+    "Dataset", DATA_FRAME
  );
  private static final int MAX_ITEMS_TO_HANDLE = 100;
  public static final int MAX_VALUE = 256;
--- a/python/src/com/jetbrains/python/debugger/containerview/DataViewStrategy.java
+++ b/python/src/com/jetbrains/python/debugger/containerview/DataViewStrategy.java
@@ -25,6 +25,7 @@ public abstract class DataViewStrategy {
      ArrayViewStrategy.createInstanceForTensor(),
      DataFrameViewStrategy.createInstanceForDataFrame(),
      DataFrameViewStrategy.createInstanceForGeoDataFrame(),
+      DataFrameViewStrategy.createInstanceForDataset(),
      SeriesViewStrategy.createInstanceForSeries(),
      SeriesViewStrategy.createInstanceForGeoSeries()
    );
--- a/python/src/com/jetbrains/python/debugger/containerview/PyViewNumericContainerAction.java
+++ b/python/src/com/jetbrains/python/debugger/containerview/PyViewNumericContainerAction.java
@@ -72,7 +72,7 @@ public class PyViewNumericContainerAction extends XDebuggerTreeActionBase {
      e.getPresentation().setText(PyBundle.message("debugger.numeric.view.as.array"));
      e.getPresentation().setVisible(true);
    }
-    else if ("DataFrame".equals(nodeType) || "GeoDataFrame".equals(nodeType)) {
+    else if ("DataFrame".equals(nodeType) || "GeoDataFrame".equals(nodeType) || "Dataset".equals(nodeType)) {
      e.getPresentation().setText(PyBundle.message("debugger.numeric.view.as.dataframe"));
      e.getPresentation().setVisible(true);
    }
--- a/python/src/com/jetbrains/python/debugger/dataframe/DataFrameViewStrategy.java
+++ b/python/src/com/jetbrains/python/debugger/dataframe/DataFrameViewStrategy.java
@@ -24,6 +24,10 @@ public class DataFrameViewStrategy extends DataViewStrategy {
    return new DataFrameViewStrategy("GeoDataFrame");
  }

+  /**
+   * Creates a strategy whose type name is {@code "Dataset"}.
+   *
+   * @return a new {@link DataFrameViewStrategy} constructed with the {@code "Dataset"} type name
+   */
+  public static @NotNull DataFrameViewStrategy createInstanceForDataset() {
+    return new DataFrameViewStrategy("Dataset");
+  }
+
  protected DataFrameViewStrategy(final @NotNull String typeName) {
    this.myTypeName = typeName;
  }