diff --git a/notebooks/visualization/src/org/jetbrains/plugins/notebooks/visualization/outputs/statistic/NotebookOutputKeyType.kt b/notebooks/visualization/src/org/jetbrains/plugins/notebooks/visualization/outputs/statistic/NotebookOutputKeyType.kt index 063eacf815f4..91ce7284eaa7 100644 --- a/notebooks/visualization/src/org/jetbrains/plugins/notebooks/visualization/outputs/statistic/NotebookOutputKeyType.kt +++ b/notebooks/visualization/src/org/jetbrains/plugins/notebooks/visualization/outputs/statistic/NotebookOutputKeyType.kt @@ -21,6 +21,7 @@ enum class NotebookOutputKeyType { POLARS_SERIES, PYSPARK_TABLE, R_MARKDOWN, + HF_DATASET, SVG, SWING_COMPONENT, TEST, diff --git a/python/helpers/pydev/_pydevd_bundle/pydevd_tables.py b/python/helpers/pydev/_pydevd_bundle/pydevd_tables.py index 5089bda7bc23..a38f1dbcebd4 100644 --- a/python/helpers/pydev/_pydevd_bundle/pydevd_tables.py +++ b/python/helpers/pydev/_pydevd_bundle/pydevd_tables.py @@ -84,5 +84,7 @@ def __get_table_provider(output): type_qualified_name.endswith('DataFrame') or type_qualified_name.endswith('Series')): import _pydevd_bundle.tables.pydevd_polars as table_provider + elif type_qualified_name == 'datasets.arrow_dataset.Dataset': + import _pydevd_bundle.tables.pydevd_dataset as table_provider return table_provider diff --git a/python/helpers/pydev/_pydevd_bundle/pydevd_thrift.py b/python/helpers/pydev/_pydevd_bundle/pydevd_thrift.py index e7290c74f63e..f03142166496 100644 --- a/python/helpers/pydev/_pydevd_bundle/pydevd_thrift.py +++ b/python/helpers/pydev/_pydevd_bundle/pydevd_thrift.py @@ -420,6 +420,10 @@ def sparse_tensor_to_thrift_struct(tensor, name, roffset, coffset, rows, cols, f pass +def dataset_to_thrift_struct(dataset, name, roffset, coffset, rows, cols, format): + return dataframe_to_thrift_struct(dataset.to_pandas(), name, roffset, coffset, rows, cols, format) + + def array_to_meta_thrift_struct(array, name, format): type = array.dtype.kind slice = name @@ -621,6 +625,7 @@ TYPE_TO_THRIFT_STRUCT_CONVERTERS = { "Tensor": tensor_to_thrift_struct, "DataFrame": dataframe_to_thrift_struct, "Series": dataframe_to_thrift_struct, + "Dataset": dataset_to_thrift_struct, "GeoDataFrame": dataframe_to_thrift_struct, "GeoSeries": dataframe_to_thrift_struct } diff --git a/python/helpers/pydev/_pydevd_bundle/pydevd_vars.py b/python/helpers/pydev/_pydevd_bundle/pydevd_vars.py index dbe8d3dd50cd..31b8d07550b0 100644 --- a/python/helpers/pydev/_pydevd_bundle/pydevd_vars.py +++ b/python/helpers/pydev/_pydevd_bundle/pydevd_vars.py @@ -774,6 +774,9 @@ def dataframe_to_xml(df, name, roffset, coffset, rows, cols, format): xml += array_data_to_xml(rows, cols, formatted_row_elements, format) return xml +def dataset_to_xml(dataset, name, roffset, coffset, rows, cols, format): + return dataframe_to_xml(dataset.to_pandas(), name, roffset, coffset, rows, cols, format) + def array_data_to_xml(rows, cols, get_row, format): xml = "\n" % (rows, cols) @@ -820,7 +823,8 @@ TYPE_TO_XML_CONVERTERS = { "EagerTensor": tensor_to_xml, "ResourceVariable": tensor_to_xml, "SparseTensor": sparse_tensor_to_xml, - "Tensor": tensor_to_xml + "Tensor": tensor_to_xml, + "Dataset": dataset_to_xml } diff --git a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py new file mode 100644 index 000000000000..961078b82b3e --- /dev/null +++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py @@ -0,0 +1,102 @@ +# Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license. +import numpy as np +import pandas as pd +import typing + +TABLE_TYPE_NEXT_VALUE_SEPARATOR = '__pydev_table_column_type_val__' +MAX_COLWIDTH_PYTHON_2 = 100000 +BATCH_SIZE = 10000 + + +def get_type(table): + return str(type(table)) + + +# noinspection PyUnresolvedReferences +def get_shape(table): + return str(table.shape[0]) + + +# noinspection PyUnresolvedReferences +def get_head(table): + table = pd.concat(list(__convert_to_df(table)), ignore_index=True) + return repr(table.head().to_html(notebook=True, max_cols=None)) + + +# noinspection PyUnresolvedReferences +def get_column_types(table): + table = pd.concat(list(__convert_to_df(table)), ignore_index=True) + return str(table.index.dtype) + TABLE_TYPE_NEXT_VALUE_SEPARATOR + \ + TABLE_TYPE_NEXT_VALUE_SEPARATOR.join([str(t) for t in table.dtypes]) + + +# used by pydevd +# noinspection PyUnresolvedReferences +def get_data(table, start_index=None, end_index=None): + + def convert_data_to_html(data, max_cols): + return repr(data.to_html(notebook=True, max_cols=max_cols)) + + return _compute_sliced_data(table, convert_data_to_html, start_index, end_index) + + +# used by DSTableCommands +# noinspection PyUnresolvedReferences +def display_data(table, start_index, end_index): + def ipython_display(data, max_cols): + from IPython.display import display + display(data) + + _compute_sliced_data(table, ipython_display, start_index, end_index) + + +def __get_data_slice(table, start, end): + return table.select(range(start, end)).to_pandas() + + +def _compute_sliced_data(table, fun, start_index=None, end_index=None): + max_cols, max_colwidth = __get_tables_display_options() + + _jb_max_cols = pd.get_option('display.max_columns') + _jb_max_colwidth = pd.get_option('display.max_colwidth') + + pd.set_option('display.max_columns', max_cols) + pd.set_option('display.max_colwidth', max_colwidth) + + if start_index is not None and end_index is not None: + table = __get_data_slice(table, start_index, end_index) + else: + table = pd.concat(list(__convert_to_df(table)), ignore_index=True) + + data = fun(table, max_cols) + + pd.set_option('display.max_columns', _jb_max_cols) + pd.set_option('display.max_colwidth', _jb_max_colwidth) + + return data + + +# In old versions of pandas max_colwidth accepted only Int-s +def __get_tables_display_options(): + import sys + if sys.version_info < (3, 0): + return None, MAX_COLWIDTH_PYTHON_2 + return None, None + + +# noinspection PyUnresolvedReferences +def __convert_to_df(table): + try: + import datasets + if type(table) is datasets.arrow_dataset.Dataset: + return __dataset_to_df(table) + except ImportError as e: + pass + return table + + +def __dataset_to_df(dataset): + try: + return dataset.to_pandas(batched=True, batch_size=min(len(dataset), BATCH_SIZE)) + except ImportError as e: + pass \ No newline at end of file diff --git a/python/pydevSrc/src/com/jetbrains/python/debugger/PyDebugValue.java b/python/pydevSrc/src/com/jetbrains/python/debugger/PyDebugValue.java index 0e09a888beed..34919edcc559 100644 --- a/python/pydevSrc/src/com/jetbrains/python/debugger/PyDebugValue.java +++ b/python/pydevSrc/src/com/jetbrains/python/debugger/PyDebugValue.java @@ -42,7 +42,8 @@ public class PyDebugValue extends XNamedValue { DATA_FRAME, DATA_FRAME, SERIES, SERIES, "GeoDataFrame", DATA_FRAME, - "GeoSeries", SERIES + "GeoSeries", SERIES, + "Dataset", DATA_FRAME ); private static final int MAX_ITEMS_TO_HANDLE = 100; public static final int MAX_VALUE = 256; diff --git a/python/src/com/jetbrains/python/debugger/containerview/DataViewStrategy.java b/python/src/com/jetbrains/python/debugger/containerview/DataViewStrategy.java index 15783d447428..10173be6e835 100644 --- a/python/src/com/jetbrains/python/debugger/containerview/DataViewStrategy.java +++ b/python/src/com/jetbrains/python/debugger/containerview/DataViewStrategy.java @@ -25,6 +25,7 @@ public abstract class DataViewStrategy { ArrayViewStrategy.createInstanceForTensor(), DataFrameViewStrategy.createInstanceForDataFrame(), DataFrameViewStrategy.createInstanceForGeoDataFrame(), + DataFrameViewStrategy.createInstanceForDataset(), SeriesViewStrategy.createInstanceForSeries(), SeriesViewStrategy.createInstanceForGeoSeries() ); diff --git a/python/src/com/jetbrains/python/debugger/containerview/PyViewNumericContainerAction.java b/python/src/com/jetbrains/python/debugger/containerview/PyViewNumericContainerAction.java index d7af0bf8ef26..8f78f5ddd2b9 100644 --- a/python/src/com/jetbrains/python/debugger/containerview/PyViewNumericContainerAction.java +++ b/python/src/com/jetbrains/python/debugger/containerview/PyViewNumericContainerAction.java @@ -72,7 +72,7 @@ public class PyViewNumericContainerAction extends XDebuggerTreeActionBase { e.getPresentation().setText(PyBundle.message("debugger.numeric.view.as.array")); e.getPresentation().setVisible(true); } - else if ("DataFrame".equals(nodeType) || "GeoDataFrame".equals(nodeType)) { + else if ("DataFrame".equals(nodeType) || "GeoDataFrame".equals(nodeType) || "Dataset".equals(nodeType)) { e.getPresentation().setText(PyBundle.message("debugger.numeric.view.as.dataframe")); e.getPresentation().setVisible(true); } diff --git a/python/src/com/jetbrains/python/debugger/dataframe/DataFrameViewStrategy.java b/python/src/com/jetbrains/python/debugger/dataframe/DataFrameViewStrategy.java index 17ea888e07c6..7c36ae4e120b 100644 --- a/python/src/com/jetbrains/python/debugger/dataframe/DataFrameViewStrategy.java +++ b/python/src/com/jetbrains/python/debugger/dataframe/DataFrameViewStrategy.java @@ -24,6 +24,10 @@ public class DataFrameViewStrategy extends DataViewStrategy { return new DataFrameViewStrategy("GeoDataFrame"); } + public static @NotNull DataFrameViewStrategy createInstanceForDataset() { + return new DataFrameViewStrategy("Dataset"); + } + protected DataFrameViewStrategy(final @NotNull String typeName) { this.myTypeName = typeName; }