From c347819edc99288352ea49ef7501db99f7b9c0d8 Mon Sep 17 00:00:00 2001 From: "Natalia.Murycheva" Date: Sat, 19 Oct 2024 01:23:56 +0200 Subject: [PATCH] [PyCharm Tables] PY-76723 Fixed Multiple problems in CSV serialization #PY-76723 Fixed * see the YT issue for more details (cherry picked from commit 676a021e221c430c6fd3b600640d7aec1503239e) (cherry picked from commit 5c8a478a2cc3edb148b684b48dfaf85f9d50627f) IJ-CR-147319 GitOrigin-RevId: 3d5caa7d348861bc904335db3ff0b1edef2a17b1 --- .../_pydevd_bundle/tables/pydevd_dataset.py | 2 +- .../_pydevd_bundle/tables/pydevd_numpy.py | 58 +++++++++++++++---- .../tables/pydevd_numpy_based.py | 58 ++++++++++++++----- .../_pydevd_bundle/tables/pydevd_pandas.py | 9 ++- .../_pydevd_bundle/tables/pydevd_polars.py | 12 ++-- 5 files changed, 102 insertions(+), 37 deletions(-) diff --git a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py index c8da839fff96..192bc2f8adea 100644 --- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py +++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_dataset.py @@ -37,7 +37,7 @@ def get_data(table, use_csv_serialization, start_index=None, end_index=None, for # type: (datasets.arrow_dataset.Dataset, int, int) -> str def convert_data_to_csv(data): - return repr(data.to_csv(na_rep = "NaN")) + return repr(data.to_csv(na_rep = "NaN", float_format=format)) def convert_data_to_html(data): return repr(data.to_html(notebook=True)) diff --git a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy.py b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy.py index 25ff8933eb18..f396b9c663dc 100644 --- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy.py +++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy.py @@ -53,7 +53,7 @@ def get_data(arr, use_csv_serialization, start_index=None, end_index=None, forma return repr(_create_table(data, start_index, end_index, format).to_html(notebook=True)) def convert_data_to_csv(data): - return repr(_create_table(data, start_index, end_index, format).to_csv()) + return repr(_create_table(data, start_index, end_index, format).to_csv(na_rep = "None", float_format=format)) if use_csv_serialization: computed_data = _compute_data(arr, convert_data_to_csv, format) @@ -74,7 +74,7 @@ def display_data_html(arr, start_index=None, end_index=None): def display_data_csv(arr, start_index=None, end_index=None): # type: (np.ndarray, int, int) -> None def ipython_display(data): - print(_create_table(data, start_index, end_index).to_csv()) + print(_create_table(data, start_index, end_index).to_csv(na_rep = "None")) _compute_data(arr, ipython_display) @@ -151,7 +151,7 @@ class _NpTable: html.append('\n') html.append('{}\n'.format(int(self.indexes[row_num]))) if self.type == ONE_DIM: - if self.format is not None: + if self.format is not None and self.array[row_num] is not None: value = self.format % self.array[row_num] else: value = self.array[row_num] @@ -160,7 +160,7 @@ class _NpTable: cols = len(self.array[0]) max_cols = cols if max_cols is None else min(max_cols, cols) for col_num in range(max_cols): - if self.format is not None: + if self.format is not None and self.array[row_num][col_num] is not None: value = self.format % self.array[row_num][col_num] else: value = self.array[row_num][col_num] @@ -170,11 +170,36 @@ class _NpTable: return html - def to_csv(self): + def to_csv(self, na_rep = "None", float_format=None): csv_stream = io.StringIO() - np.savetxt(csv_stream, self.array, delimiter=',') + np_array_without_nones = np.where(self.array == None, np.nan, self.array) + if float_format is None or float_format == 'null': + float_format = "%s" + + np.savetxt(csv_stream, np_array_without_nones, delimiter=',', fmt=float_format) csv_string = csv_stream.getvalue() - return csv_string + csv_rows_with_index = self._insert_index_at_rows_begging_csv(csv_string) + + col_names = self._collect_col_names_csv() + return col_names + "\n" + csv_rows_with_index + + def _insert_index_at_rows_begging_csv(self, csv_string): + # type: (str) -> str + csv_rows = csv_string.split('\n') + csv_rows_with_index = [] + for row_index in range(self.array.shape[0]): + csv_rows_with_index.append(str(row_index) + "," + csv_rows[row_index]) + return "\n".join(csv_rows_with_index) + + def _collect_col_names_csv(self): + if self.type == ONE_DIM: + return ",0" + + if self.type == WITH_TYPES: + return "," + ",".join(['{}'.format(name) for name in self.array.dtype.names]) + + # TWO_DIM + return "," + ",".join(['{}'.format(i) for i in range(self.array.shape[1])]) def slice(self, start_index=None, end_index=None): @@ -259,10 +284,19 @@ def _create_table(command, start_index=None, end_index=None, format=None): np_array = command if is_pd: - sorting_arr = _sort_df(pd.DataFrame(np_array), sort_keys) + sorted_df = _sort_df(pd.DataFrame(np_array), sort_keys) if start_index is not None and end_index is not None: - return sorting_arr.iloc[start_index:end_index] - return sorting_arr + sorted_df_slice = sorted_df.iloc[start_index:end_index] + # to apply "format" we should not have None inside DFs + try: + import warnings + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + sorted_df_slice = sorted_df_slice.fillna("None") + except Exception as _: + pass + return sorted_df_slice + return sorted_df return _NpTable(np_array, format=format).sort(sort_keys).slice(start_index, end_index) @@ -295,8 +329,8 @@ def __get_tables_display_options(): try: import pandas as pd if int(pd.__version__.split('.')[0]) < 1: - return None, MAX_COLWIDTH_PYTHON_2, None - except ImportError: + return None, MAX_COLWIDTH, None + except Exception: pass return None, None, None diff --git a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy_based.py b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy_based.py index 92c107355017..d381d450a8c5 100644 --- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy_based.py +++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_numpy_based.py @@ -45,12 +45,12 @@ def get_column_types(arr): def get_data(arr, use_csv_serialization, start_index=None, end_index=None, format=None): - # type: (Union[np.ndarray, dict], int, int) -> str + # type: (Union[np.ndarray, dict], bool, Union[int, None], Union[int, None], Union[str, None]) -> str def convert_data_to_html(data): return repr(_create_table(data, start_index, end_index, format).to_html(notebook=True)) def convert_data_to_csv(data): - return repr(_create_table(data, start_index, end_index, format).to_csv()) + return repr(_create_table(data, start_index, end_index, format).to_csv(na_rep = "None", float_format=format)) if use_csv_serialization: computed_data = _compute_data(arr, convert_data_to_csv, format) @@ -71,7 +71,7 @@ def display_data_html(arr, start_index=None, end_index=None): def display_data_csv(arr, start_index=None, end_index=None): # type: (np.ndarray, int, int) -> None def ipython_display(data): - print(_create_table(data, start_index, end_index).to_csv()) + print(_create_table(data, start_index, end_index).to_csv(na_rep = "None")) _compute_data(arr, ipython_display) @@ -84,13 +84,17 @@ class _NpTable: self.format = format def get_array_type(self): - if self.array.ndim > 1: + if len(self.array.shape) > 1: return TWO_DIM return ONE_DIM def get_cols_types(self): - col_type = self.array.dtype + dtype = self.array.dtype + if "torch" in str(dtype): + col_type = dtype + else: + col_type = dtype.name if self.type == ONE_DIM: # [1, 2, 3] -> [int] @@ -123,12 +127,6 @@ class _NpTable: return "".join(html) - def to_csv(self): - csv_stream = io.StringIO() - np.savetxt(csv_stream, self.array, delimiter=',') - csv_string = csv_stream.getvalue() - return csv_string - def _collect_cols_names_html(self): if self.type == ONE_DIM: return ['0\n'] @@ -142,7 +140,8 @@ class _NpTable: html.append('\n') html.append('{}\n'.format(int(self.indexes[row_num]))) if self.type == ONE_DIM: - if self.format is not None: + # None usually is not supported in tensors, but to be totally sure + if self.format is not None and self.array[row_num] is not None: value = self.format % self.array[row_num] else: value = self.array[row_num] @@ -151,7 +150,7 @@ class _NpTable: cols = len(self.array[0]) max_cols = cols if max_cols is None else min(max_cols, cols) for col_num in range(max_cols): - if self.format is not None: + if self.format is not None and self.array[row_num][col_num]: value = self.format % self.array[row_num][col_num] else: value = self.array[row_num][col_num] @@ -160,6 +159,33 @@ class _NpTable: html.append('\n') return html + def to_csv(self, na_rep = "None", float_format=None): + csv_stream = io.StringIO() + if float_format is None or float_format == 'null': + float_format = "%s" + + np.savetxt(csv_stream, self.array, delimiter=',', fmt=float_format) + csv_string = csv_stream.getvalue() + csv_rows_with_index = self._insert_index_at_rows_begging_csv(csv_string) + + col_names = self._collect_col_names_csv() + return col_names + "\n" + csv_rows_with_index + + def _insert_index_at_rows_begging_csv(self, csv_string): + # type: (str) -> str + csv_rows = csv_string.split('\n') + csv_rows_with_index = [] + for row_index in range(self.array.shape[0]): + csv_rows_with_index.append(str(row_index) + "," + csv_rows[row_index]) + return "\n".join(csv_rows_with_index) + + def _collect_col_names_csv(self): + if self.type == ONE_DIM: + return ",0" + + # TWO_DIM + return "," + ",".join(['{}'.format(i) for i in range(self.array.shape[1])]) + def slice(self, start_index=None, end_index=None): if end_index is not None and start_index is not None: self.array = self.array[start_index:end_index] @@ -271,6 +297,12 @@ def __get_tables_display_options(): import sys if sys.version_info < (3, 0): return None, MAX_COLWIDTH, None + try: + import pandas as pd + if int(pd.__version__.split('.')[0]) < 1: + return None, MAX_COLWIDTH, None + except Exception: + pass return None, None, None diff --git a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py index 755ecb9bb17b..03f532935ebc 100644 --- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py +++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_pandas.py @@ -4,7 +4,7 @@ import pandas as pd import typing TABLE_TYPE_NEXT_VALUE_SEPARATOR = '__pydev_table_column_type_val__' -MAX_COLWIDTH_PYTHON_2 = 100000 +MAX_COLWIDTH = 100000 def get_type(table): @@ -38,7 +38,7 @@ def get_data(table, use_csv_serialization, start_index=None, end_index=None, for # type: (Union[pd.DataFrame, pd.Series], int, int) -> str def convert_data_to_csv(data): - return repr(__convert_to_df(data).to_csv(na_rep = "NaN")) + return repr(__convert_to_df(data).to_csv(na_rep = "NaN", float_format=format)) def convert_data_to_html(data): return repr(__convert_to_df(data).to_html(notebook=True)) @@ -295,11 +295,10 @@ def __get_tables_display_options(): # type: () -> Tuple[None, Union[int, None], None] import sys if sys.version_info < (3, 0): - return None, MAX_COLWIDTH_PYTHON_2, None + return None, MAX_COLWIDTH, None try: - import pandas as pd if int(pd.__version__.split('.')[0]) < 1: - return None, MAX_COLWIDTH_PYTHON_2, None + return None, MAX_COLWIDTH, None except ImportError: pass return None, None, None diff --git a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_polars.py b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_polars.py index 9d37d67c54ab..01fa7c4671c4 100644 --- a/python/helpers/pydev/_pydevd_bundle/tables/pydevd_polars.py +++ b/python/helpers/pydev/_pydevd_bundle/tables/pydevd_polars.py @@ -37,7 +37,8 @@ def get_data(table, use_csv_serialization, start_index=None, end_index=None, for # type: (pl.DataFrame, int, int) -> str with __create_config(format): if use_csv_serialization: - return __get_df_slice(table, start_index, end_index).write_csv(null_value = "null") + float_precision = _get_float_precision(format) + return __get_df_slice(table, start_index, end_index).write_csv(null_value = "null", float_precision=float_precision) return table[start_index:end_index]._repr_html_() @@ -66,10 +67,9 @@ def __create_config(format=None): cfg.set_tbl_cols(-1) # Unlimited cfg.set_tbl_rows(-1) # Unlimited cfg.set_fmt_str_lengths(MAX_COLWIDTH) # No option to set unlimited, so it's 100_000 - if format is not None: - float_precision = _get_float_precision(format) - if float_precision is not None: - cfg.set_float_precision(float_precision) + float_precision = _get_float_precision(format) + if float_precision is not None: + cfg.set_float_precision(float_precision) return cfg @@ -219,7 +219,7 @@ def __get_describe(table): def _get_float_precision(format): - # type: (str) -> Union[int, None] + # type: (Union[str, None]) -> Union[int, None] if isinstance(format, str): if format.startswith("%") and format.endswith("f"): start = format.find('%.') + 2