[PyCharm Tables] PY-76723 Fixed Multiple problems in CSV serialization #PY-76723 Fixed

* See the YouTrack issue (PY-76723) for more details

(cherry picked from commit 676a021e221c430c6fd3b600640d7aec1503239e)


(cherry picked from commit 5c8a478a2cc3edb148b684b48dfaf85f9d50627f)

IJ-CR-147319

GitOrigin-RevId: 3d5caa7d348861bc904335db3ff0b1edef2a17b1
This commit is contained in:
Natalia.Murycheva
2024-10-19 01:23:56 +02:00
committed by intellij-monorepo-bot
parent d3c97e24b0
commit c347819edc
5 changed files with 102 additions and 37 deletions

View File

@@ -37,7 +37,7 @@ def get_data(table, use_csv_serialization, start_index=None, end_index=None, for
# type: (datasets.arrow_dataset.Dataset, int, int) -> str
def convert_data_to_csv(data):
return repr(data.to_csv(na_rep = "NaN"))
return repr(data.to_csv(na_rep = "NaN", float_format=format))
def convert_data_to_html(data):
return repr(data.to_html(notebook=True))

View File

@@ -53,7 +53,7 @@ def get_data(arr, use_csv_serialization, start_index=None, end_index=None, forma
return repr(_create_table(data, start_index, end_index, format).to_html(notebook=True))
def convert_data_to_csv(data):
return repr(_create_table(data, start_index, end_index, format).to_csv())
return repr(_create_table(data, start_index, end_index, format).to_csv(na_rep = "None", float_format=format))
if use_csv_serialization:
computed_data = _compute_data(arr, convert_data_to_csv, format)
@@ -74,7 +74,7 @@ def display_data_html(arr, start_index=None, end_index=None):
def display_data_csv(arr, start_index=None, end_index=None):
# type: (np.ndarray, int, int) -> None
def ipython_display(data):
print(_create_table(data, start_index, end_index).to_csv())
print(_create_table(data, start_index, end_index).to_csv(na_rep = "None"))
_compute_data(arr, ipython_display)
@@ -151,7 +151,7 @@ class _NpTable:
html.append('<tr>\n')
html.append('<th>{}</th>\n'.format(int(self.indexes[row_num])))
if self.type == ONE_DIM:
if self.format is not None:
if self.format is not None and self.array[row_num] is not None:
value = self.format % self.array[row_num]
else:
value = self.array[row_num]
@@ -160,7 +160,7 @@ class _NpTable:
cols = len(self.array[0])
max_cols = cols if max_cols is None else min(max_cols, cols)
for col_num in range(max_cols):
if self.format is not None:
if self.format is not None and self.array[row_num][col_num] is not None:
value = self.format % self.array[row_num][col_num]
else:
value = self.array[row_num][col_num]
@@ -170,11 +170,36 @@ class _NpTable:
return html
def to_csv(self):
def to_csv(self, na_rep = "None", float_format=None):
csv_stream = io.StringIO()
np.savetxt(csv_stream, self.array, delimiter=',')
np_array_without_nones = np.where(self.array == None, np.nan, self.array)
if float_format is None or float_format == 'null':
float_format = "%s"
np.savetxt(csv_stream, np_array_without_nones, delimiter=',', fmt=float_format)
csv_string = csv_stream.getvalue()
return csv_string
csv_rows_with_index = self._insert_index_at_rows_begging_csv(csv_string)
col_names = self._collect_col_names_csv()
return col_names + "\n" + csv_rows_with_index
def _insert_index_at_rows_begging_csv(self, csv_string):
# type: (str) -> str
csv_rows = csv_string.split('\n')
csv_rows_with_index = []
for row_index in range(self.array.shape[0]):
csv_rows_with_index.append(str(row_index) + "," + csv_rows[row_index])
return "\n".join(csv_rows_with_index)
def _collect_col_names_csv(self):
    """Build the CSV header line.

    The line starts with an empty field, followed by the column names:
    a single ``0`` for a one-dimensional array, the dtype field names for
    an array of kind ``WITH_TYPES``, and the positional column numbers
    otherwise.
    """
    if self.type == ONE_DIM:
        names = ["0"]
    elif self.type == WITH_TYPES:
        names = [str(field_name) for field_name in self.array.dtype.names]
    else:
        # TWO_DIM
        names = [str(col_num) for col_num in range(self.array.shape[1])]
    return "," + ",".join(names)
def slice(self, start_index=None, end_index=None):
@@ -259,10 +284,19 @@ def _create_table(command, start_index=None, end_index=None, format=None):
np_array = command
if is_pd:
sorting_arr = _sort_df(pd.DataFrame(np_array), sort_keys)
sorted_df = _sort_df(pd.DataFrame(np_array), sort_keys)
if start_index is not None and end_index is not None:
return sorting_arr.iloc[start_index:end_index]
return sorting_arr
sorted_df_slice = sorted_df.iloc[start_index:end_index]
# to apply "format" we should not have None inside DFs
try:
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore")
sorted_df_slice = sorted_df_slice.fillna("None")
except Exception as _:
pass
return sorted_df_slice
return sorted_df
return _NpTable(np_array, format=format).sort(sort_keys).slice(start_index, end_index)
@@ -295,8 +329,8 @@ def __get_tables_display_options():
try:
import pandas as pd
if int(pd.__version__.split('.')[0]) < 1:
return None, MAX_COLWIDTH_PYTHON_2, None
except ImportError:
return None, MAX_COLWIDTH, None
except Exception:
pass
return None, None, None

View File

@@ -45,12 +45,12 @@ def get_column_types(arr):
def get_data(arr, use_csv_serialization, start_index=None, end_index=None, format=None):
# type: (Union[np.ndarray, dict], int, int) -> str
# type: (Union[np.ndarray, dict], bool, Union[int, None], Union[int, None], Union[str, None]) -> str
def convert_data_to_html(data):
return repr(_create_table(data, start_index, end_index, format).to_html(notebook=True))
def convert_data_to_csv(data):
return repr(_create_table(data, start_index, end_index, format).to_csv())
return repr(_create_table(data, start_index, end_index, format).to_csv(na_rep = "None", float_format=format))
if use_csv_serialization:
computed_data = _compute_data(arr, convert_data_to_csv, format)
@@ -71,7 +71,7 @@ def display_data_html(arr, start_index=None, end_index=None):
def display_data_csv(arr, start_index=None, end_index=None):
# type: (np.ndarray, int, int) -> None
def ipython_display(data):
print(_create_table(data, start_index, end_index).to_csv())
print(_create_table(data, start_index, end_index).to_csv(na_rep = "None"))
_compute_data(arr, ipython_display)
@@ -84,13 +84,17 @@ class _NpTable:
self.format = format
def get_array_type(self):
if self.array.ndim > 1:
if len(self.array.shape) > 1:
return TWO_DIM
return ONE_DIM
def get_cols_types(self):
col_type = self.array.dtype
dtype = self.array.dtype
if "torch" in str(dtype):
col_type = dtype
else:
col_type = dtype.name
if self.type == ONE_DIM:
# [1, 2, 3] -> [int]
@@ -123,12 +127,6 @@ class _NpTable:
return "".join(html)
def to_csv(self):
csv_stream = io.StringIO()
np.savetxt(csv_stream, self.array, delimiter=',')
csv_string = csv_stream.getvalue()
return csv_string
def _collect_cols_names_html(self):
if self.type == ONE_DIM:
return ['<th>0</th>\n']
@@ -142,7 +140,8 @@ class _NpTable:
html.append('<tr>\n')
html.append('<th>{}</th>\n'.format(int(self.indexes[row_num])))
if self.type == ONE_DIM:
if self.format is not None:
# None usually is not supported in tensors, but to be totally sure
if self.format is not None and self.array[row_num] is not None:
value = self.format % self.array[row_num]
else:
value = self.array[row_num]
@@ -151,7 +150,7 @@ class _NpTable:
cols = len(self.array[0])
max_cols = cols if max_cols is None else min(max_cols, cols)
for col_num in range(max_cols):
if self.format is not None:
if self.format is not None and self.array[row_num][col_num]:
value = self.format % self.array[row_num][col_num]
else:
value = self.array[row_num][col_num]
@@ -160,6 +159,33 @@ class _NpTable:
html.append('</tbody>\n')
return html
def to_csv(self, na_rep = "None", float_format=None):
    """Serialize the table to CSV text: a header line, then index-prefixed rows.

    ``float_format`` is passed to ``np.savetxt`` as ``fmt``; ``None`` and the
    string ``'null'`` both fall back to ``"%s"``. ``na_rep`` is accepted
    but is not used by this implementation.
    """
    fmt = "%s" if float_format is None or float_format == 'null' else float_format

    buffer = io.StringIO()
    np.savetxt(buffer, self.array, delimiter=',', fmt=fmt)

    body = self._insert_index_at_rows_begging_csv(buffer.getvalue())
    header = self._collect_col_names_csv()
    return header + "\n" + body
def _insert_index_at_rows_begging_csv(self, csv_string):
# type: (str) -> str
csv_rows = csv_string.split('\n')
csv_rows_with_index = []
for row_index in range(self.array.shape[0]):
csv_rows_with_index.append(str(row_index) + "," + csv_rows[row_index])
return "\n".join(csv_rows_with_index)
def _collect_col_names_csv(self):
    """Build the CSV header line.

    The line starts with an empty field, followed by the column names:
    a single ``0`` for a one-dimensional array, otherwise the positional
    column numbers.
    """
    if self.type == ONE_DIM:
        names = ["0"]
    else:
        # TWO_DIM
        names = [str(col_num) for col_num in range(self.array.shape[1])]
    return "," + ",".join(names)
def slice(self, start_index=None, end_index=None):
if end_index is not None and start_index is not None:
self.array = self.array[start_index:end_index]
@@ -271,6 +297,12 @@ def __get_tables_display_options():
import sys
if sys.version_info < (3, 0):
return None, MAX_COLWIDTH, None
try:
import pandas as pd
if int(pd.__version__.split('.')[0]) < 1:
return None, MAX_COLWIDTH, None
except Exception:
pass
return None, None, None

View File

@@ -4,7 +4,7 @@ import pandas as pd
import typing
TABLE_TYPE_NEXT_VALUE_SEPARATOR = '__pydev_table_column_type_val__'
MAX_COLWIDTH_PYTHON_2 = 100000
MAX_COLWIDTH = 100000
def get_type(table):
@@ -38,7 +38,7 @@ def get_data(table, use_csv_serialization, start_index=None, end_index=None, for
# type: (Union[pd.DataFrame, pd.Series], int, int) -> str
def convert_data_to_csv(data):
return repr(__convert_to_df(data).to_csv(na_rep = "NaN"))
return repr(__convert_to_df(data).to_csv(na_rep = "NaN", float_format=format))
def convert_data_to_html(data):
return repr(__convert_to_df(data).to_html(notebook=True))
@@ -295,11 +295,10 @@ def __get_tables_display_options():
# type: () -> Tuple[None, Union[int, None], None]
import sys
if sys.version_info < (3, 0):
return None, MAX_COLWIDTH_PYTHON_2, None
return None, MAX_COLWIDTH, None
try:
import pandas as pd
if int(pd.__version__.split('.')[0]) < 1:
return None, MAX_COLWIDTH_PYTHON_2, None
return None, MAX_COLWIDTH, None
except ImportError:
pass
return None, None, None

View File

@@ -37,7 +37,8 @@ def get_data(table, use_csv_serialization, start_index=None, end_index=None, for
# type: (pl.DataFrame, int, int) -> str
with __create_config(format):
if use_csv_serialization:
return __get_df_slice(table, start_index, end_index).write_csv(null_value = "null")
float_precision = _get_float_precision(format)
return __get_df_slice(table, start_index, end_index).write_csv(null_value = "null", float_precision=float_precision)
return table[start_index:end_index]._repr_html_()
@@ -66,10 +67,9 @@ def __create_config(format=None):
cfg.set_tbl_cols(-1) # Unlimited
cfg.set_tbl_rows(-1) # Unlimited
cfg.set_fmt_str_lengths(MAX_COLWIDTH) # No option to set unlimited, so it's 100_000
if format is not None:
float_precision = _get_float_precision(format)
if float_precision is not None:
cfg.set_float_precision(float_precision)
float_precision = _get_float_precision(format)
if float_precision is not None:
cfg.set_float_precision(float_precision)
return cfg
@@ -219,7 +219,7 @@ def __get_describe(table):
def _get_float_precision(format):
# type: (str) -> Union[int, None]
# type: (Union[str, None]) -> Union[int, None]
if isinstance(format, str):
if format.startswith("%") and format.endswith("f"):
start = format.find('%.') + 2