mirror of
https://gitflic.ru/project/openide/openide.git
synced 2025-12-16 14:23:28 +07:00
(cherry picked from commit c1c344c53e2ec1ad91e423f59c4f062c25678f39) IJ-CR-166515 GitOrigin-RevId: 152abd4bbb6df85b9fa3a373f8992898864348e8
723 lines
27 KiB
Python
723 lines
27 KiB
Python
# Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
|
|
"""
|
|
Here we test aux methods for pandas tables handling, namely,
|
|
check functions from _pydevd_bundle.tables.pydevd_pandas module.
|
|
"""
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
import sys
|
|
|
|
try:
|
|
from StringIO import StringIO
|
|
except ImportError:
|
|
from io import StringIO
|
|
from IPython.display import HTML
|
|
|
|
import _pydevd_bundle.tables.pydevd_pandas as pandas_tables_helpers
|
|
from _pydevd_bundle.pydevd_constants import NEXT_VALUE_SEPARATOR
|
|
# Names of the column kinds the tests in this module distinguish.
TYPE_BOOL = "bool"
TYPE_NUMERIC = "numeric"
TYPE_CATEGORICAL = "categorical"

# Name of the per-interpreter directory with expected results, e.g. "python_3_8".
test_data_dir = 'python_' + '_'.join(str(part) for part in sys.version_info[:2])
|
|
|
|
|
|
@pytest.fixture
def setup_dataframe():
    """
    Build the main DataFrame fixture and the auxiliary data derived from it.

    :return: tuple ``(rows_number, df, df_html, columns_types, col_name_to_data_type)``
        where ``df_html`` is the repr of the HTML of the first row,
        ``columns_types`` holds the index dtype followed by the column dtypes
        (as strings) and ``col_name_to_data_type`` maps column names to the
        kind of data declared for them.
    """
    rows_number = 4
    half = rows_number // 2

    frame = pd.DataFrame(
        {
            "A": 1.0,
            "B": "foo",
            "C": [None] * rows_number,
            "D": [1 + 20j] * rows_number,
            "E": [1 + 20j] * rows_number,
            "F": [None, "bar", 2., 1 + 10j],
            "G": [None, "bar", 2., 1 + 10j],
            "H": [True, False] * half,
            "bool_with_nan": [True, False, False, None],
            "I": pd.Timestamp("20130102"),
            "J": pd.Series(1, index=list(range(rows_number)), dtype="float32"),
            "K": pd.Series(range(rows_number), index=list(range(rows_number)), dtype="int32"),
            "L": pd.Categorical(["test", "train"] * half),
            # generated from the current time, so the values differ between runs
            "dates": pd.date_range("now", periods=rows_number),
            "datetime64[ns]": pd.Timestamp("20010102"),
            "datetime64[ns, <tz>]": pd.date_range("20130101", periods=rows_number, tz="CET"),
            "period": pd.Period('2012-1-1', freq='D'),
            "category": pd.Series(list("ABCD")).astype("category"),
            "interval": pd.interval_range(start=pd.Timestamp("2017-01-01"),
                                          periods=rows_number, freq="W"),
            "lists": [[1, 2], [1, 2], [3, 4], [4, 5]],
            # NOTE(review): the last element is a set literal, not a dict -
            # confirm that this is intentional
            "dicts": [{1: 2}, {1: 2}, {3: 4}, {4, 5}],
            "tuples": [(1, 2), (1, 2), (3, 4), (4, 5)],
        }
    )

    # cast the timestamp columns explicitly to datetime64[ns]
    for timestamp_column in ('datetime64[ns]', 'I'):
        frame[timestamp_column] = frame[timestamp_column].astype("datetime64[ns]")

    first_row_html = repr(frame.head(1).to_html(notebook=True, max_cols=None))
    dtype_names = [str(dtype) for dtype in [frame.index.dtype] + list(frame.dtypes)]

    declared_types = {
        "A": TYPE_NUMERIC,
        "B": TYPE_CATEGORICAL,
        # NOTE(review): this value is not one of the TYPE_* constants, so it
        # never compares equal to any of them - confirm that this is intentional
        "C": [None] * rows_number,
        "D": TYPE_CATEGORICAL,
        "E": TYPE_CATEGORICAL,
        "F": TYPE_CATEGORICAL,
        "G": TYPE_CATEGORICAL,
        "H": TYPE_BOOL,
        "bool_with_nan": TYPE_CATEGORICAL,
        "I": TYPE_CATEGORICAL,
        "J": TYPE_NUMERIC,
        "K": TYPE_NUMERIC,
        "L": TYPE_CATEGORICAL,
        "dates": TYPE_CATEGORICAL,
        "datetime64[ns]": TYPE_CATEGORICAL,
        "datetime64[ns, <tz>]": TYPE_CATEGORICAL,
        "period": TYPE_CATEGORICAL,
        "category": TYPE_CATEGORICAL,
        "interval": TYPE_CATEGORICAL,
        "lists": TYPE_CATEGORICAL,
        "dicts": TYPE_CATEGORICAL,
        "tuples": TYPE_CATEGORICAL,
    }

    return rows_number, frame, first_row_html, dtype_names, declared_types
|
|
|
|
|
|
@pytest.fixture
def setup_series_no_names():
    """
    Fixture for the tests that need a pd.Series created without a name.
    """
    unnamed_series = pd.Series([1, 2, 3])
    return unnamed_series
|
|
|
|
|
|
@pytest.fixture
def setup_dataframe_many_columns():
    """
    Fixture with a DataFrame read from a CSV file of the test data.

    It is used to check that no columns get lost for big dataframes.
    """
    csv_path = 'test_data/pandas/dataframe_many_columns_before.csv'
    return pd.read_csv(csv_path)
|
|
|
|
|
|
@pytest.fixture
|
|
def setup_df_with_big_int_values():
|
|
"""
|
|
Here we create a fixture for one test.
|
|
With that df we check that we catch OverflowError exception in the describe functions.
|
|
This number has to be so big.
|
|
"""
|
|
big_int = 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555
|
|
df = pd.DataFrame({"BitIntValues": [1, 2]})
|
|
df["BitIntValues"] = big_int
|
|
|
|
return df
|
|
|
|
|
|
@pytest.fixture
def setup_dataframe_with_float_values():
    """
    Fixture with a small DataFrame that mixes ints, floats, strings,
    containers and complex numbers.

    The "dict" column is only added when the tests do not run under Python 2.
    """
    columns = {
        "int_col": [1, 2, 3],
        "float_col": [1.0, 2.0, None],
        "strings": ["f", "s", None],
    }
    if not test_data_dir.startswith("python_2"):
        columns["dict"] = [{"age": 30, "height": 5.5},
                           {"age": 25, "height": 6.1},
                           {"age": 35, "height": None}]
    columns["list"] = [[1.1, 2.2], [2.2, 3.3], [4.4, None]]
    columns["complex"] = [1.0 + 2j, 2.2 + 3j, 4.4 + 5j]

    return pd.DataFrame(columns)
|
|
|
|
|
|
# 1
|
|
def test_info_command(setup_dataframe):
    """
    Check the pieces of the info command (the one invoked via Kotlin):
    type, shape, head and column types.

    :param setup_dataframe: fixture/data for the test
    """
    rows, df, df_html, cols_types_expected, _ = setup_dataframe

    joined_types = pandas_tables_helpers.get_column_types(df)
    cols_types_actual = joined_types.split(pandas_tables_helpers.TABLE_TYPE_NEXT_VALUE_SEPARATOR)

    assert pandas_tables_helpers.get_type(df) == str(pd.DataFrame)
    assert pandas_tables_helpers.get_shape(df) == str(rows)
    assert pandas_tables_helpers.get_head(df) == df_html
    assert cols_types_actual == cols_types_expected
|
|
|
|
|
|
# 2
|
|
def test_get_data_saves_display_options(setup_dataframe):
    """
    get_data must leave the user's pandas display options untouched.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]
    tracked_options = ('display.max_columns', 'display.max_colwidth', 'display.max_rows')

    values_before = [pd.get_option(option) for option in tracked_options]

    pandas_tables_helpers.get_data(df, False, format="%.2f")

    values_after = [pd.get_option(option) for option in tracked_options]

    for before, after in zip(values_before, values_after):
        assert before == after
|
|
|
|
|
|
# 3
|
|
def test_display_html_saves_display_options(setup_dataframe):
    """
    display_data_html must leave the user's pandas display options untouched.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]
    tracked_options = ('display.max_columns', 'display.max_colwidth', 'display.max_rows')

    values_before = [pd.get_option(option) for option in tracked_options]

    pandas_tables_helpers.display_data_html(df, start_index=0, end_index=2)

    values_after = [pd.get_option(option) for option in tracked_options]

    for before, after in zip(values_before, values_after):
        assert before == after
|
|
|
|
|
|
# 4
|
|
def test_display_csv_saves_display_options(setup_dataframe):
    """
    display_data_csv must leave the user's pandas display options untouched.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]
    tracked_options = ('display.max_columns', 'display.max_colwidth', 'display.max_rows')

    values_before = [pd.get_option(option) for option in tracked_options]

    pandas_tables_helpers.display_data_csv(df, start_index=0, end_index=2)

    values_after = [pd.get_option(option) for option in tracked_options]

    for before, after in zip(values_before, values_after):
        assert before == after
|
|
|
|
|
|
# 5
|
|
def test_convert_to_df_unnamed_series(setup_series_no_names):
    """
    Covers two methods, __convert_to_df and __get_column_name,
    for a pd.Series that has no name.

    :param setup_series_no_names: fixture/data for the test
    """
    as_frame = pandas_tables_helpers.__convert_to_df(setup_series_no_names)

    assert isinstance(as_frame, pd.DataFrame)
    first_column_name = as_frame.columns[0]
    assert first_column_name == '<unnamed>'
|
|
|
|
|
|
# 6
|
|
def test_convert_to_df_common_series(setup_dataframe):
    """
    Covers two methods, __convert_to_df and __get_column_name,
    for ordinary (named) pd.Series: every column of the fixture frame.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]
    for column_name in df.columns:
        as_frame = pandas_tables_helpers.__convert_to_df(df[column_name])

        assert isinstance(as_frame, pd.DataFrame)
        # the converted frame keeps the name of the series as its column name
        assert as_frame.columns[0] == column_name
|
|
|
|
|
|
# 7
|
|
@pytest.mark.skipif(sys.version_info < (3, 0),
                    reason="TODO: investigate pd.Categorical/complex cases")
def test_get_info_format(setup_dataframe):
    """
    There is a common format for the result of the dataframe info command.
    The reference for the format is getInfoCommandActions from DSTableCommands:

        print(get_type(initCommandResult))
        print('$NEXT_VALUE_SEPARATOR')
        print(get_shape(initCommandResult))
        print('$NEXT_VALUE_SEPARATOR')
        print(get_head(initCommandResult))
        print('$NEXT_VALUE_SEPARATOR')
        print('$NEXT_VALUE_SEPARATOR')
        print(get_column_types(initCommandResult))

    Here we check that the expected result can be composed with the
    pandas_tables_helpers methods.

    TODO: we also should check this format for pydevd_tables.exec_table_command
    TODO: actually, the format is different: in one case we have a newline, in other just ''

    :param setup_dataframe: fixture/data for the test, dataframe
    """
    df = setup_dataframe[1]

    # "dates" is generated from the "now" timestamp, so it is dropped;
    # "interval" is dropped together with it
    df = df.drop(columns=['dates', 'interval'])

    sections = (
        pandas_tables_helpers.get_type(df),
        pandas_tables_helpers.get_shape(df),
        pandas_tables_helpers.get_head(df),
        pandas_tables_helpers.get_column_types(df),
    )
    # every section and every separator ends up on its own line
    actual = ('\n' + NEXT_VALUE_SEPARATOR + '\n').join(sections)

    __read_expected_from_file_and_compare_with_actual(
        actual=actual,
        expected_file='test_data/pandas/getInfo_result.txt'
    )
|
|
|
|
|
|
# 8
|
|
@pytest.mark.skipif(sys.version_info < (3, 0), reason="Different format for Python2")
def test_describe_many_columns_check_html(setup_dataframe_many_columns):
    """
    Compare the column descriptions of the many-columns frame with the stored result.

    :param setup_dataframe_many_columns: fixture/data for the test
    """
    descriptions = pandas_tables_helpers.get_column_descriptions(setup_dataframe_many_columns)

    __read_expected_from_file_and_compare_with_actual(
        actual=descriptions,
        expected_file='test_data/pandas/dataframe_many_columns_describe_after.txt'
    )
|
|
|
|
|
|
# 9
|
|
def test_describe_shape_numeric_types(setup_dataframe_many_columns):
    """
    Check the shape of the describe result for the many-columns frame.

    :param setup_dataframe_many_columns: fixture/data for the test
    """
    df = setup_dataframe_many_columns
    described = pandas_tables_helpers.__get_describe(df)
    statistics_count, columns_count = described.shape[0], described.shape[1]

    # for dataframes with only numeric types in columns we have 10 statistics
    assert statistics_count == 10
    # the number of columns should be the same
    assert columns_count == df.shape[1]
|
|
|
|
|
|
# 10
|
|
def test_describe_shape_all_types(setup_dataframe):
    """
    Check the shape of the describe result for a frame with columns of many types.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]
    running_python2 = sys.version_info < (3, 0)

    if running_python2:
        df = df.drop(columns=['lists', 'dicts', 'tuples'])

    described = pandas_tables_helpers.__get_describe(df)

    # for dataframes with different types in columns we have 13/15 statistics;
    # python2 has 2 additional statistics that we don't use: first and last
    expected_statistics = 15 if running_python2 else 13
    assert described.shape[0] == expected_statistics

    # the number of columns should be the same
    assert described.shape[1] == df.shape[1]

    # check that we don't exclude any column from the describe function
    all_nan_columns = described.columns[described.isna().all()].tolist()
    assert len(all_nan_columns) == 0
|
|
|
|
|
|
# 11
|
|
def test_get_describe_save_columns(setup_dataframe):
    """
    The describe result must keep the columns of the original frame and their order.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]

    if sys.version_info < (3, 0):
        df = df.drop(columns=['lists', 'dicts', 'tuples'])

    described = pandas_tables_helpers.__get_describe(df)
    original_columns = df.columns.tolist()
    describe_columns = described.columns.tolist()

    # the number of columns is the same in described and in original
    assert len(original_columns) == len(describe_columns)

    # compare the columns and their order, pair by pair
    for original_name, described_name in zip(original_columns, describe_columns):
        assert original_name == described_name
|
|
|
|
|
|
# 12
|
|
def test_get_describe_returned_types(setup_dataframe):
    """
    __get_describe returns a DataFrame for a DataFrame and a Series for a Series.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]

    if sys.version_info < (3, 0):
        df = df.drop(columns=['lists', 'dicts', 'tuples'])

    # exact type checks: subclasses are not accepted
    described_frame = pandas_tables_helpers.__get_describe(df)
    assert type(described_frame) is pd.DataFrame

    described_column = pandas_tables_helpers.__get_describe(df['A'])
    assert type(described_column) is pd.Series
|
|
|
|
|
|
# 13
|
|
@pytest.mark.skipif(sys.version_info < (3, 0), reason="Different format for Python2")
def test_describe_series(setup_dataframe):
    """
    Describe every column of the fixture frame as a Series and compare the
    collected results, one per line, with the stored expected file.

    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1]

    described_lines = []
    for column in df:
        # "dates" is skipped because its data is different every time;
        # "interval" is skipped as well
        if column in ('dates', 'interval'):
            continue

        described_series = pandas_tables_helpers.__get_describe(df[column])
        # a column without a describe result contributes an empty line
        if described_series is None:
            described_lines.append("")
        else:
            described_lines.append(str(described_series.to_dict()))

    resulted = "".join(line + "\n" for line in described_lines)

    __read_expected_from_file_and_compare_with_actual(
        actual=resulted,
        expected_file='test_data/pandas/' + test_data_dir + '/series_describe.txt'
    )
|
|
|
|
|
|
# 14
|
|
@pytest.mark.skipif(sys.version_info < (3, 0),
                    reason="The exception will be raised during df creation in Python2")
def test_overflow_error_is_caught(setup_df_with_big_int_values):
    """
    Describing the frame with huge integers must still give a result
    whose values equal those of a series holding only the count.

    :param setup_df_with_big_int_values: fixture/data for the test
    """
    described = pandas_tables_helpers.__get_describe(setup_df_with_big_int_values)
    assert described is not None

    only_count = pd.Series(data={"count": 2}, index=["count"], name="BitIntValues")
    values_match = described.values == only_count.values
    assert values_match.all()
|
|
|
|
|
|
# 15
|
|
def test_vis_data_detecting_column_type(setup_dataframe):
    """
    For every column of the fixture frame check that __is_boolean,
    __is_categorical and __is_numeric agree with the kind declared for it.

    Columns whose declared kind is none of the TYPE_* constants are not checked.

    :param setup_dataframe: fixture/data for the test
    """
    _, df, _, _, col_name_to_data_type = setup_dataframe

    # declared kind -> expected (__is_boolean, __is_categorical, __is_numeric)
    expectations = (
        (TYPE_BOOL, (True, False, False)),
        (TYPE_NUMERIC, (False, False, True)),
        (TYPE_CATEGORICAL, (False, True, False)),
    )

    def failure_details(column, col_type):
        # built only when an assertion fails
        return "column is %s, col_type is %s, col_type_kind is %s" % (column, col_type, col_type.kind)

    for column in df.columns:
        col_type = df[column].dtype
        declared_kind = col_name_to_data_type[column]

        for kind, (expect_boolean, expect_categorical, expect_numeric) in expectations:
            if declared_kind != kind:
                continue

            assert pandas_tables_helpers.__is_boolean(col_type) == expect_boolean, failure_details(column, col_type)
            assert pandas_tables_helpers.__is_categorical(df[column], col_type) == expect_categorical, failure_details(column, col_type)
            assert pandas_tables_helpers.__is_numeric(col_type) == expect_numeric, failure_details(column, col_type)
            break
|
|
|
|
|
|
# 16
|
|
def test_vis_data_integer_columns_simple():
    """
    Value occurrences for an integer column holding 0..9 twice,
    compared with the stored expected file.
    """
    repeated_ints = list(range(10)) * 2
    frame = pd.DataFrame({"ints": repeated_ints})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_integer_simple.txt'
    )
|
|
|
|
|
|
# 17
|
|
@pytest.mark.skipif(sys.version_info < (3, 0),reason="")
def test_vis_data_integer_columns_with_bins():
    """
    Value occurrences for an integer column holding 0..20 twice,
    compared with the stored expected file.
    """
    repeated_ints = list(range(21)) * 2
    frame = pd.DataFrame({"ints": repeated_ints})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_integer_with_bins.txt'
    )
|
|
|
|
|
|
# 18
|
|
@pytest.mark.skipif(sys.version_info < (3, 0),reason="")
def test_vis_data_float_columns_simple():
    """
    Value occurrences for a float column built with np.arange(0, 1, 0.1),
    compared with the stored expected file.
    """
    import numpy as np

    float_values = np.arange(0, 1, 0.1)
    frame = pd.DataFrame({"floats": float_values})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_float_simple.txt'
    )
|
|
|
|
|
|
# 19
|
|
@pytest.mark.skipif(sys.version_info < (3, 0),reason="")
def test_vis_data_float_columns_with_bins():
    """
    Value occurrences for a float column built with np.arange(0, 3, 0.1),
    compared with the stored expected file.
    """
    import numpy as np

    float_values = np.arange(0, 3, 0.1)
    frame = pd.DataFrame({"floats": float_values})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_float_with_bins.txt'
    )
|
|
|
|
|
|
# 20
|
|
def test_vis_data_bool_column():
    """
    Value occurrences for a boolean column (50 True, 25 False),
    compared with the stored expected file.
    """
    bool_values = [True] * 50 + [False] * 25
    frame = pd.DataFrame({"bools": bool_values})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_bool_column.txt'
    )
|
|
|
|
|
|
# 21
|
|
def test_vis_data_bool_with_nan_column():
    """
    Value occurrences for a boolean column that also contains None values.

    The expected result is read from a separate file under Python 2.
    """
    bool_values = [True] * 50 + [False] * 25 + [None] * 10
    frame = pd.DataFrame({"bools": bool_values})
    actual = pandas_tables_helpers.get_value_occurrences_count(frame)

    if test_data_dir.startswith('python_2'):
        expected_file = 'test_data/pandas/python_2_7/vis_data_bool_with_nan_column.txt'
    else:
        expected_file = 'test_data/pandas/vis_data_bool_with_nan_column.txt'

    __read_expected_from_file_and_compare_with_actual(
        actual=actual,
        expected_file=expected_file
    )
|
|
|
|
|
|
|
|
# 22
|
|
def test_vis_data_categorical_column_percentage():
    """
    Value occurrences for a string column with two distinct values
    (50 x "First", 25 x "Second"), compared with the stored expected file.
    """
    string_values = ["First"] * 50 + ["Second"] * 25
    frame = pd.DataFrame({"strs": string_values})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_categorical_column.txt'
    )
|
|
|
|
|
|
# 23
|
|
def test_vis_data_categorical_column_other():
    """
    Value occurrences for a string column with four distinct values of
    decreasing frequency, compared with the stored expected file.
    """
    string_values = ["First"] * 50 + ["Second"] * 25 + ["Third"] * 10 + ["Forth"] * 5
    frame = pd.DataFrame({"strs": string_values})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_categorical_column_other.txt'
    )
|
|
|
|
|
|
# 24
|
|
def test_vis_data_categorical_column_unique():
    """
    Value occurrences for a string column of 1000 distinct values,
    compared with the stored expected file.
    """
    distinct_strings = [str(number) for number in range(1000)]
    frame = pd.DataFrame({"strs": distinct_strings})

    __read_expected_from_file_and_compare_with_actual(
        actual=pandas_tables_helpers.get_value_occurrences_count(frame),
        expected_file='test_data/pandas/vis_data_categorical_column_unique.txt'
    )
|
|
|
|
|
|
# 25
|
|
def test_vis_data_categorical_column_switch_perc_to_unique():
    """
    Check which visualisation type shows up in the result depending on
    the share of unique values in a string column.
    """
    visualisation_types = pandas_tables_helpers.ColumnVisualisationType

    # 49 distinct values among 100 rows: 49% of unique values
    mostly_repeated = pd.DataFrame({"str": [str(i) for i in range(49)] + ["48"] * 51})
    percentage_result = pandas_tables_helpers.get_value_occurrences_count(mostly_repeated)
    assert visualisation_types.PERCENTAGE in percentage_result

    # if the share of unique is greater than 50% then we should show "UNIQUE" vis
    mostly_unique = pd.DataFrame({"str": [str(i) for i in range(52)] + ["51"] * 49})
    unique_result = pandas_tables_helpers.get_value_occurrences_count(mostly_unique)
    assert visualisation_types.UNIQUE in unique_result
|
|
|
|
|
|
# 26
|
|
def test_define_format_function():
    """
    __define_format_function gives None for values that are not usable
    formats and a callable formatter for printf-style float formats.
    """
    for unusable_format in (None, 'null', 'garbage', 1):
        assert pandas_tables_helpers.__define_format_function(unusable_format) is None

    # (format string, value to format, expected text)
    format_cases = (
        ("%.2f", 1.1, "1.10"),
        ("%.12f", 1.1, "1.100000000000"),
        ("%.2e", 1.1, "1.10e+00"),
        ("%d", 1.1, "1"),
        ("%d garbage", 1.1, "1 garbage"),
    )
    for format_str, float_value, expected_result in format_cases:
        formatter = pandas_tables_helpers.__define_format_function(format_str)

        assert formatter is not None
        assert callable(formatter)
        assert formatter(float_value) == expected_result
|
|
|
|
|
|
# 27
|
|
def test_get_tables_display_options():
    """
    Check the (max_cols, max_colwidth, max_rows) triple returned by
    __get_tables_display_options.
    """
    display_options = pandas_tables_helpers.__get_tables_display_options()
    max_cols, max_colwidth, max_rows = display_options

    assert max_cols is None
    assert max_rows is None

    # under Python 2 or with a pandas major version below 1 the explicit
    # MAX_COLWIDTH constant is expected instead of None
    uses_explicit_width = sys.version_info < (3, 0) or int(pd.__version__.split('.')[0]) < 1
    if uses_explicit_width:
        assert max_colwidth == pandas_tables_helpers.MAX_COLWIDTH
    else:
        assert max_colwidth is None
|
|
|
|
|
|
# 28
|
|
def test_get_data_float_values_2f(setup_dataframe_with_float_values):
    """
    get_data with the "%.2f" format, compared with the stored expected file.

    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    formatted = pandas_tables_helpers.get_data(setup_dataframe_with_float_values, False, 0, 3, format="%.2f")
    expected_path = 'test_data/pandas/' + test_data_dir + '/get_data_float_values_2f.txt'

    __read_expected_from_file_and_compare_with_actual(
        actual=formatted,
        expected_file=expected_path
    )
|
|
|
|
|
|
# 29
|
|
def test_get_data_float_values_12f(setup_dataframe_with_float_values):
    """
    get_data with the "%.12f" format, compared with the stored expected file.

    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    formatted = pandas_tables_helpers.get_data(setup_dataframe_with_float_values, False, 0, 3, format="%.12f")
    expected_path = 'test_data/pandas/' + test_data_dir + '/get_data_float_values_12f.txt'

    __read_expected_from_file_and_compare_with_actual(
        actual=formatted,
        expected_file=expected_path
    )
|
|
|
|
|
|
# 30
|
|
def test_get_data_float_values_2e(setup_dataframe_with_float_values):
    """
    get_data with the "%.2e" format, compared with the stored expected file.

    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    formatted = pandas_tables_helpers.get_data(setup_dataframe_with_float_values, False, 0, 3, format="%.2e")
    expected_path = 'test_data/pandas/' + test_data_dir + '/get_data_float_values_2e.txt'

    __read_expected_from_file_and_compare_with_actual(
        actual=formatted,
        expected_file=expected_path
    )
|
|
|
|
|
|
# 31
|
|
@pytest.mark.skipif(sys.version_info < (3, 0), reason="%d doesn't work with np.float('nan')")
def test_get_data_float_values_d(setup_dataframe_with_float_values):
    """
    get_data with the "%d" format, compared with the stored expected file.

    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    formatted = pandas_tables_helpers.get_data(setup_dataframe_with_float_values, False, 0, 3, format="%d")
    expected_path = 'test_data/pandas/' + test_data_dir + '/get_data_float_values_d.txt'

    __read_expected_from_file_and_compare_with_actual(
        actual=formatted,
        expected_file=expected_path
    )
|
|
|
|
|
|
# 32
|
|
@pytest.mark.skipif(sys.version_info < (3, 0), reason="%d doesn't work with np.float('nan')")
def test_get_data_float_values_d_garbage(setup_dataframe_with_float_values):
    """
    get_data with the "%d garbage" format (a format followed by extra text),
    compared with the stored expected file.

    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    formatted = pandas_tables_helpers.get_data(setup_dataframe_with_float_values, False, 0, 3, format="%d garbage")
    expected_path = 'test_data/pandas/' + test_data_dir + '/get_data_float_values_d_garbage.txt'

    __read_expected_from_file_and_compare_with_actual(
        actual=formatted,
        expected_file=expected_path
    )
|
|
|
|
|
|
# 33
|
|
def test_display_data_html_df(mocker, setup_dataframe):
    """
    display_data_html for the main fixture frame: the object passed to
    IPython's display is captured and its data compared with the stored file.

    :param mocker: pytest-mock fixture
    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1].drop(columns=['dates'])

    # replace IPython's display so that its argument can be inspected
    mock_display = mocker.patch('IPython.display.display')
    pandas_tables_helpers.display_data_html(df, 0, 16)

    positional_args = mock_display.call_args[0]
    displayed_html = positional_args[0]

    __read_expected_from_file_and_compare_with_actual(
        actual=displayed_html.data,
        expected_file='test_data/pandas/' + test_data_dir + '/display_data_html_df.txt'
    )
|
|
|
|
|
|
# 34
|
|
def test_display_data_csv_df(mocker, setup_dataframe):
    """
    display_data_csv for the main fixture frame: everything written to
    stdout is captured and compared with the stored file.

    :param mocker: pytest-mock fixture
    :param setup_dataframe: fixture/data for the test
    """
    df = setup_dataframe[1].drop(columns=['dates'])

    # replace sys.stdout with an in-memory buffer to capture the output
    captured_stdout = mocker.patch('sys.stdout', new_callable=StringIO)
    pandas_tables_helpers.display_data_csv(df, 0, 16)

    __read_expected_from_file_and_compare_with_actual(
        actual=captured_stdout.getvalue(),
        expected_file='test_data/pandas/' + test_data_dir + '/display_data_csv_df.txt'
    )
|
|
|
|
|
|
# 35
|
|
def test_display_data_html_df_with_float_values(mocker, setup_dataframe_with_float_values):
    """
    display_data_html for the frame with float values: the object passed to
    IPython's display is captured and its data compared with the stored file.

    :param mocker: pytest-mock fixture
    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    # replace IPython's display so that its argument can be inspected
    mock_display = mocker.patch('IPython.display.display')
    pandas_tables_helpers.display_data_html(setup_dataframe_with_float_values, 0, 3)

    positional_args = mock_display.call_args[0]
    displayed_html = positional_args[0]

    __read_expected_from_file_and_compare_with_actual(
        actual=displayed_html.data,
        expected_file='test_data/pandas/' + test_data_dir + '/display_data_html_df_with_float_values.txt'
    )
|
|
|
|
|
|
# 36
|
|
def test_display_data_csv_df_with_float_values(mocker, setup_dataframe_with_float_values):
    """
    display_data_csv for the frame with float values: everything written to
    stdout is captured and compared with the stored file.

    :param mocker: pytest-mock fixture
    :param setup_dataframe_with_float_values: fixture/data for the test
    """
    # replace sys.stdout with an in-memory buffer to capture the output
    captured_stdout = mocker.patch('sys.stdout', new_callable=StringIO)
    pandas_tables_helpers.display_data_csv(setup_dataframe_with_float_values, 0, 3)

    __read_expected_from_file_and_compare_with_actual(
        actual=captured_stdout.getvalue(),
        expected_file='test_data/pandas/' + test_data_dir + '/display_data_csv_df_with_float_values.txt'
    )
|
|
|
|
|
|
def __prepare_describe_result(described_str):
    # type: (str) -> str
    """
    Normalise the whitespace of a describe result.

    Different python versions indent the output differently, so every line
    is reduced to its whitespace-separated tokens joined by single spaces.
    Only the data is compared then, not the indentation.

    :param described_str: text of the describe result
    :return: the same text with the whitespace inside every line collapsed
    """
    normalised_lines = [" ".join(raw_line.split()) for raw_line in described_str.split("\n")]
    return "\n".join(normalised_lines)
|
|
|
|
|
|
def __read_expected_from_file_and_compare_with_actual(actual, expected_file):
    """
    Compare ``actual`` with the content of ``expected_file``.

    :param actual: the string produced by the code under test
    :param expected_file: path of the text file that holds the expected string
    :raises AssertionError: if the expected file is empty, if a character
        differs, or if one of the strings is only a prefix of the other
        (trailing whitespace is ignored by this last check)
    """
    with open(expected_file, 'r') as in_f:
        expected = in_f.read()

    assert len(expected) > 0, "The expected file is empty"

    # for a more convenient assertion fails messages here we compare string char by char
    for ind, (act, exp) in enumerate(zip(actual, expected)):
        assert act == exp, "\nindex is %s \n\n act part = %s \n\n exp part = %s\n" % (ind, actual, expected)

    # zip() stops at the end of the shorter string, so without this check an
    # empty or truncated actual value (or one with extra content at the end)
    # would pass; trailing whitespace is ignored so that a final newline in
    # the expected file does not matter
    actual_length = len(actual.rstrip())
    expected_length = len(expected.rstrip())
    assert actual_length == expected_length, \
        "\nlengths differ: actual is %s, expected is %s \n\n act = %s \n\n exp = %s\n" % (
            actual_length, expected_length, actual, expected)
|