# Copyright 2000-2023 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
"""
Here we test auxiliary methods for pandas tables handling, namely,
we check functions from the _pydevd_bundle.tables.pydevd_pandas module.
"""
import pandas as pd
import pytest
import sys
import _pydevd_bundle.tables.pydevd_pandas as pandas_tables_helpers
from _pydevd_bundle.pydevd_constants import NEXT_VALUE_SEPARATOR
@pytest.fixture
def setup_dataframe():
"""
Here we create a fixture for tests that are related to DataFrames.
    Also, we create other auxiliary data.
"""
rows_number = 4
df = pd.DataFrame(
{
"A": 1.0,
"B": "foo",
"C": [None] * rows_number,
"D": [1 + 20j] * rows_number,
"E": [1 + 20j] * rows_number,
"F": [None, "bar", 2., 1 + 10j],
"G": [None, "bar", 2., 1 + 10j],
"H": [True, False] * (rows_number // 2),
"I": pd.Timestamp("20130102"),
"J": pd.Series(1, index=list(range(rows_number)),
dtype="float32"),
"K": pd.Series(range(rows_number),
index=list(range(rows_number)),
dtype="int32"),
"L": pd.Categorical(["test", "train"] * (rows_number // 2)),
"dates": pd.date_range("now", periods=rows_number),
"datetime64[ns]": pd.Timestamp("20010102"),
"datetime64[ns, <tz>]": pd.date_range("20130101", periods=rows_number,
tz="CET"),
"period": pd.Period('2012-1-1', freq='D'),
"category": pd.Series(list("ABCD")).astype("category"),
"interval": pd.interval_range(start=pd.Timestamp("2017-01-01"),
periods=rows_number, freq="W"),
}
)
df['datetime64[ns]'] = df['datetime64[ns]'].astype("datetime64[ns]")
df['I'] = df['I'].astype("datetime64[ns]")
df_html = repr(df.head(1).to_html(notebook=True, max_cols=None))
columns_types = [str(df.index.dtype)] + [str(t) for t in df.dtypes]
return rows_number, df, df_html, columns_types
@pytest.fixture
def setup_series_no_names():
"""
Here we create a fixture for tests that are related to Series without a name.
"""
return pd.Series([1, 2, 3])
@pytest.fixture
def setup_dataframe_many_columns():
"""
Here we create a fixture for tests that are related to DataFrames.
    We check that we don't miss columns for big dataframes.
"""
return pd.read_csv('test_data/pandas/dataframe_many_columns_before.csv')
@pytest.fixture
def setup_df_with_big_int_values():
"""
Here we create a fixture for one test.
    With that df we check that we catch the OverflowError exception in the describe functions.
    The number has to be this big to trigger the overflow.
"""
big_int = 555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555
    df = pd.DataFrame({"BigIntValues": [1, 2]})
    df["BigIntValues"] = big_int
return df
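# Illustrative sketch (a new, hypothetical test added here, not from the original suite):
# it demonstrates why the fixture's value has to be that big, assuming numpy raises
# OverflowError when a Python int does not fit into a fixed-width int64.
@pytest.mark.skipif(sys.version_info < (3, 0),
                    reason="The exception will be raised during df creation in Python2")
def test_big_int_does_not_fit_into_int64(setup_df_with_big_int_values):
    import numpy as np
    big_int = setup_df_with_big_int_values["BigIntValues"].iloc[0]
    with pytest.raises(OverflowError):
        np.int64(big_int)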
def test_info_command(setup_dataframe):
"""
    Here we check the correctness of the info command that is invoked via Kotlin.
:param setup_dataframe: fixture/data for the test
"""
rows, df, df_html, cols_types_expected = setup_dataframe
cols_types_actual = pandas_tables_helpers.get_column_types(df)
cols_types_actual = cols_types_actual.split(pandas_tables_helpers.TABLE_TYPE_NEXT_VALUE_SEPARATOR)
assert pandas_tables_helpers.get_type(df) == str(pd.DataFrame)
assert pandas_tables_helpers.get_shape(df) == str(rows)
assert pandas_tables_helpers.get_head(df) == df_html
assert cols_types_actual == cols_types_expected
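# A small illustrative sketch (hypothetical test, added here) of the column-types layout
# that the assertion above relies on: get_column_types is expected to report the index
# dtype first, followed by one dtype per column, joined by TABLE_TYPE_NEXT_VALUE_SEPARATOR.
def test_column_types_layout_small_df():
    small_df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
    expected = [str(small_df.index.dtype)] + [str(t) for t in small_df.dtypes]
    actual = pandas_tables_helpers.get_column_types(small_df)
    assert actual.split(pandas_tables_helpers.TABLE_TYPE_NEXT_VALUE_SEPARATOR) == expected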
def test_get_data_saves_display_options(setup_dataframe):
"""
We check that we don't ruin a user's display options.
:param setup_dataframe: fixture/data for the test
"""
_, df, _, _ = setup_dataframe
max_columns_before = pd.get_option('display.max_columns')
max_colwidth_before = pd.get_option('display.max_colwidth')
max_rows_before = pd.get_option('display.max_rows')
pandas_tables_helpers.get_data(df, False)
max_columns_after = pd.get_option('display.max_columns')
max_colwidth_after = pd.get_option('display.max_colwidth')
max_rows_after = pd.get_option('display.max_rows')
assert max_columns_before == max_columns_after
assert max_colwidth_before == max_colwidth_after
assert max_rows_before == max_rows_after
def test_display_saves_display_options(setup_dataframe):
"""
We check that we don't ruin a user's display options.
:param setup_dataframe: fixture/data for the test
"""
_, df, _, _ = setup_dataframe
max_columns_before = pd.get_option('display.max_columns')
max_colwidth_before = pd.get_option('display.max_colwidth')
max_rows_before = pd.get_option('display.max_rows')
pandas_tables_helpers.display_data_html(df, start_index=0, end_index=2)
max_columns_after = pd.get_option('display.max_columns')
max_colwidth_after = pd.get_option('display.max_colwidth')
max_rows_after = pd.get_option('display.max_rows')
assert max_columns_before == max_columns_after
assert max_colwidth_before == max_colwidth_after
assert max_rows_before == max_rows_after
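# The two tests above check that the helpers leave the user's display options untouched.
# Below is a minimal sketch of the save/restore contract they depend on (an illustration
# added here, not the helpers' actual implementation): pd.option_context restores the
# option on exit, so the before/after values compared above stay equal.
def test_option_context_restores_display_options(setup_dataframe):
    _, df, _, _ = setup_dataframe
    max_rows_before = pd.get_option('display.max_rows')
    with pd.option_context('display.max_rows', 2):
        repr(df)  # rendering inside the context uses the temporary option value
    assert pd.get_option('display.max_rows') == max_rows_before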
def test_convert_to_df_unnamed_series(setup_series_no_names):
"""
In this test we check two methods: __convert_to_df and __get_column_name.
    For the unnamed pd.Series case.
:param setup_series_no_names: fixture/data for the test
"""
converted_series = pandas_tables_helpers.__convert_to_df(setup_series_no_names)
assert isinstance(converted_series, pd.DataFrame)
assert converted_series.columns[0] == '<unnamed>'
def test_convert_to_df_common_series(setup_dataframe):
"""
In this test we check two methods: __convert_to_df and __get_column_name.
For a common pd.Series case.
:param setup_dataframe: fixture/data for the test
"""
_, df, _, _ = setup_dataframe
for col in df.columns:
converted_series = pandas_tables_helpers.__convert_to_df(df[col])
assert isinstance(converted_series, pd.DataFrame)
assert converted_series.columns[0] == col
@pytest.mark.skipif(sys.version_info < (3, 0),
reason="TODO: investigate pd.Categorical/complex cases")
def test_get_info_format(setup_dataframe):
"""
    We have a common format for the result of the dataframe info command.
    As a reference for the format we take getInfoCommandActions from DSTableCommands:
print(get_type(initCommandResult))
print('$NEXT_VALUE_SEPARATOR')
print(get_shape(initCommandResult))
print('$NEXT_VALUE_SEPARATOR')
print(get_head(initCommandResult))
print('$NEXT_VALUE_SEPARATOR')
print('$NEXT_VALUE_SEPARATOR')
print(get_column_types(initCommandResult))
    Here we check that we can compose the expected result with the pandas_tables_helpers methods.
    TODO: we also should check this format for pydevd_tables.exec_table_command
    TODO: actually, the format is different: in one case we have \n, in the other just ''
:param setup_dataframe: fixture/data for the test, dataframe
"""
_, df, _, _ = setup_dataframe
# remove "dates" column from df because it uses "now" timestamp for data generating
df = df.drop(columns=['dates', 'interval'])
actual = [pandas_tables_helpers.get_type(df),
NEXT_VALUE_SEPARATOR,
pandas_tables_helpers.get_shape(df),
NEXT_VALUE_SEPARATOR,
pandas_tables_helpers.get_head(df),
NEXT_VALUE_SEPARATOR,
pandas_tables_helpers.get_column_types(df)]
actual = '\n'.join(actual)
print("GET INFO: START")
print()
pandas_tables_helpers.get_head(df)
print()
print("GET INFO: END")
print()
read_expected_from_file_and_compare_with_actual(
actual=actual,
expected_file='test_data/pandas/getInfo_result.txt'
)
@pytest.mark.skipif(sys.version_info < (3, 0), reason="Different format for Python2")
def test_describe_many_columns_check_html(setup_dataframe_many_columns):
df = setup_dataframe_many_columns
actual = pandas_tables_helpers.get_column_descriptions(df)
read_expected_from_file_and_compare_with_actual(
actual=actual,
expected_file='test_data/pandas/dataframe_many_columns_describe_after.txt'
)
def test_describe_shape_numeric_types(setup_dataframe_many_columns):
df = setup_dataframe_many_columns
describe_df = pandas_tables_helpers.__get_describe(df)
# for dataframes with only numeric types in columns we have 10 statistics
assert describe_df.shape[0] == 10
    # the number of columns should be the same as in the original df
    assert describe_df.shape[1] == df.shape[1]
def test_describe_shape_all_types(setup_dataframe):
_, df, _, _ = setup_dataframe
describe_df = pandas_tables_helpers.__get_describe(df)
# for dataframes with different types in columns we have 13/15 statistics
if sys.version_info < (3, 0):
        # python2 has 2 additional statistics that we don't use: first and last
assert describe_df.shape[0] == 15
else:
assert describe_df.shape[0] == 13
# the number of columns should be the same
assert describe_df.shape[1] == df.shape[1]
# check that we excluded only 2 columns from describe
assert len(describe_df.columns[describe_df.isna().all()].tolist()) == 2
def test_get_describe_save_columns(setup_dataframe):
_, df, _, _ = setup_dataframe
describe_df = pandas_tables_helpers.__get_describe(df)
original_columns, describe_columns = df.columns.tolist(), describe_df.columns.tolist()
# the number of columns is the same in described and in original
assert len(original_columns) == len(describe_columns)
    # compare the columns and their order
for expected, actual in zip(original_columns, describe_columns):
assert expected == actual
def test_get_describe_returned_types(setup_dataframe):
_, df, _, _ = setup_dataframe
assert type(pandas_tables_helpers.__get_describe(df)) == pd.DataFrame
assert type(pandas_tables_helpers.__get_describe(df['A'])) == pd.Series
@pytest.mark.skipif(sys.version_info < (3, 0), reason="Different format for Python2")
def test_describe_series(setup_dataframe):
_, df, _, _ = setup_dataframe
resulted = ""
for column in df:
        # we skip the "dates" column because its data is different every time; "interval" is skipped as well
if column != 'dates' and column != 'interval':
described_series = pandas_tables_helpers.__get_describe(df[column])
resulted += __prepare_describe_result(str(described_series)) + "\n"
exp_file_python_ver = str(sys.version_info[0]) + '_' + str(sys.version_info[1])
read_expected_from_file_and_compare_with_actual(
actual=resulted,
expected_file='test_data/pandas/series_describe_' + exp_file_python_ver + '.txt'
)
def __prepare_describe_result(described_str):
"""
    This function is needed so that the comparison does not depend on the Python version:
    different Python versions produce different indentation in describe() output.
    We check only the data, not the indentation.
"""
# type: (str) -> (str)
result = []
for line in described_str.split("\n"):
result.append(" ".join(line.split()))
return "\n".join(result)
@pytest.mark.skipif(sys.version_info < (3, 0),
reason="The exception will be raised during df creation in Python2")
def test_overflow_error_is_caught(setup_df_with_big_int_values):
df = setup_df_with_big_int_values
assert pandas_tables_helpers.__get_describe(df) is None
def test_vis_data_integer_columns_simple():
test_data = pd.DataFrame({"ints": list(range(10)) + list(range(10))})
actual = pandas_tables_helpers.get_value_occurrences_count(test_data)
read_expected_from_file_and_compare_with_actual(
actual=actual,
expected_file='test_data/pandas/vis_data_integer_simple.txt'
)
@pytest.mark.skipif(sys.version_info < (3, 0), reason="")
def test_vis_data_integer_columns_with_bins():
test_data = pd.DataFrame({"ints": list(range(21)) + list(range(21))})
actual = pandas_tables_helpers.get_value_occurrences_count(test_data)
read_expected_from_file_and_compare_with_actual(
actual=actual,
expected_file='test_data/pandas/vis_data_integer_with_bins.txt'
)
@pytest.mark.skipif(sys.version_info < (3, 0), reason="")
def test_vis_data_float_columns_simple():
import numpy as np
test_data = pd.DataFrame({"floats": np.arange(0, 1, 0.1)})
actual = pandas_tables_helpers.get_value_occurrences_count(test_data)
read_expected_from_file_and_compare_with_actual(
actual=actual,
expected_file='test_data/pandas/vis_data_float_simple.txt'
)
@pytest.mark.skipif(sys.version_info < (3, 0), reason="")
def test_vis_data_float_columns_with_bins():
import numpy as np
test_data = pd.DataFrame({"floats": np.arange(0, 3, 0.1)})
actual = pandas_tables_helpers.get_value_occurrences_count(test_data)
read_expected_from_file_and_compare_with_actual(
actual=actual,
expected_file='test_data/pandas/vis_data_float_with_bins.txt'
)
def read_expected_from_file_and_compare_with_actual(actual, expected_file):
with open(expected_file, 'r') as in_f:
expected = in_f.read()
    # for more convenient assertion failure messages we compare the strings char by char
for ind, (act, exp) in enumerate(zip(actual, expected)):
        assert act == exp, "index is %s, act part = %s, exp part = %s" % (
            ind,
            actual[max(0, ind - 20): min(len(actual) - 1, ind + 20)],
            expected[max(0, ind - 20): min(len(expected) - 1, ind + 20)])
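# Illustrative usage sketch (hypothetical test, added here): the comparison helper above
# is normally fed pre-generated files from test_data/; pytest's tmpdir fixture stands in
# for such a file to show the expected call shape.
def test_read_expected_helper_smoke(tmpdir):
    expected_file = tmpdir.join("expected.txt")
    expected_file.write("abc")
    read_expected_from_file_and_compare_with_actual(
        actual="abc",
        expected_file=str(expected_file)
    )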