"""
 | 
						|
Tests that work on both the Python and C engines but do not have a
 | 
						|
specific classification into the other test modules.
 | 
						|
"""
 | 
						|
from io import StringIO
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
from pandas import (
 | 
						|
    DataFrame,
 | 
						|
    Series,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
# GH#43650: Some expected failures with the pyarrow engine can occasionally
 | 
						|
# cause a deadlock instead, so we skip these instead of xfailing
 | 
						|
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
 | 
						|
 | 
						|
 | 
						|
def test_int_conversion(all_parsers):
 | 
						|
    data = """A,B
 | 
						|
1.0,1
 | 
						|
2.0,2
 | 
						|
3.0,3
 | 
						|
"""
 | 
						|
    parser = all_parsers
 | 
						|
    result = parser.read_csv(StringIO(data))
 | 
						|
 | 
						|
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"])
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "data,kwargs,expected",
 | 
						|
    [
 | 
						|
        (
 | 
						|
            "A,B\nTrue,1\nFalse,2\nTrue,3",
 | 
						|
            {},
 | 
						|
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
 | 
						|
        ),
 | 
						|
        (
 | 
						|
            "A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3",
 | 
						|
            {"true_values": ["yes", "Yes", "YES"], "false_values": ["no", "NO", "No"]},
 | 
						|
            DataFrame(
 | 
						|
                [[True, 1], [False, 2], [True, 3], [False, 3], [True, 3]],
 | 
						|
                columns=["A", "B"],
 | 
						|
            ),
 | 
						|
        ),
 | 
						|
        (
 | 
						|
            "A,B\nTRUE,1\nFALSE,2\nTRUE,3",
 | 
						|
            {},
 | 
						|
            DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"]),
 | 
						|
        ),
 | 
						|
        (
 | 
						|
            "A,B\nfoo,bar\nbar,foo",
 | 
						|
            {"true_values": ["foo"], "false_values": ["bar"]},
 | 
						|
            DataFrame([[True, False], [False, True]], columns=["A", "B"]),
 | 
						|
        ),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_parse_bool(all_parsers, data, kwargs, expected):
 | 
						|
    parser = all_parsers
 | 
						|
    result = parser.read_csv(StringIO(data), **kwargs)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_parse_integers_above_fp_precision(all_parsers):
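    # These values exceed 2**53, so adjacent integers cannot all be
    # represented as distinct float64 values; parsing must keep them as
    # int64 to preserve the exact digits.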
    data = """Numbers
17007000002000191
17007000002000191
17007000002000191
17007000002000191
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000192
17007000002000194"""
    parser = all_parsers
    result = parser.read_csv(StringIO(data))
    expected = DataFrame(
        {
            "Numbers": [
                17007000002000191,
                17007000002000191,
                17007000002000191,
                17007000002000191,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000192,
                17007000002000194,
            ]
        }
    )
    tm.assert_frame_equal(result, expected)


@skip_pyarrow  # Flaky
@pytest.mark.parametrize("sep", [" ", r"\s+"])
def test_integer_overflow_bug(all_parsers, sep):
    # see gh-2601
    data = "65248E10 11\n55555E55 22\n"
    parser = all_parsers

    result = parser.read_csv(StringIO(data), header=None, sep=sep)
    expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]])
    tm.assert_frame_equal(result, expected)


def test_int64_min_issues(all_parsers):
    # see gh-2599
    parser = all_parsers
    data = "A,B\n0,0\n0,"
    result = parser.read_csv(StringIO(data))

    expected = DataFrame({"A": [0, 0], "B": [0, np.nan]})
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("conv", [None, np.int64, np.uint64])
def test_int64_overflow(all_parsers, conv):
    data = """ID
00013007854817840016671868
00013007854817840016749251
00013007854817840016754630
00013007854817840016781876
00013007854817840017028824
00013007854817840017963235
00013007854817840018860166"""
    parser = all_parsers

    if conv is None:
        # 13007854817840016671868 > UINT64_MAX, so this
        # will overflow and return object as the dtype.
        result = parser.read_csv(StringIO(data))
        expected = DataFrame(
            [
                "00013007854817840016671868",
                "00013007854817840016749251",
                "00013007854817840016754630",
                "00013007854817840016781876",
                "00013007854817840017028824",
                "00013007854817840017963235",
                "00013007854817840018860166",
            ],
            columns=["ID"],
        )
        tm.assert_frame_equal(result, expected)
    else:
        # 13007854817840016671868 > UINT64_MAX, so attempts
        # to cast to either int64 or uint64 will result in
        # an OverflowError being raised.
        msg = (
            "(Python int too large to convert to C long)|"
            "(long too big to convert)|"
            "(int too big to convert)"
        )

        with pytest.raises(OverflowError, match=msg):
            parser.read_csv(StringIO(data), converters={"ID": conv})


@skip_pyarrow
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max, np.iinfo(np.int64).max, np.iinfo(np.int64).min]
)
def test_int64_uint64_range(all_parsers, val):
    # These numbers fall right inside the int64-uint64
    # range, so they should be parsed accurately.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([val])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize(
    "val", [np.iinfo(np.uint64).max + 1, np.iinfo(np.int64).min - 1]
)
def test_outside_int64_uint64_range(all_parsers, val):
    # These numbers fall just outside the int64-uint64
    # range, so they should be parsed as string.
    parser = all_parsers
    result = parser.read_csv(StringIO(str(val)), header=None)

    expected = DataFrame([str(val)])
    tm.assert_frame_equal(result, expected)


@skip_pyarrow
@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], [str(2**63), str(-1)]])
def test_numeric_range_too_wide(all_parsers, exp_data):
    # No numerical dtype can hold both negative and uint64
    # values, so they should be cast as string.
    parser = all_parsers
    data = "\n".join(exp_data)
    expected = DataFrame(exp_data)

    result = parser.read_csv(StringIO(data), header=None)
    tm.assert_frame_equal(result, expected)


def test_integer_precision(all_parsers):
    # see gh-7072
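    # The IDs in column 4 are above 2**53, so parsing them as float64 would
    # corrupt the trailing digits; they must come back as exact int64 values.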
    s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
 | 
						|
5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389"""
 | 
						|
    parser = all_parsers
 | 
						|
    result = parser.read_csv(StringIO(s), header=None)[4]
 | 
						|
    expected = Series([4321583677327450765, 4321113141090630389], name=4)
 | 
						|
    tm.assert_series_equal(result, expected)
 |