192 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			192 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
Tests column conversion functionality during parsing
for all of the parsers defined in parsers.py
"""
 | 
						|
from io import StringIO
 | 
						|
 | 
						|
from dateutil.parser import parse
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    DataFrame,
 | 
						|
    Index,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
# Module-level pytest mark: requests the "pyarrow_skip" fixture for every test
# collected from this file.
# NOTE(review): the fixture is defined outside this file; presumably it skips
# the pyarrow-engine parametrization -- confirm against the parser conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
 | 
						|
 | 
						|
 | 
						|
def test_converters_type_must_be_dict(all_parsers):
    # Passing something that is not a dict as ``converters`` must raise a
    # TypeError whose message matches "Type converters.+".
    csv_text = "index,A,B,C,D\nfoo,2,3,4,5\n"
    source = StringIO(csv_text)

    # ``0`` is deliberately not a mapping of column -> callable.
    with pytest.raises(TypeError, match="Type converters.+"):
        all_parsers.read_csv(source, converters=0)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("column", [3, "D"])
@pytest.mark.parametrize(
    "converter", [parse, lambda x: int(x.split("/")[2])]  # Produce integer.
)
def test_converters(all_parsers, column, converter):
    # The converter is keyed either by position (3) or by label ("D"); in the
    # header below both refer to the last column.
    csv_text = """A,B,C,D
a,1,2,01/01/2009
b,3,4,01/02/2009
c,4,5,01/03/2009
"""

    # Reference frame: parse without converters, then convert "D" by hand.
    expected = all_parsers.read_csv(StringIO(csv_text))
    converted_dates = expected["D"].map(converter)
    expected["D"] = converted_dates

    # Frame under test: let the parser apply the converter while reading.
    result = all_parsers.read_csv(
        StringIO(csv_text),
        converters={column: converter},
    )

    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_converters_no_implicit_conv(all_parsers):
    # see gh-2184
    csv_text = "000102,1.2,A\n001245,2,B"

    def strip_whitespace(cell):
        return cell.strip()

    result = all_parsers.read_csv(
        StringIO(csv_text),
        header=None,
        converters={0: strip_whitespace},
    )

    # The converter output for column 0 is expected to come back untouched
    # (strings with their leading zeros), not re-inferred as numbers.
    rows = [
        ["000102", 1.2, "A"],
        ["001245", 2, "B"],
    ]
    expected = DataFrame(rows)
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_converters_euro_decimal_format(all_parsers):
    # see gh-583
    csv_text = """Id;Number1;Number2;Text1;Text2;Number3
1;1521,1541;187101,9543;ABC;poi;4,7387
2;121,12;14897,76;DEF;uyt;0,3773
3;878,158;108013,434;GHI;rez;2,7356"""

    # A single callable, shared by all three comma-decimal columns, that swaps
    # the decimal comma for a point before converting to float.
    converters = dict.fromkeys(
        ["Number1", "Number2", "Number3"],
        lambda x: float(x.replace(",", ".")),
    )

    result = all_parsers.read_csv(
        StringIO(csv_text),
        sep=";",
        converters=converters,
    )

    column_names = ["Id", "Number1", "Number2", "Text1", "Text2", "Number3"]
    rows = [
        [1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387],
        [2, 121.12, 14897.76, "DEF", "uyt", 0.3773],
        [3, 878.158, 108013.434, "GHI", "rez", 2.7356],
    ]
    expected = DataFrame(rows, columns=column_names)

    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_converters_corner_with_nans(all_parsers):
    csv_text = """id,score,days
1,2,12
2,2-5,
3,,14+
4,6-12,2"""

    # Example converters.
    def convert_days(raw):
        # Blank -> NaN, "N+" -> N + 1, anything else -> int.
        stripped = raw.strip()
        if not stripped:
            return np.nan
        if stripped.endswith("+"):
            return int(stripped[:-1]) + 1
        return int(stripped)

    def convert_days_sentinel(raw):
        # Same mapping as ``convert_days``, spelled as a single expression.
        stripped = raw.strip()
        if not stripped:
            return np.nan
        has_plus = stripped.endswith("+")
        return int(stripped[:-1]) + 1 if has_plus else int(stripped)

    def convert_score(raw):
        # Blank -> NaN, "lo-hi" -> midpoint of the two ints, else -> float.
        stripped = raw.strip()
        if not stripped:
            return np.nan
        # ``find`` > 0: a dash in first position does not count as a range.
        if stripped.find("-") > 0:
            low, high = map(int, stripped.split("-"))
            return 0.5 * (low + high)
        return float(stripped)

    def read_with(day_converter):
        frame = all_parsers.read_csv(
            StringIO(csv_text),
            converters={"score": convert_score, "days": day_converter},
            na_values=["", None],
        )
        # Row 1 ("2,2-5,") has an empty "days" cell.
        assert pd.isna(frame["days"][1])
        return frame

    frames = [read_with(conv) for conv in (convert_days, convert_days_sentinel)]

    # Both day converters must yield the same parsed frame.
    tm.assert_frame_equal(frames[0], frames[1])
 | 
						|
 | 
						|
 | 
						|
def test_converter_index_col_bug(all_parsers):
    # see gh-1835
    csv_text = "A;B\n1;2\n3;4"

    # Identity converter on the column that is also used as the index.
    passthrough = {"A": lambda value: value}
    result = all_parsers.read_csv(
        StringIO(csv_text),
        sep=";",
        index_col="A",
        converters=passthrough,
    )

    expected_index = Index([1, 3], name="A")
    expected = DataFrame({"B": [2, 4]}, index=expected_index)
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_converter_multi_index(all_parsers):
    # GH 42446
    csv_text = "A,B,B\nX,Y,Z\n1,2,3"

    # With a two-row header, converters are keyed by the tuple formed from
    # both header rows of each column.
    converters = {
        ("A", "X"): np.int32,
        ("B", "Y"): np.int32,
        ("B", "Z"): np.float32,
    }
    result = all_parsers.read_csv(
        StringIO(csv_text),
        header=[0, 1],
        converters=converters,
    )

    expected_columns = {
        ("A", "X"): np.array([1], dtype=np.int32),
        ("B", "Y"): np.array([2], dtype=np.int32),
        ("B", "Z"): np.array([3], dtype=np.float32),
    }
    expected = DataFrame(expected_columns)

    tm.assert_frame_equal(result, expected)
 |