351 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			351 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
Tests the TextReader class in parsers.pyx, which
is integral to the C engine in parsers.py
"""
 | 
						|
from io import (
 | 
						|
    BytesIO,
 | 
						|
    StringIO,
 | 
						|
)
 | 
						|
import os
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas._libs.parsers as parser
 | 
						|
from pandas._libs.parsers import TextReader
 | 
						|
 | 
						|
from pandas import DataFrame
 | 
						|
import pandas._testing as tm
 | 
						|
 | 
						|
from pandas.io.parsers import (
 | 
						|
    TextFileReader,
 | 
						|
    read_csv,
 | 
						|
)
 | 
						|
from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
 | 
						|
 | 
						|
 | 
						|
class TestTextReader:
 | 
						|
    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        """
        Store the data-file paths used by the tests on the test instance.

        ``datapath`` is called with path components; its results are treated
        as directories and joined with the individual file names.
        """
        parser_data_dir = datapath("io", "parser", "data")
        shared_csv_dir = datapath("io", "data", "csv")

        self.dirpath = parser_data_dir
        self.csv1 = os.path.join(shared_csv_dir, "test1.csv")
        self.csv2 = os.path.join(parser_data_dir, "test2.csv")
        self.xls1 = os.path.join(parser_data_dir, "test.xls")
 | 
						|
 | 
						|
    def test_file_handle(self):
 | 
						|
        with open(self.csv1, "rb") as f:
 | 
						|
            reader = TextReader(f)
 | 
						|
            reader.read()
 | 
						|
 | 
						|
    def test_file_handle_mmap(self):
 | 
						|
        # this was never using memory_map=True
 | 
						|
        with open(self.csv1, "rb") as f:
 | 
						|
            reader = TextReader(f, header=None)
 | 
						|
            reader.read()
 | 
						|
 | 
						|
    def test_StringIO(self):
 | 
						|
        with open(self.csv1, "rb") as f:
 | 
						|
            text = f.read()
 | 
						|
        src = BytesIO(text)
 | 
						|
        reader = TextReader(src, header=None)
 | 
						|
        reader.read()
 | 
						|
 | 
						|
    def test_string_factorize(self):
 | 
						|
        # should this be optional?
 | 
						|
        data = "a\nb\na\nb\na"
 | 
						|
        reader = TextReader(StringIO(data), header=None)
 | 
						|
        result = reader.read()
 | 
						|
        assert len(set(map(id, result[0]))) == 2
 | 
						|
 | 
						|
    def test_skipinitialspace(self):
 | 
						|
        data = "a,   b\na,   b\na,   b\na,   b"
 | 
						|
 | 
						|
        reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        tm.assert_numpy_array_equal(
 | 
						|
            result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
 | 
						|
        )
 | 
						|
        tm.assert_numpy_array_equal(
 | 
						|
            result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
 | 
						|
        )
 | 
						|
 | 
						|
    def test_parse_booleans(self):
 | 
						|
        data = "True\nFalse\nTrue\nTrue"
 | 
						|
 | 
						|
        reader = TextReader(StringIO(data), header=None)
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        assert result[0].dtype == np.bool_
 | 
						|
 | 
						|
    def test_delimit_whitespace(self):
 | 
						|
        data = 'a  b\na\t\t "b"\n"a"\t \t b'
 | 
						|
 | 
						|
        reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        tm.assert_numpy_array_equal(
 | 
						|
            result[0], np.array(["a", "a", "a"], dtype=np.object_)
 | 
						|
        )
 | 
						|
        tm.assert_numpy_array_equal(
 | 
						|
            result[1], np.array(["b", "b", "b"], dtype=np.object_)
 | 
						|
        )
 | 
						|
 | 
						|
    def test_embedded_newline(self):
 | 
						|
        data = 'a\n"hello\nthere"\nthis'
 | 
						|
 | 
						|
        reader = TextReader(StringIO(data), header=None)
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
 | 
						|
        tm.assert_numpy_array_equal(result[0], expected)
 | 
						|
 | 
						|
    def test_euro_decimal(self):
 | 
						|
        data = "12345,67\n345,678"
 | 
						|
 | 
						|
        reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        expected = np.array([12345.67, 345.678])
 | 
						|
        tm.assert_almost_equal(result[0], expected)
 | 
						|
 | 
						|
    def test_integer_thousands(self):
 | 
						|
        data = "123,456\n12,500"
 | 
						|
 | 
						|
        reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        expected = np.array([123456, 12500], dtype=np.int64)
 | 
						|
        tm.assert_almost_equal(result[0], expected)
 | 
						|
 | 
						|
    def test_integer_thousands_alt(self):
 | 
						|
        data = "123.456\n12.500"
 | 
						|
 | 
						|
        reader = TextFileReader(
 | 
						|
            StringIO(data), delimiter=":", thousands=".", header=None
 | 
						|
        )
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        expected = DataFrame([123456, 12500])
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_skip_bad_lines(self, capsys):
        """
        Rows with too many fields raise by default, are dropped with
        on_bad_lines=2, and are reported on stderr with on_bad_lines=1.
        """
        # too many lines, see #2430 for why
        data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

        def _colon_reader(**kwds):
            # Fresh reader over the same text for each on_bad_lines mode.
            return TextReader(StringIO(data), delimiter=":", header=None, **kwds)

        strict_reader = _colon_reader()
        msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
        with pytest.raises(parser.ParserError, match=msg):
            strict_reader.read()

        skipped = _colon_reader(on_bad_lines=2).read()  # Skip
        expected = {
            0: np.array(["a", "d", "g", "l"], dtype=object),
            1: np.array(["b", "e", "h", "m"], dtype=object),
            2: np.array(["c", "f", "i", "n"], dtype=object),
        }
        assert_array_dicts_equal(skipped, expected)

        _colon_reader(on_bad_lines=1).read()  # Warn
        stderr_text = capsys.readouterr().err

        assert "Skipping line 4" in stderr_text
        assert "Skipping line 6" in stderr_text
 | 
						|
 | 
						|
    def test_header_not_enough_lines(self):
        """With header=2 the third line supplies the names; later rows are data."""
        data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"

        text_reader = TextReader(StringIO(data), delimiter=",", header=2)
        assert text_reader.header == [["a", "b", "c"]]

        records = text_reader.read()
        want = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array([2, 5], dtype=np.int64),
            2: np.array([3, 6], dtype=np.int64),
        }
        assert_array_dicts_equal(records, want)
 | 
						|
 | 
						|
    def test_escapechar(self):
        """A backslash-escaped opening quote is kept as a literal character."""
        data = '\\"hello world"\n\\"hello world"\n\\"hello world"'

        options = {"delimiter": ",", "header": None, "escapechar": "\\"}
        parsed = TextReader(StringIO(data), **options).read()

        want = {0: np.array(['"hello world"'] * 3, dtype=object)}
        assert_array_dicts_equal(parsed, want)
 | 
						|
 | 
						|
    def test_eof_has_eol(self):
 | 
						|
        # handling of new line at EOF
 | 
						|
        pass
 | 
						|
 | 
						|
    def test_na_substitution(self):
 | 
						|
        pass
 | 
						|
 | 
						|
    def test_numpy_string_dtype(self):
 | 
						|
        data = """\
 | 
						|
a,1
 | 
						|
aa,2
 | 
						|
aaa,3
 | 
						|
aaaa,4
 | 
						|
aaaaa,5"""
 | 
						|
 | 
						|
        def _make_reader(**kwds):
 | 
						|
            if "dtype" in kwds:
 | 
						|
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
 | 
						|
            return TextReader(StringIO(data), delimiter=",", header=None, **kwds)
 | 
						|
 | 
						|
        reader = _make_reader(dtype="S5,i4")
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        assert result[0].dtype == "S5"
 | 
						|
 | 
						|
        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
 | 
						|
        assert (result[0] == ex_values).all()
 | 
						|
        assert result[1].dtype == "i4"
 | 
						|
 | 
						|
        reader = _make_reader(dtype="S4")
 | 
						|
        result = reader.read()
 | 
						|
        assert result[0].dtype == "S4"
 | 
						|
        ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
 | 
						|
        assert (result[0] == ex_values).all()
 | 
						|
        assert result[1].dtype == "S4"
 | 
						|
 | 
						|
    def test_pass_dtype(self):
 | 
						|
        data = """\
 | 
						|
one,two
 | 
						|
1,a
 | 
						|
2,b
 | 
						|
3,c
 | 
						|
4,d"""
 | 
						|
 | 
						|
        def _make_reader(**kwds):
 | 
						|
            if "dtype" in kwds:
 | 
						|
                kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
 | 
						|
            return TextReader(StringIO(data), delimiter=",", **kwds)
 | 
						|
 | 
						|
        reader = _make_reader(dtype={"one": "u1", 1: "S1"})
 | 
						|
        result = reader.read()
 | 
						|
        assert result[0].dtype == "u1"
 | 
						|
        assert result[1].dtype == "S1"
 | 
						|
 | 
						|
        reader = _make_reader(dtype={"one": np.uint8, 1: object})
 | 
						|
        result = reader.read()
 | 
						|
        assert result[0].dtype == "u1"
 | 
						|
        assert result[1].dtype == "O"
 | 
						|
 | 
						|
        reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
 | 
						|
        result = reader.read()
 | 
						|
        assert result[0].dtype == "u1"
 | 
						|
        assert result[1].dtype == "O"
 | 
						|
 | 
						|
    def test_usecols(self):
 | 
						|
        data = """\
 | 
						|
a,b,c
 | 
						|
1,2,3
 | 
						|
4,5,6
 | 
						|
7,8,9
 | 
						|
10,11,12"""
 | 
						|
 | 
						|
        def _make_reader(**kwds):
 | 
						|
            return TextReader(StringIO(data), delimiter=",", **kwds)
 | 
						|
 | 
						|
        reader = _make_reader(usecols=(1, 2))
 | 
						|
        result = reader.read()
 | 
						|
 | 
						|
        exp = _make_reader().read()
 | 
						|
        assert len(result) == 2
 | 
						|
        assert (result[1] == exp[1]).all()
 | 
						|
        assert (result[2] == exp[2]).all()
 | 
						|
 | 
						|
    def test_cr_delimited(self):
        """Text with bare "\\r" line endings parses the same as its "\\r\\n" form."""

        def _check(text, **kwargs):
            # Parse the CR-only text and its CRLF equivalent; both must agree.
            crlf_text = text.replace("\r", "\r\n")
            got = TextReader(StringIO(text), **kwargs).read()
            want = TextReader(StringIO(crlf_text), **kwargs).read()
            assert_array_dicts_equal(got, want)

        sample = (
            "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
            "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
            ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
        )

        cases = [
            ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
            (
                "a  b  c\r1  2  3\r4  5  6\r7  8  9\r10  11  12",
                {"delim_whitespace": True},
            ),
            ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
            (sample, {"delimiter": ","}),
            ("A  B  C\r  2  3\r4  5  6", {"delim_whitespace": True}),
            ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
        ]

        for text, options in cases:
            _check(text, **options)
 | 
						|
 | 
						|
    def test_empty_field_eof(self):
        """
        Empty trailing fields at EOF come back as empty strings, and
        ``read_csv`` with the C engine gives the expected frames for short
        or ragged input on each of 100 repeated reads (GH5664).
        """
        data = "a,b,c\n1,2,3\n4,,"

        parsed = TextReader(StringIO(data), delimiter=",").read()

        want = {
            0: np.array([1, 4], dtype=np.int64),
            1: np.array(["2", ""], dtype=object),
            2: np.array(["3", ""], dtype=object),
        }
        assert_array_dicts_equal(parsed, want)

        # GH5664
        single_column = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
        repeated_rows = DataFrame(
            [[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1]
        )
        ragged_rows = DataFrame(
            [
                [1, 2, 3, 4],
                [6, np.nan, np.nan, np.nan],
                [8, 9, 10, 11],
                [13, 14, np.nan, np.nan],
            ],
            columns=list("abcd"),
            index=[0, 5, 7, 12],
        )

        # Two identical rows followed by two blank lines.
        repeated_text = "1,1,1,1,0\n" * 2 + "\n" * 2

        for _ in range(100):
            frame = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
            tm.assert_frame_equal(frame, single_column)

            frame = read_csv(StringIO(repeated_text), names=list("abcd"), engine="c")
            tm.assert_frame_equal(frame, repeated_rows)

            frame = read_csv(
                StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
                names=list("abcd"),
                engine="c",
            )
            tm.assert_frame_equal(frame, ragged_rows)
 | 
						|
 | 
						|
    def test_empty_csv_input(self):
 | 
						|
        # GH14867
 | 
						|
        with read_csv(
 | 
						|
            StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
 | 
						|
        ) as df:
 | 
						|
            assert isinstance(df, TextFileReader)
 | 
						|
 | 
						|
 | 
						|
def assert_array_dicts_equal(left, right):
    """
    Assert that two mappings hold equal arrays under the same keys.

    Parameters
    ----------
    left, right : dict
        Mappings from key to array-like. Values are passed through
        ``np.asarray`` before being compared.

    Raises
    ------
    AssertionError
        If the key sets differ, or if the arrays under any key differ
        according to ``tm.assert_numpy_array_equal``.
    """
    # Iterating only over ``left`` would let extra keys in ``right`` (or an
    # empty ``left``) pass unnoticed, so compare the key sets up front.
    left_keys, right_keys = set(left), set(right)
    if left_keys != right_keys:
        raise AssertionError(
            f"dict keys differ: left has {left_keys!r}, right has {right_keys!r}"
        )

    for key, left_values in left.items():
        tm.assert_numpy_array_equal(np.asarray(left_values), np.asarray(right[key]))
 |