1587 lines
		
	
	
		
			56 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			1587 lines
		
	
	
		
			56 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from datetime import (
 | 
						|
    datetime,
 | 
						|
    time,
 | 
						|
)
 | 
						|
from functools import partial
 | 
						|
import os
 | 
						|
from pathlib import Path
 | 
						|
from urllib.error import URLError
 | 
						|
from zipfile import BadZipFile
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
import pandas.util._test_decorators as td
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    DataFrame,
 | 
						|
    Index,
 | 
						|
    MultiIndex,
 | 
						|
    Series,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
from pandas.tests.io.excel import xlrd_version
 | 
						|
from pandas.util.version import Version
 | 
						|
 | 
						|
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
 | 
						|
engine_params = [
 | 
						|
    # Add any engines to test here
 | 
						|
    # When defusedxml is installed it triggers deprecation warnings for
 | 
						|
    # xlrd and openpyxl, so catch those here
 | 
						|
    pytest.param(
 | 
						|
        "xlrd",
 | 
						|
        marks=[
 | 
						|
            td.skip_if_no("xlrd"),
 | 
						|
        ],
 | 
						|
    ),
 | 
						|
    pytest.param(
 | 
						|
        "openpyxl",
 | 
						|
        marks=[
 | 
						|
            td.skip_if_no("openpyxl"),
 | 
						|
            pytest.mark.filterwarnings("ignore:.*html argument"),
 | 
						|
        ],
 | 
						|
    ),
 | 
						|
    pytest.param(
 | 
						|
        None,
 | 
						|
        marks=[
 | 
						|
            td.skip_if_no("xlrd"),
 | 
						|
        ],
 | 
						|
    ),
 | 
						|
    pytest.param("pyxlsb", marks=td.skip_if_no("pyxlsb")),
 | 
						|
    pytest.param("odf", marks=td.skip_if_no("odf")),
 | 
						|
]
 | 
						|
 | 
						|
 | 
						|
def _is_valid_engine_ext_pair(engine, read_ext: str) -> bool:
 | 
						|
    """
 | 
						|
    Filter out invalid (engine, ext) pairs instead of skipping, as that
 | 
						|
    produces 500+ pytest.skips.
 | 
						|
    """
 | 
						|
    engine = engine.values[0]
 | 
						|
    if engine == "openpyxl" and read_ext == ".xls":
 | 
						|
        return False
 | 
						|
    if engine == "odf" and read_ext != ".ods":
 | 
						|
        return False
 | 
						|
    if read_ext == ".ods" and engine != "odf":
 | 
						|
        return False
 | 
						|
    if engine == "pyxlsb" and read_ext != ".xlsb":
 | 
						|
        return False
 | 
						|
    if read_ext == ".xlsb" and engine != "pyxlsb":
 | 
						|
        return False
 | 
						|
    if (
 | 
						|
        engine == "xlrd"
 | 
						|
        and xlrd_version is not None
 | 
						|
        and xlrd_version >= Version("2")
 | 
						|
        and read_ext != ".xls"
 | 
						|
    ):
 | 
						|
        return False
 | 
						|
    return True
 | 
						|
 | 
						|
 | 
						|
def _transfer_marks(engine, read_ext):
 | 
						|
    """
 | 
						|
    engine gives us a pytest.param object with some marks, read_ext is just
 | 
						|
    a string.  We need to generate a new pytest.param inheriting the marks.
 | 
						|
    """
 | 
						|
    values = engine.values + (read_ext,)
 | 
						|
    new_param = pytest.param(values, marks=engine.marks)
 | 
						|
    return new_param
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture(
 | 
						|
    params=[
 | 
						|
        _transfer_marks(eng, ext)
 | 
						|
        for eng in engine_params
 | 
						|
        for ext in read_ext_params
 | 
						|
        if _is_valid_engine_ext_pair(eng, ext)
 | 
						|
    ],
 | 
						|
)
 | 
						|
def engine_and_read_ext(request):
 | 
						|
    """
 | 
						|
    Fixture for Excel reader engine and read_ext, only including valid pairs.
 | 
						|
    """
 | 
						|
    return request.param
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def engine(engine_and_read_ext):
 | 
						|
    engine, read_ext = engine_and_read_ext
 | 
						|
    return engine
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def read_ext(engine_and_read_ext):
 | 
						|
    engine, read_ext = engine_and_read_ext
 | 
						|
    return read_ext
 | 
						|
 | 
						|
 | 
						|
class TestReaders:
 | 
						|
    @pytest.fixture(autouse=True)
 | 
						|
    def cd_and_set_engine(self, engine, datapath, monkeypatch):
 | 
						|
        """
 | 
						|
        Change directory and set engine for read_excel calls.
 | 
						|
        """
 | 
						|
        func = partial(pd.read_excel, engine=engine)
 | 
						|
        monkeypatch.chdir(datapath("io", "data", "excel"))
 | 
						|
        monkeypatch.setattr(pd, "read_excel", func)
 | 
						|
 | 
						|
    def test_engine_used(self, read_ext, engine, monkeypatch):
 | 
						|
        # GH 38884
 | 
						|
        def parser(self, *args, **kwargs):
 | 
						|
            return self.engine
 | 
						|
 | 
						|
        monkeypatch.setattr(pd.ExcelFile, "parse", parser)
 | 
						|
 | 
						|
        expected_defaults = {
 | 
						|
            "xlsx": "openpyxl",
 | 
						|
            "xlsm": "openpyxl",
 | 
						|
            "xlsb": "pyxlsb",
 | 
						|
            "xls": "xlrd",
 | 
						|
            "ods": "odf",
 | 
						|
        }
 | 
						|
 | 
						|
        with open("test1" + read_ext, "rb") as f:
 | 
						|
            result = pd.read_excel(f)
 | 
						|
 | 
						|
        if engine is not None:
 | 
						|
            expected = engine
 | 
						|
        else:
 | 
						|
            expected = expected_defaults[read_ext[1:]]
 | 
						|
        assert result == expected
 | 
						|
 | 
						|
    def test_usecols_int(self, read_ext):
 | 
						|
        # usecols as int
 | 
						|
        msg = "Passing an integer for `usecols`"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel(
 | 
						|
                "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
 | 
						|
            )
 | 
						|
 | 
						|
        # usecols as int
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel(
 | 
						|
                "test1" + read_ext,
 | 
						|
                sheet_name="Sheet2",
 | 
						|
                skiprows=[1],
 | 
						|
                index_col=0,
 | 
						|
                usecols=3,
 | 
						|
            )
 | 
						|
 | 
						|
    def test_usecols_list(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        df_ref = df_ref.reindex(columns=["B", "C"])
 | 
						|
        df1 = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=[0, 2, 3]
 | 
						|
        )
 | 
						|
        df2 = pd.read_excel(
 | 
						|
            "test1" + read_ext,
 | 
						|
            sheet_name="Sheet2",
 | 
						|
            skiprows=[1],
 | 
						|
            index_col=0,
 | 
						|
            usecols=[0, 2, 3],
 | 
						|
        )
 | 
						|
 | 
						|
        # TODO add index to xls file)
 | 
						|
        tm.assert_frame_equal(df1, df_ref, check_names=False)
 | 
						|
        tm.assert_frame_equal(df2, df_ref, check_names=False)
 | 
						|
 | 
						|
    def test_usecols_str(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        df1 = df_ref.reindex(columns=["A", "B", "C"])
 | 
						|
        df2 = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A:D"
 | 
						|
        )
 | 
						|
        df3 = pd.read_excel(
 | 
						|
            "test1" + read_ext,
 | 
						|
            sheet_name="Sheet2",
 | 
						|
            skiprows=[1],
 | 
						|
            index_col=0,
 | 
						|
            usecols="A:D",
 | 
						|
        )
 | 
						|
 | 
						|
        # TODO add index to xls, read xls ignores index name ?
 | 
						|
        tm.assert_frame_equal(df2, df1, check_names=False)
 | 
						|
        tm.assert_frame_equal(df3, df1, check_names=False)
 | 
						|
 | 
						|
        df1 = df_ref.reindex(columns=["B", "C"])
 | 
						|
        df2 = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C,D"
 | 
						|
        )
 | 
						|
        df3 = pd.read_excel(
 | 
						|
            "test1" + read_ext,
 | 
						|
            sheet_name="Sheet2",
 | 
						|
            skiprows=[1],
 | 
						|
            index_col=0,
 | 
						|
            usecols="A,C,D",
 | 
						|
        )
 | 
						|
        # TODO add index to xls file
 | 
						|
        tm.assert_frame_equal(df2, df1, check_names=False)
 | 
						|
        tm.assert_frame_equal(df3, df1, check_names=False)
 | 
						|
 | 
						|
        df1 = df_ref.reindex(columns=["B", "C"])
 | 
						|
        df2 = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,C:D"
 | 
						|
        )
 | 
						|
        df3 = pd.read_excel(
 | 
						|
            "test1" + read_ext,
 | 
						|
            sheet_name="Sheet2",
 | 
						|
            skiprows=[1],
 | 
						|
            index_col=0,
 | 
						|
            usecols="A,C:D",
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(df2, df1, check_names=False)
 | 
						|
        tm.assert_frame_equal(df3, df1, check_names=False)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "usecols", [[0, 1, 3], [0, 3, 1], [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0]]
 | 
						|
    )
 | 
						|
    def test_usecols_diff_positional_int_columns_order(
 | 
						|
        self, request, read_ext, usecols, df_ref
 | 
						|
    ):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        expected = df_ref[["A", "C"]]
 | 
						|
        result = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=usecols
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected, check_names=False)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("usecols", [["B", "D"], ["D", "B"]])
 | 
						|
    def test_usecols_diff_positional_str_columns_order(self, read_ext, usecols, df_ref):
 | 
						|
        expected = df_ref[["B", "D"]]
 | 
						|
        expected.index = range(len(expected))
 | 
						|
 | 
						|
        result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols=usecols)
 | 
						|
        tm.assert_frame_equal(result, expected, check_names=False)
 | 
						|
 | 
						|
    def test_read_excel_without_slicing(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        expected = df_ref
 | 
						|
        result = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
 | 
						|
        tm.assert_frame_equal(result, expected, check_names=False)
 | 
						|
 | 
						|
    def test_usecols_excel_range_str(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        expected = df_ref[["C", "D"]]
 | 
						|
        result = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols="A,D:E"
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected, check_names=False)
 | 
						|
 | 
						|
    def test_usecols_excel_range_str_invalid(self, read_ext):
 | 
						|
        msg = "Invalid column name: E1"
 | 
						|
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel("test1" + read_ext, sheet_name="Sheet1", usecols="D:E1")
 | 
						|
 | 
						|
    def test_index_col_label_error(self, read_ext):
 | 
						|
        msg = "list indices must be integers.*, not str"
 | 
						|
 | 
						|
        with pytest.raises(TypeError, match=msg):
 | 
						|
            pd.read_excel(
 | 
						|
                "test1" + read_ext,
 | 
						|
                sheet_name="Sheet1",
 | 
						|
                index_col=["A"],
 | 
						|
                usecols=["A", "C"],
 | 
						|
            )
 | 
						|
 | 
						|
    def test_index_col_empty(self, read_ext):
 | 
						|
        # see gh-9208
 | 
						|
        result = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet3", index_col=["A", "B", "C"]
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            columns=["D", "E", "F"],
 | 
						|
            index=MultiIndex(levels=[[]] * 3, codes=[[]] * 3, names=["A", "B", "C"]),
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("index_col", [None, 2])
 | 
						|
    def test_index_col_with_unnamed(self, read_ext, index_col):
 | 
						|
        # see gh-18792
 | 
						|
        result = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet4", index_col=index_col
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            [["i1", "a", "x"], ["i2", "b", "y"]], columns=["Unnamed: 0", "col1", "col2"]
 | 
						|
        )
 | 
						|
        if index_col:
 | 
						|
            expected = expected.set_index(expected.columns[index_col])
 | 
						|
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_usecols_pass_non_existent_column(self, read_ext):
 | 
						|
        msg = (
 | 
						|
            "Usecols do not match columns, "
 | 
						|
            "columns expected but not found: " + r"\['E'\]"
 | 
						|
        )
 | 
						|
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel("test1" + read_ext, usecols=["E"])
 | 
						|
 | 
						|
    def test_usecols_wrong_type(self, read_ext):
 | 
						|
        msg = (
 | 
						|
            "'usecols' must either be list-like of "
 | 
						|
            "all strings, all unicode, all integers or a callable."
 | 
						|
        )
 | 
						|
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel("test1" + read_ext, usecols=["E1", 0])
 | 
						|
 | 
						|
    def test_excel_stop_iterator(self, read_ext):
 | 
						|
 | 
						|
        parsed = pd.read_excel("test2" + read_ext, sheet_name="Sheet1")
 | 
						|
        expected = DataFrame([["aaaa", "bbbbb"]], columns=["Test", "Test1"])
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
    def test_excel_cell_error_na(self, request, read_ext):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        parsed = pd.read_excel("test3" + read_ext, sheet_name="Sheet1")
 | 
						|
        expected = DataFrame([[np.nan]], columns=["Test"])
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
    def test_excel_table(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        df1 = pd.read_excel("test1" + read_ext, sheet_name="Sheet1", index_col=0)
 | 
						|
        df2 = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet2", skiprows=[1], index_col=0
 | 
						|
        )
 | 
						|
        # TODO add index to file
 | 
						|
        tm.assert_frame_equal(df1, df_ref, check_names=False)
 | 
						|
        tm.assert_frame_equal(df2, df_ref, check_names=False)
 | 
						|
 | 
						|
        df3 = pd.read_excel(
 | 
						|
            "test1" + read_ext, sheet_name="Sheet1", index_col=0, skipfooter=1
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(df3, df1.iloc[:-1])
 | 
						|
 | 
						|
    def test_reader_special_dtypes(self, request, read_ext):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        expected = DataFrame.from_dict(
 | 
						|
            {
 | 
						|
                "IntCol": [1, 2, -3, 4, 0],
 | 
						|
                "FloatCol": [1.25, 2.25, 1.83, 1.92, 0.0000000005],
 | 
						|
                "BoolCol": [True, False, True, True, False],
 | 
						|
                "StrCol": [1, 2, 3, 4, 5],
 | 
						|
                # GH5394 - this is why convert_float isn't vectorized
 | 
						|
                "Str2Col": ["a", 3, "c", "d", "e"],
 | 
						|
                "DateCol": [
 | 
						|
                    datetime(2013, 10, 30),
 | 
						|
                    datetime(2013, 10, 31),
 | 
						|
                    datetime(1905, 1, 1),
 | 
						|
                    datetime(2013, 12, 14),
 | 
						|
                    datetime(2015, 3, 14),
 | 
						|
                ],
 | 
						|
            },
 | 
						|
        )
 | 
						|
        basename = "test_types"
 | 
						|
 | 
						|
        # should read in correctly and infer types
 | 
						|
        actual = pd.read_excel(basename + read_ext, sheet_name="Sheet1")
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # if not coercing number, then int comes in as float
 | 
						|
        float_expected = expected.copy()
 | 
						|
        float_expected["IntCol"] = float_expected["IntCol"].astype(float)
 | 
						|
        float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
 | 
						|
        with tm.assert_produces_warning(
 | 
						|
            FutureWarning,
 | 
						|
            match="convert_float is deprecated",
 | 
						|
            raise_on_extra_warnings=False,
 | 
						|
        ):
 | 
						|
            # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
 | 
						|
            # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
 | 
						|
            # See GH#41176
 | 
						|
            actual = pd.read_excel(
 | 
						|
                basename + read_ext, sheet_name="Sheet1", convert_float=False
 | 
						|
            )
 | 
						|
        tm.assert_frame_equal(actual, float_expected)
 | 
						|
 | 
						|
        # check setting Index (assuming xls and xlsx are the same here)
 | 
						|
        for icol, name in enumerate(expected.columns):
 | 
						|
            actual = pd.read_excel(
 | 
						|
                basename + read_ext, sheet_name="Sheet1", index_col=icol
 | 
						|
            )
 | 
						|
            exp = expected.set_index(name)
 | 
						|
            tm.assert_frame_equal(actual, exp)
 | 
						|
 | 
						|
        # convert_float and converters should be different but both accepted
 | 
						|
        expected["StrCol"] = expected["StrCol"].apply(str)
 | 
						|
        actual = pd.read_excel(
 | 
						|
            basename + read_ext, sheet_name="Sheet1", converters={"StrCol": str}
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        no_convert_float = float_expected.copy()
 | 
						|
        no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
 | 
						|
        with tm.assert_produces_warning(
 | 
						|
            FutureWarning,
 | 
						|
            match="convert_float is deprecated",
 | 
						|
            raise_on_extra_warnings=False,
 | 
						|
        ):
 | 
						|
            # raise_on_extra_warnings because xlrd raises a PendingDeprecationWarning
 | 
						|
            # on database job Linux_py37_IO (ci/deps/actions-37-db.yaml)
 | 
						|
            # See GH#41176
 | 
						|
            actual = pd.read_excel(
 | 
						|
                basename + read_ext,
 | 
						|
                sheet_name="Sheet1",
 | 
						|
                convert_float=False,
 | 
						|
                converters={"StrCol": str},
 | 
						|
            )
 | 
						|
        tm.assert_frame_equal(actual, no_convert_float)
 | 
						|
 | 
						|
    # GH8212 - support for converters and missing values
 | 
						|
    def test_reader_converters(self, read_ext):
 | 
						|
 | 
						|
        basename = "test_converters"
 | 
						|
 | 
						|
        expected = DataFrame.from_dict(
 | 
						|
            {
 | 
						|
                "IntCol": [1, 2, -3, -1000, 0],
 | 
						|
                "FloatCol": [12.5, np.nan, 18.3, 19.2, 0.000000005],
 | 
						|
                "BoolCol": ["Found", "Found", "Found", "Not found", "Found"],
 | 
						|
                "StrCol": ["1", np.nan, "3", "4", "5"],
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        converters = {
 | 
						|
            "IntCol": lambda x: int(x) if x != "" else -1000,
 | 
						|
            "FloatCol": lambda x: 10 * x if x else np.nan,
 | 
						|
            2: lambda x: "Found" if x != "" else "Not found",
 | 
						|
            3: lambda x: str(x) if x else "",
 | 
						|
        }
 | 
						|
 | 
						|
        # should read in correctly and set types of single cells (not array
 | 
						|
        # dtypes)
 | 
						|
        actual = pd.read_excel(
 | 
						|
            basename + read_ext, sheet_name="Sheet1", converters=converters
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_reader_dtype(self, read_ext):
 | 
						|
        # GH 8212
 | 
						|
        basename = "testdtype"
 | 
						|
        actual = pd.read_excel(basename + read_ext)
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "a": [1, 2, 3, 4],
 | 
						|
                "b": [2.5, 3.5, 4.5, 5.5],
 | 
						|
                "c": [1, 2, 3, 4],
 | 
						|
                "d": [1.0, 2.0, np.nan, 4.0],
 | 
						|
            }
 | 
						|
        ).reindex(columns=["a", "b", "c", "d"])
 | 
						|
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            basename + read_ext, dtype={"a": "float64", "b": "float32", "c": str}
 | 
						|
        )
 | 
						|
 | 
						|
        expected["a"] = expected["a"].astype("float64")
 | 
						|
        expected["b"] = expected["b"].astype("float32")
 | 
						|
        expected["c"] = ["001", "002", "003", "004"]
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        msg = "Unable to convert column d to type int64"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel(basename + read_ext, dtype={"d": "int64"})
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "dtype,expected",
 | 
						|
        [
 | 
						|
            (
 | 
						|
                None,
 | 
						|
                DataFrame(
 | 
						|
                    {
 | 
						|
                        "a": [1, 2, 3, 4],
 | 
						|
                        "b": [2.5, 3.5, 4.5, 5.5],
 | 
						|
                        "c": [1, 2, 3, 4],
 | 
						|
                        "d": [1.0, 2.0, np.nan, 4.0],
 | 
						|
                    }
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                {"a": "float64", "b": "float32", "c": str, "d": str},
 | 
						|
                DataFrame(
 | 
						|
                    {
 | 
						|
                        "a": Series([1, 2, 3, 4], dtype="float64"),
 | 
						|
                        "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
 | 
						|
                        "c": ["001", "002", "003", "004"],
 | 
						|
                        "d": ["1", "2", np.nan, "4"],
 | 
						|
                    }
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_reader_dtype_str(self, read_ext, dtype, expected):
 | 
						|
        # see gh-20377
 | 
						|
        basename = "testdtype"
 | 
						|
 | 
						|
        actual = pd.read_excel(basename + read_ext, dtype=dtype)
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)])
 | 
						|
    def test_dtype_mangle_dup_cols(self, read_ext, dtypes, exp_value):
 | 
						|
        # GH#35211
 | 
						|
        basename = "df_mangle_dup_col_dtypes"
 | 
						|
        dtype_dict = {"a": str, **dtypes}
 | 
						|
        dtype_dict_copy = dtype_dict.copy()
 | 
						|
        # GH#42462
 | 
						|
        result = pd.read_excel(basename + read_ext, dtype=dtype_dict)
 | 
						|
        expected = DataFrame({"a": ["1"], "a.1": [exp_value]})
 | 
						|
        assert dtype_dict == dtype_dict_copy, "dtype dict changed"
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_reader_spaces(self, read_ext):
 | 
						|
        # see gh-32207
 | 
						|
        basename = "test_spaces"
 | 
						|
 | 
						|
        actual = pd.read_excel(basename + read_ext)
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "testcol": [
 | 
						|
                    "this is great",
 | 
						|
                    "4    spaces",
 | 
						|
                    "1 trailing ",
 | 
						|
                    " 1 leading",
 | 
						|
                    "2  spaces  multiple  times",
 | 
						|
                ]
 | 
						|
            }
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    # gh-36122, gh-35802
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "basename,expected",
 | 
						|
        [
 | 
						|
            ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})),
 | 
						|
            ("gh-36122", DataFrame(columns=["got 2nd sa"])),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_read_excel_ods_nested_xml(self, engine, read_ext, basename, expected):
 | 
						|
        # see gh-35802
 | 
						|
        if engine != "odf":
 | 
						|
            pytest.skip(f"Skipped for engine: {engine}")
 | 
						|
 | 
						|
        actual = pd.read_excel(basename + read_ext)
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_reading_all_sheets(self, read_ext):
 | 
						|
        # Test reading all sheet names by setting sheet_name to None,
 | 
						|
        # Ensure a dict is returned.
 | 
						|
        # See PR #9450
 | 
						|
        basename = "test_multisheet"
 | 
						|
        dfs = pd.read_excel(basename + read_ext, sheet_name=None)
 | 
						|
        # ensure this is not alphabetical to test order preservation
 | 
						|
        expected_keys = ["Charlie", "Alpha", "Beta"]
 | 
						|
        tm.assert_contains_all(expected_keys, dfs.keys())
 | 
						|
        # Issue 9930
 | 
						|
        # Ensure sheet order is preserved
 | 
						|
        assert expected_keys == list(dfs.keys())
 | 
						|
 | 
						|
    def test_reading_multiple_specific_sheets(self, read_ext):
 | 
						|
        # Test reading specific sheet names by specifying a mixed list
 | 
						|
        # of integers and strings, and confirm that duplicated sheet
 | 
						|
        # references (positions/names) are removed properly.
 | 
						|
        # Ensure a dict is returned
 | 
						|
        # See PR #9450
 | 
						|
        basename = "test_multisheet"
 | 
						|
        # Explicitly request duplicates. Only the set should be returned.
 | 
						|
        expected_keys = [2, "Charlie", "Charlie"]
 | 
						|
        dfs = pd.read_excel(basename + read_ext, sheet_name=expected_keys)
 | 
						|
        expected_keys = list(set(expected_keys))
 | 
						|
        tm.assert_contains_all(expected_keys, dfs.keys())
 | 
						|
        assert len(expected_keys) == len(dfs.keys())
 | 
						|
 | 
						|
    def test_reading_all_sheets_with_blank(self, read_ext):
 | 
						|
        # Test reading all sheet names by setting sheet_name to None,
 | 
						|
        # In the case where some sheets are blank.
 | 
						|
        # Issue #11711
 | 
						|
        basename = "blank_with_header"
 | 
						|
        dfs = pd.read_excel(basename + read_ext, sheet_name=None)
 | 
						|
        expected_keys = ["Sheet1", "Sheet2", "Sheet3"]
 | 
						|
        tm.assert_contains_all(expected_keys, dfs.keys())
 | 
						|
 | 
						|
    # GH6403
 | 
						|
    def test_read_excel_blank(self, read_ext):
 | 
						|
        actual = pd.read_excel("blank" + read_ext, sheet_name="Sheet1")
 | 
						|
        tm.assert_frame_equal(actual, DataFrame())
 | 
						|
 | 
						|
    def test_read_excel_blank_with_header(self, read_ext):
 | 
						|
        expected = DataFrame(columns=["col_1", "col_2"])
 | 
						|
        actual = pd.read_excel("blank_with_header" + read_ext, sheet_name="Sheet1")
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_date_conversion_overflow(self, request, engine, read_ext):
 | 
						|
        # GH 10001 : pandas.ExcelFile ignore parse_dates=False
 | 
						|
        if engine == "pyxlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                [pd.Timestamp("2016-03-12"), "Marc Johnson"],
 | 
						|
                [pd.Timestamp("2016-03-16"), "Jack Black"],
 | 
						|
                [1e20, "Timothy Brown"],
 | 
						|
            ],
 | 
						|
            columns=["DateColWithBigInt", "StringCol"],
 | 
						|
        )
 | 
						|
 | 
						|
        if engine == "openpyxl":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(reason="Maybe not supported by openpyxl")
 | 
						|
            )
 | 
						|
 | 
						|
        if engine is None and read_ext in (".xlsx", ".xlsm"):
 | 
						|
            # GH 35029
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(reason="Defaults to openpyxl, maybe not supported")
 | 
						|
            )
 | 
						|
 | 
						|
        result = pd.read_excel("testdateoverflow" + read_ext)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_sheet_name(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
        filename = "test1"
 | 
						|
        sheet_name = "Sheet1"
 | 
						|
 | 
						|
        df1 = pd.read_excel(
 | 
						|
            filename + read_ext, sheet_name=sheet_name, index_col=0
 | 
						|
        )  # doc
 | 
						|
        df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
 | 
						|
 | 
						|
        tm.assert_frame_equal(df1, df_ref, check_names=False)
 | 
						|
        tm.assert_frame_equal(df2, df_ref, check_names=False)
 | 
						|
 | 
						|
    def test_excel_read_buffer(self, read_ext):
 | 
						|
 | 
						|
        pth = "test1" + read_ext
 | 
						|
        expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0)
 | 
						|
        with open(pth, "rb") as f:
 | 
						|
            actual = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
 | 
						|
            tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    def test_bad_engine_raises(self, read_ext):
 | 
						|
        bad_engine = "foo"
 | 
						|
        with pytest.raises(ValueError, match="Unknown engine: foo"):
 | 
						|
            pd.read_excel("", engine=bad_engine)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "sheet_name",
 | 
						|
        [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
 | 
						|
    )
 | 
						|
    def test_bad_sheetname_raises(self, read_ext, sheet_name):
 | 
						|
        # GH 39250
 | 
						|
        msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel("blank" + read_ext, sheet_name=sheet_name)
 | 
						|
 | 
						|
    def test_missing_file_raises(self, read_ext):
 | 
						|
        bad_file = f"foo{read_ext}"
 | 
						|
        # CI tests with other languages, translates to "No such file or directory"
 | 
						|
        match = r"(No such file or directory|没有那个文件或目录|File o directory non esistente)"
 | 
						|
        with pytest.raises(FileNotFoundError, match=match):
 | 
						|
            pd.read_excel(bad_file)
 | 
						|
 | 
						|
    def test_corrupt_bytes_raises(self, read_ext, engine):
 | 
						|
        bad_stream = b"foo"
 | 
						|
        if engine is None:
 | 
						|
            error = ValueError
 | 
						|
            msg = (
 | 
						|
                "Excel file format cannot be determined, you must "
 | 
						|
                "specify an engine manually."
 | 
						|
            )
 | 
						|
        elif engine == "xlrd":
 | 
						|
            from xlrd import XLRDError
 | 
						|
 | 
						|
            error = XLRDError
 | 
						|
            msg = (
 | 
						|
                "Unsupported format, or corrupt file: Expected BOF "
 | 
						|
                "record; found b'foo'"
 | 
						|
            )
 | 
						|
        else:
 | 
						|
            error = BadZipFile
 | 
						|
            msg = "File is not a zip file"
 | 
						|
        with pytest.raises(error, match=msg):
 | 
						|
            pd.read_excel(bad_stream)
 | 
						|
 | 
						|
    @pytest.mark.network
 | 
						|
    @tm.network(
 | 
						|
        url=(
 | 
						|
            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
 | 
						|
            "pandas/tests/io/data/excel/test1.xlsx"
 | 
						|
        ),
 | 
						|
        check_before_test=True,
 | 
						|
    )
 | 
						|
    def test_read_from_http_url(self, read_ext):
 | 
						|
        url = (
 | 
						|
            "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
 | 
						|
            "pandas/tests/io/data/excel/test1" + read_ext
 | 
						|
        )
 | 
						|
        url_table = pd.read_excel(url)
 | 
						|
        local_table = pd.read_excel("test1" + read_ext)
 | 
						|
        tm.assert_frame_equal(url_table, local_table)
 | 
						|
 | 
						|
    @td.skip_if_not_us_locale
 | 
						|
    @pytest.mark.single_cpu
 | 
						|
    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
 | 
						|
        # Bucket "pandas-test" created in tests/io/conftest.py
 | 
						|
        with open("test1" + read_ext, "rb") as f:
 | 
						|
            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
 | 
						|
 | 
						|
        url = "s3://pandas-test/test1" + read_ext
 | 
						|
 | 
						|
        url_table = pd.read_excel(url, storage_options=s3so)
 | 
						|
        local_table = pd.read_excel("test1" + read_ext)
 | 
						|
        tm.assert_frame_equal(url_table, local_table)
 | 
						|
 | 
						|
    @pytest.mark.single_cpu
 | 
						|
    def test_read_from_s3_object(self, read_ext, s3_resource, s3so):
 | 
						|
        # GH 38788
 | 
						|
        # Bucket "pandas-test" created in tests/io/conftest.py
 | 
						|
        with open("test1" + read_ext, "rb") as f:
 | 
						|
            s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
 | 
						|
 | 
						|
        import s3fs
 | 
						|
 | 
						|
        s3 = s3fs.S3FileSystem(**s3so)
 | 
						|
 | 
						|
        with s3.open("s3://pandas-test/test1" + read_ext) as f:
 | 
						|
            url_table = pd.read_excel(f)
 | 
						|
 | 
						|
        local_table = pd.read_excel("test1" + read_ext)
 | 
						|
        tm.assert_frame_equal(url_table, local_table)
 | 
						|
 | 
						|
    @pytest.mark.slow
 | 
						|
    def test_read_from_file_url(self, read_ext, datapath):
 | 
						|
 | 
						|
        # FILE
 | 
						|
        localtable = os.path.join(datapath("io", "data", "excel"), "test1" + read_ext)
 | 
						|
        local_table = pd.read_excel(localtable)
 | 
						|
 | 
						|
        try:
 | 
						|
            url_table = pd.read_excel("file://localhost/" + localtable)
 | 
						|
        except URLError:
 | 
						|
            # fails on some systems
 | 
						|
            import platform
 | 
						|
 | 
						|
            platform_info = " ".join(platform.uname()).strip()
 | 
						|
            pytest.skip(f"failing on {platform_info}")
 | 
						|
 | 
						|
        tm.assert_frame_equal(url_table, local_table)
 | 
						|
 | 
						|
    def test_read_from_pathlib_path(self, read_ext):
 | 
						|
 | 
						|
        # GH12655
 | 
						|
        from pathlib import Path
 | 
						|
 | 
						|
        str_path = "test1" + read_ext
 | 
						|
        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
 | 
						|
 | 
						|
        path_obj = Path("test1" + read_ext)
 | 
						|
        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
 | 
						|
 | 
						|
        tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    @td.skip_if_no("py.path")
 | 
						|
    @td.check_file_leaks
 | 
						|
    def test_read_from_py_localpath(self, read_ext):
 | 
						|
 | 
						|
        # GH12655
 | 
						|
        from py.path import local as LocalPath
 | 
						|
 | 
						|
        str_path = os.path.join("test1" + read_ext)
 | 
						|
        expected = pd.read_excel(str_path, sheet_name="Sheet1", index_col=0)
 | 
						|
 | 
						|
        path_obj = LocalPath().join("test1" + read_ext)
 | 
						|
        actual = pd.read_excel(path_obj, sheet_name="Sheet1", index_col=0)
 | 
						|
 | 
						|
        tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    @td.check_file_leaks
 | 
						|
    def test_close_from_py_localpath(self, read_ext):
 | 
						|
 | 
						|
        # GH31467
 | 
						|
        str_path = os.path.join("test1" + read_ext)
 | 
						|
        with open(str_path, "rb") as f:
 | 
						|
            x = pd.read_excel(f, sheet_name="Sheet1", index_col=0)
 | 
						|
            del x
 | 
						|
            # should not throw an exception because the passed file was closed
 | 
						|
            f.read()
 | 
						|
 | 
						|
    def test_reader_seconds(self, request, engine, read_ext):
 | 
						|
        if engine == "pyxlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        # Test reading times with and without milliseconds. GH5945.
 | 
						|
        expected = DataFrame.from_dict(
 | 
						|
            {
 | 
						|
                "Time": [
 | 
						|
                    time(1, 2, 3),
 | 
						|
                    time(2, 45, 56, 100000),
 | 
						|
                    time(4, 29, 49, 200000),
 | 
						|
                    time(6, 13, 42, 300000),
 | 
						|
                    time(7, 57, 35, 400000),
 | 
						|
                    time(9, 41, 28, 500000),
 | 
						|
                    time(11, 25, 21, 600000),
 | 
						|
                    time(13, 9, 14, 700000),
 | 
						|
                    time(14, 53, 7, 800000),
 | 
						|
                    time(16, 37, 0, 900000),
 | 
						|
                    time(18, 20, 54),
 | 
						|
                ]
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        actual = pd.read_excel("times_1900" + read_ext, sheet_name="Sheet1")
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        actual = pd.read_excel("times_1904" + read_ext, sheet_name="Sheet1")
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_read_excel_multiindex(self, request, read_ext):
 | 
						|
        # see gh-4679
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
 | 
						|
        mi_file = "testmultiindex" + read_ext
 | 
						|
 | 
						|
        # "mi_column" sheet
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
 | 
						|
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
 | 
						|
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
 | 
						|
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
 | 
						|
            ],
 | 
						|
            columns=mi,
 | 
						|
        )
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            mi_file, sheet_name="mi_column", header=[0, 1], index_col=0
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # "mi_index" sheet
 | 
						|
        expected.index = mi
 | 
						|
        expected.columns = ["a", "b", "c", "d"]
 | 
						|
 | 
						|
        actual = pd.read_excel(mi_file, sheet_name="mi_index", index_col=[0, 1])
 | 
						|
        tm.assert_frame_equal(actual, expected, check_names=False)
 | 
						|
 | 
						|
        # "both" sheet
 | 
						|
        expected.columns = mi
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            mi_file, sheet_name="both", index_col=[0, 1], header=[0, 1]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected, check_names=False)
 | 
						|
 | 
						|
        # "mi_index_name" sheet
 | 
						|
        expected.columns = ["a", "b", "c", "d"]
 | 
						|
        expected.index = mi.set_names(["ilvl1", "ilvl2"])
 | 
						|
 | 
						|
        actual = pd.read_excel(mi_file, sheet_name="mi_index_name", index_col=[0, 1])
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # "mi_column_name" sheet
 | 
						|
        expected.index = list(range(4))
 | 
						|
        expected.columns = mi.set_names(["c1", "c2"])
 | 
						|
        actual = pd.read_excel(
 | 
						|
            mi_file, sheet_name="mi_column_name", header=[0, 1], index_col=0
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # see gh-11317
 | 
						|
        # "name_with_int" sheet
 | 
						|
        expected.columns = mi.set_levels([1, 2], level=1).set_names(["c1", "c2"])
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            mi_file, sheet_name="name_with_int", index_col=0, header=[0, 1]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # "both_name" sheet
 | 
						|
        expected.columns = mi.set_names(["c1", "c2"])
 | 
						|
        expected.index = mi.set_names(["ilvl1", "ilvl2"])
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            mi_file, sheet_name="both_name", index_col=[0, 1], header=[0, 1]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # "both_skiprows" sheet
 | 
						|
        actual = pd.read_excel(
 | 
						|
            mi_file,
 | 
						|
            sheet_name="both_name_skiprows",
 | 
						|
            index_col=[0, 1],
 | 
						|
            header=[0, 1],
 | 
						|
            skiprows=2,
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "sheet_name,idx_lvl2",
 | 
						|
        [
 | 
						|
            ("both_name_blank_after_mi_name", [np.nan, "b", "a", "b"]),
 | 
						|
            ("both_name_multiple_blanks", [np.nan] * 4),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_read_excel_multiindex_blank_after_name(
 | 
						|
        self, request, read_ext, sheet_name, idx_lvl2
 | 
						|
    ):
 | 
						|
        # GH34673
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb (GH4679"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        mi_file = "testmultiindex" + read_ext
 | 
						|
        mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"])
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
 | 
						|
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
 | 
						|
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
 | 
						|
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
 | 
						|
            ],
 | 
						|
            columns=mi,
 | 
						|
            index=MultiIndex.from_arrays(
 | 
						|
                (["foo", "foo", "bar", "bar"], idx_lvl2),
 | 
						|
                names=["ilvl1", "ilvl2"],
 | 
						|
            ),
 | 
						|
        )
 | 
						|
        result = pd.read_excel(
 | 
						|
            mi_file,
 | 
						|
            sheet_name=sheet_name,
 | 
						|
            index_col=[0, 1],
 | 
						|
            header=[0, 1],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_read_excel_multiindex_header_only(self, read_ext):
 | 
						|
        # see gh-11733.
 | 
						|
        #
 | 
						|
        # Don't try to parse a header name if there isn't one.
 | 
						|
        mi_file = "testmultiindex" + read_ext
 | 
						|
        result = pd.read_excel(mi_file, sheet_name="index_col_none", header=[0, 1])
 | 
						|
 | 
						|
        exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
 | 
						|
        expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_excel_old_index_format(self, read_ext):
 | 
						|
        # see gh-4679
 | 
						|
        filename = "test_index_name_pre17" + read_ext
 | 
						|
 | 
						|
        # We detect headers to determine if index names exist, so
 | 
						|
        # that "index" name in the "names" version of the data will
 | 
						|
        # now be interpreted as rows that include null data.
 | 
						|
        data = np.array(
 | 
						|
            [
 | 
						|
                [None, None, None, None, None],
 | 
						|
                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
 | 
						|
                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
 | 
						|
                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
 | 
						|
                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
 | 
						|
                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
 | 
						|
            ]
 | 
						|
        )
 | 
						|
        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
 | 
						|
        mi = MultiIndex(
 | 
						|
            levels=[
 | 
						|
                ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
 | 
						|
                ["R1", "R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
 | 
						|
            ],
 | 
						|
            codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
 | 
						|
            names=[None, None],
 | 
						|
        )
 | 
						|
        si = Index(
 | 
						|
            ["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None
 | 
						|
        )
 | 
						|
 | 
						|
        expected = DataFrame(data, index=si, columns=columns)
 | 
						|
 | 
						|
        actual = pd.read_excel(filename, sheet_name="single_names", index_col=0)
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        expected.index = mi
 | 
						|
 | 
						|
        actual = pd.read_excel(filename, sheet_name="multi_names", index_col=[0, 1])
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # The analogous versions of the "names" version data
 | 
						|
        # where there are explicitly no names for the indices.
 | 
						|
        data = np.array(
 | 
						|
            [
 | 
						|
                ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
 | 
						|
                ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
 | 
						|
                ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
 | 
						|
                ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
 | 
						|
                ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"],
 | 
						|
            ]
 | 
						|
        )
 | 
						|
        columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
 | 
						|
        mi = MultiIndex(
 | 
						|
            levels=[
 | 
						|
                ["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"],
 | 
						|
                ["R_l1_g0", "R_l1_g1", "R_l1_g2", "R_l1_g3", "R_l1_g4"],
 | 
						|
            ],
 | 
						|
            codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
 | 
						|
            names=[None, None],
 | 
						|
        )
 | 
						|
        si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", "R_l0_g3", "R_l0_g4"], name=None)
 | 
						|
 | 
						|
        expected = DataFrame(data, index=si, columns=columns)
 | 
						|
 | 
						|
        actual = pd.read_excel(filename, sheet_name="single_no_names", index_col=0)
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        expected.index = mi
 | 
						|
 | 
						|
        actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
 | 
						|
        tm.assert_frame_equal(actual, expected, check_names=False)
 | 
						|
 | 
						|
    def test_read_excel_bool_header_arg(self, read_ext):
 | 
						|
        # GH 6114
 | 
						|
        msg = "Passing a bool to header is invalid"
 | 
						|
        for arg in [True, False]:
 | 
						|
            with pytest.raises(TypeError, match=msg):
 | 
						|
                pd.read_excel("test1" + read_ext, header=arg)
 | 
						|
 | 
						|
    def test_read_excel_skiprows(self, request, read_ext):
 | 
						|
        # GH 4903
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            "testskiprows" + read_ext, sheet_name="skiprows_list", skiprows=[0, 2]
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                [1, 2.5, pd.Timestamp("2015-01-01"), True],
 | 
						|
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
 | 
						|
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
 | 
						|
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
 | 
						|
            ],
 | 
						|
            columns=["a", "b", "c", "d"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            "testskiprows" + read_ext,
 | 
						|
            sheet_name="skiprows_list",
 | 
						|
            skiprows=np.array([0, 2]),
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        # GH36435
 | 
						|
        actual = pd.read_excel(
 | 
						|
            "testskiprows" + read_ext,
 | 
						|
            sheet_name="skiprows_list",
 | 
						|
            skiprows=lambda x: x in [0, 2],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
        actual = pd.read_excel(
 | 
						|
            "testskiprows" + read_ext,
 | 
						|
            sheet_name="skiprows_list",
 | 
						|
            skiprows=3,
 | 
						|
            names=["a", "b", "c", "d"],
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                # [1, 2.5, pd.Timestamp("2015-01-01"), True],
 | 
						|
                [2, 3.5, pd.Timestamp("2015-01-02"), False],
 | 
						|
                [3, 4.5, pd.Timestamp("2015-01-03"), False],
 | 
						|
                [4, 5.5, pd.Timestamp("2015-01-04"), True],
 | 
						|
            ],
 | 
						|
            columns=["a", "b", "c", "d"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_read_excel_nrows(self, read_ext):
 | 
						|
        # GH 16645
 | 
						|
        num_rows_to_pull = 5
 | 
						|
        actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
 | 
						|
        expected = pd.read_excel("test1" + read_ext)
 | 
						|
        expected = expected[:num_rows_to_pull]
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_read_excel_nrows_greater_than_nrows_in_file(self, read_ext):
 | 
						|
        # GH 16645
 | 
						|
        expected = pd.read_excel("test1" + read_ext)
 | 
						|
        num_records_in_file = len(expected)
 | 
						|
        num_rows_to_pull = num_records_in_file + 10
 | 
						|
        actual = pd.read_excel("test1" + read_ext, nrows=num_rows_to_pull)
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_read_excel_nrows_non_integer_parameter(self, read_ext):
 | 
						|
        # GH 16645
 | 
						|
        msg = "'nrows' must be an integer >=0"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            pd.read_excel("test1" + read_ext, nrows="5")
 | 
						|
 | 
						|
    def test_read_excel_squeeze(self, read_ext):
 | 
						|
        # GH 12157
 | 
						|
        f = "test_squeeze" + read_ext
 | 
						|
 | 
						|
        with tm.assert_produces_warning(
 | 
						|
            FutureWarning,
 | 
						|
            match="The squeeze argument has been deprecated "
 | 
						|
            "and will be removed in a future version. "
 | 
						|
            'Append .squeeze\\("columns"\\) to the call to squeeze.\n\n',
 | 
						|
        ):
 | 
						|
            actual = pd.read_excel(
 | 
						|
                f, sheet_name="two_columns", index_col=0, squeeze=True
 | 
						|
            )
 | 
						|
            expected = Series([2, 3, 4], [4, 5, 6], name="b")
 | 
						|
            expected.index.name = "a"
 | 
						|
            tm.assert_series_equal(actual, expected)
 | 
						|
 | 
						|
            actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
 | 
						|
            expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
 | 
						|
            tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
            actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
 | 
						|
            expected = Series([1, 2, 3], name="a")
 | 
						|
            tm.assert_series_equal(actual, expected)
 | 
						|
 | 
						|
    def test_deprecated_kwargs(self, read_ext):
 | 
						|
        with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
 | 
						|
            pd.read_excel("test1" + read_ext, "Sheet1", 0)
 | 
						|
 | 
						|
        pd.read_excel("test1" + read_ext)
 | 
						|
 | 
						|
    def test_no_header_with_list_index_col(self, read_ext):
 | 
						|
        # GH 31783
 | 
						|
        file_name = "testmultiindex" + read_ext
 | 
						|
        data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)]
 | 
						|
        idx = MultiIndex.from_tuples(
 | 
						|
            [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1)
 | 
						|
        )
 | 
						|
        expected = DataFrame(data, index=idx, columns=(2, 3))
 | 
						|
        result = pd.read_excel(
 | 
						|
            file_name, sheet_name="index_col_none", index_col=[0, 1], header=None
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    def test_one_col_noskip_blank_line(self, read_ext):
 | 
						|
        # GH 39808
 | 
						|
        file_name = "one_col_blank_line" + read_ext
 | 
						|
        data = [0.5, np.nan, 1, 2]
 | 
						|
        expected = DataFrame(data, columns=["numbers"])
 | 
						|
        result = pd.read_excel(file_name)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_multiheader_two_blank_lines(self, read_ext):
 | 
						|
        # GH 40442
 | 
						|
        file_name = "testmultiindex" + read_ext
 | 
						|
        columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
 | 
						|
        data = [[np.nan, np.nan], [np.nan, np.nan], [1, 3], [2, 4]]
 | 
						|
        expected = DataFrame(data, columns=columns)
 | 
						|
        result = pd.read_excel(
 | 
						|
            file_name, sheet_name="mi_column_empty_rows", header=[0, 1]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_trailing_blanks(self, read_ext):
 | 
						|
        """
 | 
						|
        Sheets can contain blank cells with no data. Some of our readers
 | 
						|
        were including those cells, creating many empty rows and columns
 | 
						|
        """
 | 
						|
        file_name = "trailing_blanks" + read_ext
 | 
						|
        result = pd.read_excel(file_name)
 | 
						|
        assert result.shape == (3, 3)
 | 
						|
 | 
						|
    def test_ignore_chartsheets_by_str(self, request, engine, read_ext):
 | 
						|
        # GH 41448
 | 
						|
        if engine == "odf":
 | 
						|
            pytest.skip("chartsheets do not exist in the ODF format")
 | 
						|
        if engine == "pyxlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="pyxlsb can't distinguish chartsheets from worksheets"
 | 
						|
                )
 | 
						|
            )
 | 
						|
        with pytest.raises(ValueError, match="Worksheet named 'Chart1' not found"):
 | 
						|
            pd.read_excel("chartsheet" + read_ext, sheet_name="Chart1")
 | 
						|
 | 
						|
    def test_ignore_chartsheets_by_int(self, request, engine, read_ext):
 | 
						|
        # GH 41448
 | 
						|
        if engine == "odf":
 | 
						|
            pytest.skip("chartsheets do not exist in the ODF format")
 | 
						|
        if engine == "pyxlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="pyxlsb can't distinguish chartsheets from worksheets"
 | 
						|
                )
 | 
						|
            )
 | 
						|
        with pytest.raises(
 | 
						|
            ValueError, match="Worksheet index 1 is invalid, 1 worksheets found"
 | 
						|
        ):
 | 
						|
            pd.read_excel("chartsheet" + read_ext, sheet_name=1)
 | 
						|
 | 
						|
    def test_euro_decimal_format(self, request, read_ext):
 | 
						|
        # copied from read_csv
 | 
						|
        result = pd.read_excel("test_decimal" + read_ext, decimal=",", skiprows=1)
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819],
 | 
						|
                [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872],
 | 
						|
                [3, 878.158, 108013.434, "GHI", "rez", 2.735694704],
 | 
						|
            ],
 | 
						|
            columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
class TestExcelFileRead:
 | 
						|
    @pytest.fixture(autouse=True)
 | 
						|
    def cd_and_set_engine(self, engine, datapath, monkeypatch):
 | 
						|
        """
 | 
						|
        Change directory and set engine for ExcelFile objects.
 | 
						|
        """
 | 
						|
        func = partial(pd.ExcelFile, engine=engine)
 | 
						|
        monkeypatch.chdir(datapath("io", "data", "excel"))
 | 
						|
        monkeypatch.setattr(pd, "ExcelFile", func)
 | 
						|
 | 
						|
    def test_engine_used(self, read_ext, engine, monkeypatch):
 | 
						|
        expected_defaults = {
 | 
						|
            "xlsx": "openpyxl",
 | 
						|
            "xlsm": "openpyxl",
 | 
						|
            "xlsb": "pyxlsb",
 | 
						|
            "xls": "xlrd",
 | 
						|
            "ods": "odf",
 | 
						|
        }
 | 
						|
 | 
						|
        with pd.ExcelFile("test1" + read_ext) as excel:
 | 
						|
            result = excel.engine
 | 
						|
 | 
						|
        if engine is not None:
 | 
						|
            expected = engine
 | 
						|
        else:
 | 
						|
            expected = expected_defaults[read_ext[1:]]
 | 
						|
        assert result == expected
 | 
						|
 | 
						|
    def test_excel_passes_na(self, read_ext):
 | 
						|
        with pd.ExcelFile("test4" + read_ext) as excel:
 | 
						|
            parsed = pd.read_excel(
 | 
						|
                excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
 | 
						|
            )
 | 
						|
        expected = DataFrame(
 | 
						|
            [["NA"], [1], ["NA"], [np.nan], ["rabbit"]], columns=["Test"]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
        with pd.ExcelFile("test4" + read_ext) as excel:
 | 
						|
            parsed = pd.read_excel(
 | 
						|
                excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
 | 
						|
            )
 | 
						|
        expected = DataFrame(
 | 
						|
            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
        # 13967
 | 
						|
        with pd.ExcelFile("test5" + read_ext) as excel:
 | 
						|
            parsed = pd.read_excel(
 | 
						|
                excel, sheet_name="Sheet1", keep_default_na=False, na_values=["apple"]
 | 
						|
            )
 | 
						|
        expected = DataFrame(
 | 
						|
            [["1.#QNAN"], [1], ["nan"], [np.nan], ["rabbit"]], columns=["Test"]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
        with pd.ExcelFile("test5" + read_ext) as excel:
 | 
						|
            parsed = pd.read_excel(
 | 
						|
                excel, sheet_name="Sheet1", keep_default_na=True, na_values=["apple"]
 | 
						|
            )
 | 
						|
        expected = DataFrame(
 | 
						|
            [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]], columns=["Test"]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("na_filter", [None, True, False])
 | 
						|
    def test_excel_passes_na_filter(self, read_ext, na_filter):
 | 
						|
        # gh-25453
 | 
						|
        kwargs = {}
 | 
						|
 | 
						|
        if na_filter is not None:
 | 
						|
            kwargs["na_filter"] = na_filter
 | 
						|
 | 
						|
        with pd.ExcelFile("test5" + read_ext) as excel:
 | 
						|
            parsed = pd.read_excel(
 | 
						|
                excel,
 | 
						|
                sheet_name="Sheet1",
 | 
						|
                keep_default_na=True,
 | 
						|
                na_values=["apple"],
 | 
						|
                **kwargs,
 | 
						|
            )
 | 
						|
 | 
						|
        if na_filter is False:
 | 
						|
            expected = [["1.#QNAN"], [1], ["nan"], ["apple"], ["rabbit"]]
 | 
						|
        else:
 | 
						|
            expected = [[np.nan], [1], [np.nan], [np.nan], ["rabbit"]]
 | 
						|
 | 
						|
        expected = DataFrame(expected, columns=["Test"])
 | 
						|
        tm.assert_frame_equal(parsed, expected)
 | 
						|
 | 
						|
    def test_excel_table_sheet_by_index(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        with pd.ExcelFile("test1" + read_ext) as excel:
 | 
						|
            df1 = pd.read_excel(excel, sheet_name=0, index_col=0)
 | 
						|
            df2 = pd.read_excel(excel, sheet_name=1, skiprows=[1], index_col=0)
 | 
						|
        tm.assert_frame_equal(df1, df_ref, check_names=False)
 | 
						|
        tm.assert_frame_equal(df2, df_ref, check_names=False)
 | 
						|
 | 
						|
        with pd.ExcelFile("test1" + read_ext) as excel:
 | 
						|
            df1 = excel.parse(0, index_col=0)
 | 
						|
            df2 = excel.parse(1, skiprows=[1], index_col=0)
 | 
						|
        tm.assert_frame_equal(df1, df_ref, check_names=False)
 | 
						|
        tm.assert_frame_equal(df2, df_ref, check_names=False)
 | 
						|
 | 
						|
        with pd.ExcelFile("test1" + read_ext) as excel:
 | 
						|
            df3 = pd.read_excel(excel, sheet_name=0, index_col=0, skipfooter=1)
 | 
						|
        tm.assert_frame_equal(df3, df1.iloc[:-1])
 | 
						|
 | 
						|
        with pd.ExcelFile("test1" + read_ext) as excel:
 | 
						|
            df3 = excel.parse(0, index_col=0, skipfooter=1)
 | 
						|
 | 
						|
        tm.assert_frame_equal(df3, df1.iloc[:-1])
 | 
						|
 | 
						|
    def test_sheet_name(self, request, read_ext, df_ref):
 | 
						|
        if read_ext == ".xlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        filename = "test1"
 | 
						|
        sheet_name = "Sheet1"
 | 
						|
 | 
						|
        with pd.ExcelFile(filename + read_ext) as excel:
 | 
						|
            df1_parse = excel.parse(sheet_name=sheet_name, index_col=0)  # doc
 | 
						|
 | 
						|
        with pd.ExcelFile(filename + read_ext) as excel:
 | 
						|
            df2_parse = excel.parse(index_col=0, sheet_name=sheet_name)
 | 
						|
 | 
						|
        tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
 | 
						|
        tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "sheet_name",
 | 
						|
        [3, [0, 3], [3, 0], "Sheet4", ["Sheet1", "Sheet4"], ["Sheet4", "Sheet1"]],
 | 
						|
    )
 | 
						|
    def test_bad_sheetname_raises(self, read_ext, sheet_name):
 | 
						|
        # GH 39250
 | 
						|
        msg = "Worksheet index 3 is invalid|Worksheet named 'Sheet4' not found"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            with pd.ExcelFile("blank" + read_ext) as excel:
 | 
						|
                excel.parse(sheet_name=sheet_name)
 | 
						|
 | 
						|
    def test_excel_read_buffer(self, engine, read_ext):
 | 
						|
        pth = "test1" + read_ext
 | 
						|
        expected = pd.read_excel(pth, sheet_name="Sheet1", index_col=0, engine=engine)
 | 
						|
 | 
						|
        with open(pth, "rb") as f:
 | 
						|
            with pd.ExcelFile(f) as xls:
 | 
						|
                actual = pd.read_excel(xls, sheet_name="Sheet1", index_col=0)
 | 
						|
 | 
						|
        tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    def test_reader_closes_file(self, engine, read_ext):
 | 
						|
        with open("test1" + read_ext, "rb") as f:
 | 
						|
            with pd.ExcelFile(f) as xlsx:
 | 
						|
                # parses okay
 | 
						|
                pd.read_excel(xlsx, sheet_name="Sheet1", index_col=0, engine=engine)
 | 
						|
 | 
						|
        assert f.closed
 | 
						|
 | 
						|
    def test_conflicting_excel_engines(self, read_ext):
 | 
						|
        # GH 26566
 | 
						|
        msg = "Engine should not be specified when passing an ExcelFile"
 | 
						|
 | 
						|
        with pd.ExcelFile("test1" + read_ext) as xl:
 | 
						|
            with pytest.raises(ValueError, match=msg):
 | 
						|
                pd.read_excel(xl, engine="foo")
 | 
						|
 | 
						|
    def test_excel_read_binary(self, engine, read_ext):
 | 
						|
        # GH 15914
 | 
						|
        expected = pd.read_excel("test1" + read_ext, engine=engine)
 | 
						|
 | 
						|
        with open("test1" + read_ext, "rb") as f:
 | 
						|
            data = f.read()
 | 
						|
 | 
						|
        actual = pd.read_excel(data, engine=engine)
 | 
						|
        tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    def test_excel_read_binary_via_read_excel(self, read_ext, engine):
 | 
						|
        # GH 38424
 | 
						|
        with open("test1" + read_ext, "rb") as f:
 | 
						|
            result = pd.read_excel(f)
 | 
						|
        expected = pd.read_excel("test1" + read_ext, engine=engine)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.skipif(
 | 
						|
        xlrd_version is not None and xlrd_version >= Version("2"),
 | 
						|
        reason="xlrd no longer supports xlsx",
 | 
						|
    )
 | 
						|
    def test_excel_high_surrogate(self):
 | 
						|
        # GH 23809
 | 
						|
        expected = DataFrame(["\udc88"], columns=["Column1"])
 | 
						|
 | 
						|
        # should not produce a segmentation violation
 | 
						|
        actual = pd.read_excel("high_surrogate.xlsx", engine="xlrd")
 | 
						|
        tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"])
 | 
						|
    def test_header_with_index_col(self, filename):
 | 
						|
        # GH 33476
 | 
						|
        idx = Index(["Z"], name="I2")
 | 
						|
        cols = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"])
 | 
						|
        expected = DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64")
 | 
						|
        result = pd.read_excel(
 | 
						|
            filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    def test_read_datetime_multiindex(self, request, engine, read_ext):
 | 
						|
        # GH 34748
 | 
						|
        if engine == "pyxlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="Sheets containing datetimes not supported by pyxlsb"
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        f = "test_datetime_mi" + read_ext
 | 
						|
        with pd.ExcelFile(f) as excel:
 | 
						|
            actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
 | 
						|
        expected_column_index = MultiIndex.from_tuples(
 | 
						|
            [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],
 | 
						|
            names=[
 | 
						|
                pd.to_datetime("02/29/2020").to_pydatetime(),
 | 
						|
                pd.to_datetime("03/01/2020").to_pydatetime(),
 | 
						|
            ],
 | 
						|
        )
 | 
						|
        expected = DataFrame([], columns=expected_column_index)
 | 
						|
 | 
						|
        tm.assert_frame_equal(expected, actual)
 | 
						|
 | 
						|
    def test_engine_invalid_option(self, read_ext):
 | 
						|
        # read_ext includes the '.' hence the weird formatting
 | 
						|
        with pytest.raises(ValueError, match="Value must be one of *"):
 | 
						|
            with pd.option_context(f"io.excel{read_ext}.reader", "abc"):
 | 
						|
                pass
 | 
						|
 | 
						|
    def test_ignore_chartsheets(self, request, engine, read_ext):
 | 
						|
        # GH 41448
 | 
						|
        if engine == "odf":
 | 
						|
            pytest.skip("chartsheets do not exist in the ODF format")
 | 
						|
        if engine == "pyxlsb":
 | 
						|
            request.node.add_marker(
 | 
						|
                pytest.mark.xfail(
 | 
						|
                    reason="pyxlsb can't distinguish chartsheets from worksheets"
 | 
						|
                )
 | 
						|
            )
 | 
						|
        with pd.ExcelFile("chartsheet" + read_ext) as excel:
 | 
						|
            assert excel.sheet_names == ["Sheet1"]
 | 
						|
 | 
						|
    def test_corrupt_files_closed(self, engine, read_ext):
 | 
						|
        # GH41778
 | 
						|
        errors = (BadZipFile,)
 | 
						|
        if engine is None:
 | 
						|
            pytest.skip()
 | 
						|
        elif engine == "xlrd":
 | 
						|
            import xlrd
 | 
						|
 | 
						|
            errors = (BadZipFile, xlrd.biffh.XLRDError)
 | 
						|
 | 
						|
        with tm.ensure_clean(f"corrupt{read_ext}") as file:
 | 
						|
            Path(file).write_text("corrupt")
 | 
						|
            with tm.assert_produces_warning(False):
 | 
						|
                try:
 | 
						|
                    pd.ExcelFile(file, engine=engine)
 | 
						|
                except errors:
 | 
						|
                    pass
 |