from __future__ import annotations
from io import (
    BytesIO,
    StringIO,
)
import os
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas import (
    NA,
    DataFrame,
    Index,
)
import pandas._testing as tm
import pandas.io.common as icom
from pandas.io.common import get_handle
from pandas.io.xml import read_xml
"""
CHECKLIST
[x] - ValueError: "Values for parser can only be lxml or etree."
etree
[x] - ImportError: "lxml not found, please install or use the etree parser."
[X] - TypeError: "...is not a valid type for attr_cols"
[X] - TypeError: "...is not a valid type for elem_cols"
[X] - LookupError: "unknown encoding"
[X] - KeyError: "...is not included in namespaces"
[X] - KeyError: "no valid column"
[X] - ValueError: "To use stylesheet, you need lxml installed..."
[]  - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
[X] - FileNotFoundError: "No such file or directory"
[X] - PermissionError: "Forbidden"
lxml
[X] - TypeError: "...is not a valid type for attr_cols"
[X] - TypeError: "...is not a valid type for elem_cols"
[X] - LookupError: "unknown encoding"
[]  - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
[X] - FileNotFoundError: "No such file or directory"
[X] - KeyError: "...is not included in namespaces"
[X] - KeyError: "no valid column"
[X] - ValueError: "stylesheet is not a url, file, or xml string."
[]  - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT)
[]  - URLError: (USUALLY DUE TO NETWORKING)
[]  - HTTPError: (NEED AN ONLINE STYLESHEET)
[X] - OSError: "failed to load external entity"
[X] - XMLSyntaxError: "Opening and ending tag mismatch"
[X] - XSLTApplyError: "Cannot resolve URI"
[X] - XSLTParseError: "failed to compile"
[X] - PermissionError: "Forbidden"
"""
# Small mixed-dtype frame shared by the tests below; ``sides`` carries a NaN
# so that missing-value rendering is exercised.
geom_df = DataFrame(
    dict(
        shape=["square", "circle", "triangle"],
        degrees=[360, 360, 180],
        sides=[4, np.nan, 3],
    )
)
# Planet frame used by the pivot-table and groupby tests: four inner
# terrestrial planets followed by two gas giants and two ice giants.
planet_df = DataFrame(
    dict(
        planet=[
            "Mercury",
            "Venus",
            "Earth",
            "Mars",
            "Jupiter",
            "Saturn",
            "Uranus",
            "Neptune",
        ],
        type=["terrestrial"] * 4 + ["gas giant"] * 2 + ["ice giant"] * 2,
        location=["inner"] * 4 + ["outer"] * 4,
        mass=[
            0.330114,
            4.86747,
            5.97237,
            0.641712,
            1898.187,
            568.3174,
            86.8127,
            102.4126,
        ],
    )
)
# Expected text for the books.xml round trip with the index included.
# Written line by line so that whitespace-only lines stay visible.
from_file_expected = (
    "  \n"
    "    0\n"
    "    cooking\n"
    "    Everyday Italian\n"
    "    Giada De Laurentiis\n"
    "    2005\n"
    "    30.0\n"
    "  \n"
    "  \n"
    "    1\n"
    "    children\n"
    "    Harry Potter\n"
    "    J K. Rowling\n"
    "    2005\n"
    "    29.99\n"
    "  \n"
    "  \n"
    "    2\n"
    "    web\n"
    "    Learning XML\n"
    "    Erik T. Ray\n"
    "    2003\n"
    "    39.95\n"
    "  \n"
)
def equalize_decl(doc):
    # etree and lxml differ on quotes and case in xml declaration
    if doc is not None:
        doc = doc.replace(
            '
  
    cooking
    Everyday Italian
    Giada De Laurentiis
    2005
    30.0
  
  
    children
    Harry Potter
    J K. Rowling
    2005
    29.99
  
  
    web
    Learning XML
    Erik T. Ray
    2003
    39.95
  
"""
    filename = datapath("io", "data", "xml", "books.xml")
    df_file = read_xml(filename, parser=parser)
    with tm.ensure_clean("test.xml") as path:
        df_file.to_xml(path, index=False, parser=parser)
        with open(path, "rb") as f:
            output = f.read().decode("utf-8").strip()
        output = equalize_decl(output)
        assert output == expected
def test_index_false_rename_row_root(datapath, parser):
    """Write books.xml to disk without index, using custom root/row names."""
    want = (
        "  \n"
        "    cooking\n"
        "    Everyday Italian\n"
        "    Giada De Laurentiis\n"
        "    2005\n"
        "    30.0\n"
        "  \n"
        "  \n"
        "    children\n"
        "    Harry Potter\n"
        "    J K. Rowling\n"
        "    2005\n"
        "    29.99\n"
        "  \n"
        "  \n"
        "    web\n"
        "    Learning XML\n"
        "    Erik T. Ray\n"
        "    2003\n"
        "    39.95\n"
        "  \n"
    )

    books_path = datapath("io", "data", "xml", "books.xml")
    books = read_xml(books_path, parser=parser)

    with tm.ensure_clean("test.xml") as path:
        books.to_xml(
            path, index=False, root_name="books", row_name="book", parser=parser
        )

        # Read the file back as bytes and decode explicitly as UTF-8.
        with open(path, "rb") as fh:
            raw = fh.read()

        got = equalize_decl(raw.decode("utf-8").strip())

        assert got == want
@pytest.mark.parametrize(
    "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]]
)
def test_index_false_with_offset_input_index(parser, offset_index):
    """
    Check ``index=False`` output when the frame's index does not start at 0.

    The index is replaced with an offset one (integer or string labels)
    before serialising. Regression test for issue #42458.
    """
    want = (
        "  \n"
        "    square\n"
        "    360\n"
        "    4.0\n"
        "  \n"
        "  \n"
        "    circle\n"
        "    360\n"
        "    \n"
        "  \n"
        "  \n"
        "    triangle\n"
        "    180\n"
        "    3.0\n"
        "  \n"
    )

    # Work on a copy so the shared module-level frame is left untouched.
    shifted = geom_df.copy()
    shifted.index = Index(offset_index)

    got = equalize_decl(shifted.to_xml(index=False, parser=parser))

    assert got == want
# NA_REP
# Expected text shared by the default and empty-string ``na_rep`` tests.
# Written line by line so that whitespace-only lines stay visible.
na_expected = (
    "  \n"
    "    0\n"
    "    square\n"
    "    360\n"
    "    4.0\n"
    "  \n"
    "  \n"
    "    1\n"
    "    circle\n"
    "    360\n"
    "    \n"
    "  \n"
    "  \n"
    "    2\n"
    "    triangle\n"
    "    180\n"
    "    3.0\n"
    "  \n"
)
def test_na_elem_output(datapath, parser):
    """Default ``to_xml`` output is compared against ``na_expected``."""
    got = equalize_decl(geom_df.to_xml(parser=parser))

    assert got == na_expected
def test_na_empty_str_elem_option(datapath, parser):
    """``na_rep=""`` output is compared against ``na_expected``."""
    got = equalize_decl(geom_df.to_xml(na_rep="", parser=parser))

    assert got == na_expected
def test_na_empty_elem_option(datapath, parser):
    """``na_rep="0.0"`` is expected to appear where ``sides`` is missing."""
    want = (
        "  \n"
        "    0\n"
        "    square\n"
        "    360\n"
        "    4.0\n"
        "  \n"
        "  \n"
        "    1\n"
        "    circle\n"
        "    360\n"
        "    0.0\n"
        "  \n"
        "  \n"
        "    2\n"
        "    triangle\n"
        "    180\n"
        "    3.0\n"
        "  \n"
    )

    got = equalize_decl(geom_df.to_xml(na_rep="0.0", parser=parser))

    assert got == want
# ATTR_COLS
def test_attrs_cols_nan_output(datapath, parser):
    """Serialise every column of ``geom_df`` through ``attr_cols``."""
    want = "  \n" * 3

    columns_as_attrs = ["shape", "degrees", "sides"]
    got = equalize_decl(geom_df.to_xml(attr_cols=columns_as_attrs, parser=parser))

    assert got == want
def test_attrs_cols_prefix(datapath, parser):
    """Combine ``attr_cols`` (index included) with a namespace prefix."""
    want = "  \n" * 3

    options = dict(
        attr_cols=["index", "shape", "degrees", "sides"],
        namespaces={"doc": "http://example.xom"},
        prefix="doc",
        parser=parser,
    )
    got = equalize_decl(geom_df.to_xml(**options))

    assert got == want
def test_attrs_unknown_column(parser):
    """A name in ``attr_cols`` that is not a column is expected to raise."""
    bad_cols = ["shape", "degree", "sides"]

    with pytest.raises(KeyError, match="no valid column"):
        geom_df.to_xml(attr_cols=bad_cols, parser=parser)
def test_attrs_wrong_type(parser):
    """Passing a plain string as ``attr_cols`` is expected to raise."""
    not_a_list = '"shape", "degree", "sides"'

    with pytest.raises(TypeError, match="is not a valid type for attr_cols"):
        geom_df.to_xml(attr_cols=not_a_list, parser=parser)
# ELEM_COLS
def test_elems_cols_nan_output(datapath, parser):
    """``elem_cols`` is given in a different order than the frame's columns."""
    want = (
        "  \n"
        "    360\n"
        "    4.0\n"
        "    square\n"
        "  \n"
        "  \n"
        "    360\n"
        "    \n"
        "    circle\n"
        "  \n"
        "  \n"
        "    180\n"
        "    3.0\n"
        "    triangle\n"
        "  \n"
    )

    reordered = ["degrees", "sides", "shape"]
    got = equalize_decl(
        geom_df.to_xml(index=False, elem_cols=reordered, parser=parser)
    )

    assert got == want
def test_elems_unknown_column(parser):
    """A name in ``elem_cols`` that is not a column is expected to raise."""
    bad_cols = ["shape", "degree", "sides"]

    with pytest.raises(KeyError, match="no valid column"):
        geom_df.to_xml(elem_cols=bad_cols, parser=parser)
def test_elems_wrong_type(parser):
    """Passing a plain string as ``elem_cols`` is expected to raise."""
    not_a_list = '"shape", "degree", "sides"'

    with pytest.raises(TypeError, match="is not a valid type for elem_cols"):
        geom_df.to_xml(elem_cols=not_a_list, parser=parser)
def test_elems_and_attrs_cols(datapath, parser):
    """Use ``elem_cols`` and ``attr_cols`` together in one ``to_xml`` call."""
    want = (
        "  \n"
        "    360\n"
        "    4.0\n"
        "  \n"
        "  \n"
        "    360\n"
        "    \n"
        "  \n"
        "  \n"
        "    180\n"
        "    3.0\n"
        "  \n"
    )

    options = dict(
        index=False,
        elem_cols=["degrees", "sides"],
        attr_cols=["shape"],
        parser=parser,
    )
    got = equalize_decl(geom_df.to_xml(**options))

    assert got == want
# HIERARCHICAL COLUMNS
def test_hierarchical_columns(datapath, parser):
    """Serialise a pivot table (with margins) built from ``planet_df``."""
    want = (
        "  \n"
        "    inner\n"
        "    terrestrial\n"
        "    4\n"
        "    11.81\n"
        "    2.95\n"
        "  \n"
        "  \n"
        "    outer\n"
        "    gas giant\n"
        "    2\n"
        "    2466.5\n"
        "    1233.25\n"
        "  \n"
        "  \n"
        "    outer\n"
        "    ice giant\n"
        "    2\n"
        "    189.23\n"
        "    94.61\n"
        "  \n"
        "  \n"
        "    All\n"
        "    \n"
        "    8\n"
        "    2667.54\n"
        "    333.44\n"
        "  \n"
    )

    pivot_options = dict(
        index=["location", "type"],
        values="mass",
        aggfunc=["count", "sum", "mean"],
        margins=True,
    )
    pvt = planet_df.pivot_table(**pivot_options).round(2)

    got = equalize_decl(pvt.to_xml(parser=parser))

    assert got == want
def test_hierarchical_attrs_columns(datapath, parser):
    """Serialise the pivot table with every column passed as ``attr_cols``."""
    want = "  \n" * 4

    pivot_options = dict(
        index=["location", "type"],
        values="mass",
        aggfunc=["count", "sum", "mean"],
        margins=True,
    )
    pvt = planet_df.pivot_table(**pivot_options).round(2)

    # Column labels are taken after the index is moved into regular columns.
    all_columns = list(pvt.reset_index().columns.values)
    got = equalize_decl(pvt.to_xml(attr_cols=all_columns, parser=parser))

    assert got == want
# MULTIINDEX
def test_multi_index(datapath, parser):
    """Serialise a groupby aggregate indexed by ``location`` and ``type``."""
    want = (
        "  \n"
        "    inner\n"
        "    terrestrial\n"
        "    4\n"
        "    11.81\n"
        "    2.95\n"
        "  \n"
        "  \n"
        "    outer\n"
        "    gas giant\n"
        "    2\n"
        "    2466.5\n"
        "    1233.25\n"
        "  \n"
        "  \n"
        "    outer\n"
        "    ice giant\n"
        "    2\n"
        "    189.23\n"
        "    94.61\n"
        "  \n"
    )

    mass_by_group = planet_df.groupby(["location", "type"])["mass"]
    agg = mass_by_group.agg(["count", "sum", "mean"]).round(2)

    got = equalize_decl(agg.to_xml(parser=parser))

    assert got == want
def test_multi_index_attrs_cols(datapath, parser):
    """Serialise the groupby aggregate with every column as ``attr_cols``."""
    want = "  \n" * 3

    mass_by_group = planet_df.groupby(["location", "type"])["mass"]
    agg = mass_by_group.agg(["count", "sum", "mean"]).round(2)

    # Column labels are taken after the index is moved into regular columns.
    all_columns = list(agg.reset_index().columns.values)
    got = equalize_decl(agg.to_xml(attr_cols=all_columns, parser=parser))

    assert got == want
# NAMESPACE
def test_default_namespace(parser):
    """Pass a default (empty-prefix) namespace to ``to_xml``."""
    want = (
        "  \n"
        "    0\n"
        "    square\n"
        "    360\n"
        "    4.0\n"
        "  \n"
        "  \n"
        "    1\n"
        "    circle\n"
        "    360\n"
        "    \n"
        "  \n"
        "  \n"
        "    2\n"
        "    triangle\n"
        "    180\n"
        "    3.0\n"
        "  \n"
    )

    default_ns = {"": "http://example.com"}
    got = equalize_decl(geom_df.to_xml(namespaces=default_ns, parser=parser))

    assert got == want
# PREFIX
def test_namespace_prefix(parser):
    """Pass a ``doc`` namespace together with ``prefix="doc"``."""
    want = (
        "  \n"
        "    0\n"
        "    square\n"
        "    360\n"
        "    4.0\n"
        "  \n"
        "  \n"
        "    1\n"
        "    circle\n"
        "    360\n"
        "    \n"
        "  \n"
        "  \n"
        "    2\n"
        "    triangle\n"
        "    180\n"
        "    3.0\n"
        "  \n"
    )

    doc_ns = {"doc": "http://example.com"}
    got = equalize_decl(geom_df.to_xml(namespaces=doc_ns, prefix="doc", parser=parser))

    assert got == want
def test_missing_prefix_in_nmsp(parser):
    """A prefix absent from ``namespaces`` is expected to raise ``KeyError``."""
    only_default_ns = {"": "http://example.com"}

    with pytest.raises(KeyError, match="doc is not included in namespaces"):
        geom_df.to_xml(namespaces=only_default_ns, prefix="doc", parser=parser)
def test_namespace_prefix_and_default(parser):
    """Pass both a default namespace and a prefixed one, with the prefix used."""
    want = (
        "  \n"
        "    0\n"
        "    square\n"
        "    360\n"
        "    4.0\n"
        "  \n"
        "  \n"
        "    1\n"
        "    circle\n"
        "    360\n"
        "    \n"
        "  \n"
        "  \n"
        "    2\n"
        "    triangle\n"
        "    180\n"
        "    3.0\n"
        "  \n"
    )

    both_ns = {"": "http://example.com", "doc": "http://other.org"}
    got = equalize_decl(geom_df.to_xml(namespaces=both_ns, prefix="doc", parser=parser))

    # etree and lxml differ on the order of namespace declarations; rewrite
    # the prefixed-first order into the default-first order before comparing.
    prefixed_first = 'xmlns:doc="http://other.org" xmlns="http://example.com"'
    default_first = 'xmlns="http://example.com" xmlns:doc="http://other.org"'
    if got is not None:
        got = got.replace(prefixed_first, default_first)

    assert got == want
# ENCODING
# Expected text for the encoding tests (first five rows of the baby names
# data). Written line by line so that whitespace-only lines stay visible.
encoding_expected = (
    "  \n"
    "    0\n"
    "    1\n"
    "    José\n"
    "    Sofía\n"
    "  \n"
    "  \n"
    "    1\n"
    "    2\n"
    "    Luis\n"
    "    Valentina\n"
    "  \n"
    "  \n"
    "    2\n"
    "    3\n"
    "    Carlos\n"
    "    Isabella\n"
    "  \n"
    "  \n"
    "    3\n"
    "    4\n"
    "    Juan\n"
    "    Camila\n"
    "  \n"
    "  \n"
    "    4\n"
    "    5\n"
    "    Jorge\n"
    "    Valeria\n"
    "  \n"
)
def test_encoding_option_str(datapath, parser):
    filename = datapath("io", "data", "xml", "baby_names.xml")
    df_file = read_xml(filename, parser=parser, encoding="ISO-8859-1").head(5)
    output = df_file.to_xml(encoding="ISO-8859-1", parser=parser)
    if output is not None:
        # etree and lxml differ on quotes and case in xml declaration
        output = output.replace(
            '
  
    0
    square
    360
    4.0
  
  
    1
    circle
    360
    
  
  
    2
    triangle
    180
    3.0
  
"""
    output = geom_df.to_xml(xml_declaration=False)
    assert output == expected
def test_no_pretty_print_with_decl(parser):
    expected = (
        "\n"
        "0square"
        "3604.0
"
        "1circle360"
        "
2"
        "triangle1803.0"
        "
"
    )
    output = geom_df.to_xml(pretty_print=False, parser=parser)
    output = equalize_decl(output)
    # etree adds space for closed tags
    if output is not None:
        output = output.replace(" />", "/>")
    assert output == expected
def test_no_pretty_print_no_decl(parser):
    expected = (
        "0square"
        "3604.0
"
        "1circle360"
        "
2"
        "triangle1803.0"
        "
"
    )
    output = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser)
    # etree adds space for closed tags
    if output is not None:
        output = output.replace(" />", "/>")
    assert output == expected
# PARSER
@td.skip_if_installed("lxml")
def test_default_parser_no_lxml():
    """Without lxml, calling ``to_xml`` with no parser is expected to raise."""
    message = "lxml not found, please install or use the etree parser."

    with pytest.raises(ImportError, match=message):
        geom_df.to_xml()
def test_unknown_parser():
    """A parser name other than lxml/etree is expected to raise."""
    message = "Values for parser can only be lxml or etree."

    with pytest.raises(ValueError, match=message):
        geom_df.to_xml(parser="bs4")
# STYLESHEET
# Expected text shared by the stylesheet tests that use
# ``row_field_output.xsl``. Written line by line so that whitespace-only
# lines stay visible.
xsl_expected = (
    "  \n"
    "    0\n"
    "    square\n"
    "    360\n"
    "    4.0\n"
    "  \n"
    "  \n"
    "    1\n"
    "    circle\n"
    "    360\n"
    "    \n"
    "  \n"
    "  \n"
    "    2\n"
    "    triangle\n"
    "    180\n"
    "    3.0\n"
    "  \n"
)
@td.skip_if_no("lxml")
def test_stylesheet_file_like(datapath, mode):
    """Pass an open stylesheet file handle (opened with ``mode``) to ``to_xml``."""
    xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")

    with open(xsl_path, mode) as handle:
        transformed = geom_df.to_xml(stylesheet=handle)
        assert transformed == xsl_expected
@td.skip_if_no("lxml")
def test_stylesheet_io(datapath, mode):
    """Pass the stylesheet wrapped in an in-memory ``BytesIO``/``StringIO``."""
    xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")

    with open(xsl_path, mode) as fh:
        content = fh.read()

    # Binary mode yields bytes, text mode yields str; pick the matching buffer.
    xsl_obj: BytesIO | StringIO
    xsl_obj = BytesIO(content) if mode == "rb" else StringIO(content)

    transformed = geom_df.to_xml(stylesheet=xsl_obj)

    assert transformed == xsl_expected
@td.skip_if_no("lxml")
def test_stylesheet_buffered_reader(datapath, mode):
    """Pass the stylesheet's contents (read with ``mode``) to ``to_xml``."""
    xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")

    with open(xsl_path, mode) as fh:
        xsl_content = fh.read()

    transformed = geom_df.to_xml(stylesheet=xsl_content)

    assert transformed == xsl_expected
@td.skip_if_no("lxml")
def test_stylesheet_wrong_path(datapath):
    """A relative ``.xslt`` path as stylesheet is expected to raise a syntax error."""
    from lxml.etree import XMLSyntaxError

    bad_path = os.path.join("data", "xml", "row_field_output.xslt")
    message = "Start tag expected, '<' not found"

    with pytest.raises(XMLSyntaxError, match=message):
        geom_df.to_xml(stylesheet=bad_path)
@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_stylesheet(val):
    """An empty ``str``/``bytes`` stylesheet is expected to raise a syntax error."""
    from lxml.etree import XMLSyntaxError

    # Either of the two messages is accepted.
    message = "Document is empty|Start tag expected, '<' not found"

    with pytest.raises(XMLSyntaxError, match=message):
        geom_df.to_xml(stylesheet=val)
@td.skip_if_no("lxml")
def test_incorrect_xsl_syntax():
    """A malformed stylesheet is expected to raise ``XMLSyntaxError``."""
    from lxml.etree import XMLSyntaxError

    bad_xsl = (
        "    \n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "        \n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "                \n"
        "            \n"
        "            \n"
        "        \n"
        "    \n"
    )

    with pytest.raises(XMLSyntaxError, match="Opening and ending tag mismatch"):
        geom_df.to_xml(stylesheet=bad_xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_eval():
    """A stylesheet that fails to compile is expected to raise ``XSLTParseError``."""
    from lxml.etree import XSLTParseError

    bad_xsl = (
        "    \n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "        \n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "                \n"
        "            \n"
        "            \n"
        "        \n"
        "    \n"
    )

    with pytest.raises(XSLTParseError, match="failed to compile"):
        geom_df.to_xml(stylesheet=bad_xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_apply(parser):
    """Applying the stylesheet is expected to raise ``XSLTApplyError``."""
    from lxml.etree import XSLTApplyError

    bad_xsl = (
        "    \n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "        \n"
        "    \n"
    )

    # pytest.raises is the outer manager so the temp file is cleaned up
    # before the exception is checked.
    with pytest.raises(XSLTApplyError, match="Cannot resolve URI"), tm.ensure_clean(
        "test.xml"
    ) as path:
        geom_df.to_xml(path, stylesheet=bad_xsl)
def test_stylesheet_with_etree(datapath):
    """``parser="etree"`` combined with a stylesheet is expected to raise."""
    # The last piece has no trailing newline.
    some_xsl = (
        "    \n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "        \n"
        "    "
    )
    message = "To use stylesheet, you need lxml installed"

    with pytest.raises(ValueError, match=message):
        geom_df.to_xml(parser="etree", stylesheet=some_xsl)
@td.skip_if_no("lxml")
def test_style_to_csv():
    """Compare stylesheet-transformed ``to_xml`` output with ``to_csv`` output."""
    csv_xsl = (
        "    \n"
        "    \n"
        "    ,\n"
        "    \n"
        "        ,shape,degrees,sides\n"
        "        \n"
        "    \n"
        "    \n"
        "        \n"
        "         \n"
        "    \n"
    )

    csv_text = geom_df.to_csv(line_terminator="\n")
    # The CSV text is stripped before comparing; the XML result is not.
    if csv_text is not None:
        csv_text = csv_text.strip()

    xml_text = geom_df.to_xml(stylesheet=csv_xsl)

    assert csv_text == xml_text
@td.skip_if_no("lxml")
def test_style_to_string():
    """Compare stylesheet-transformed ``to_xml`` output with ``to_string`` output."""
    string_xsl = (
        "    \n"
        "    \n"
        "                   \n"
        "    \n"
        "              shape  degrees  sides\n"
        "        \n"
        "    \n"
        "    \n"
        "        \n"
        "         \n"
        "    \n"
    )

    frame_text = geom_df.to_string()
    # ``na_rep="NaN"`` is passed so missing values are written as "NaN".
    xml_text = geom_df.to_xml(na_rep="NaN", stylesheet=string_xsl)

    assert xml_text == frame_text
@td.skip_if_no("lxml")
def test_style_to_json():
    """Compare stylesheet-transformed ``to_xml`` output with ``to_json`` output."""
    json_xsl = (
        "    \n"
        "    \n"
        '    "\n'
        "    \n"
        '        {"shape":{\n'
        "        \n"
        '        },"degrees":{\n'
        "        \n"
        '        },"sides":{\n'
        "        \n"
        "        }}\n"
        "    \n"
        "    \n"
        "        \n"
        "            \n"
        "                \n"
        "            \n"
        "            \n"
        "                \n"
        "            \n"
        "            \n"
        "                \n"
        "            \n"
        "        \n"
        "        \n"
        "        \n"
        "            ,\n"
        "        \n"
        "    \n"
    )

    json_text = geom_df.to_json()
    xml_text = geom_df.to_xml(stylesheet=json_xsl)

    assert json_text == xml_text
# COMPRESSION
# Expected text for the compression round-trip tests. Written line by line
# so that whitespace-only lines stay visible.
geom_xml = (
    "  \n"
    "    0\n"
    "    square\n"
    "    360\n"
    "    4.0\n"
    "  \n"
    "  \n"
    "    1\n"
    "    circle\n"
    "    360\n"
    "    \n"
    "  \n"
    "  \n"
    "    2\n"
    "    triangle\n"
    "    180\n"
    "    3.0\n"
    "  \n"
)
def test_compression_output(parser, compression_only):
    """Write compressed XML to a temp path and read it back via ``get_handle``."""
    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        with get_handle(path, "r", compression=compression_only) as handles:
            round_tripped = handles.handle.read()

    round_tripped = equalize_decl(round_tripped)

    assert geom_xml == round_tripped.strip()
def test_filename_and_suffix_comp(parser, compression_only):
    """Same round trip, with a file name carrying the compression's extension."""
    extension = icom._compression_to_extension[compression_only]
    compfile = "xml." + extension

    with tm.ensure_clean(filename=compfile) as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        with get_handle(path, "r", compression=compression_only) as handles:
            round_tripped = handles.handle.read()

    round_tripped = equalize_decl(round_tripped)

    assert geom_xml == round_tripped.strip()
def test_ea_dtypes(any_numeric_ea_dtype, parser):
    """Serialise a single ``NA`` value cast to a numeric extension dtype."""
    # GH#43903
    want = (
        "\n"
        "  \n"
        "    0\n"
        "    \n"
        "  \n"
    )

    frame = DataFrame({"a": [NA]}).astype(any_numeric_ea_dtype)
    got = frame.to_xml(parser=parser)

    assert equalize_decl(got).strip() == want
def test_unsuported_compression(datapath, parser):
    """An unknown compression name ("7z") is expected to raise ``ValueError``."""
    message = "Unrecognized compression type"

    # pytest.raises stays the outer manager, as in a nested ``with``.
    with pytest.raises(ValueError, match=message), tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression="7z")
# STORAGE OPTIONS
@pytest.mark.single_cpu
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
def test_s3_permission_output(parser, s3_resource):
    """Anonymous S3 access is expected to raise ``PermissionError``."""
    # s3_resource hosts pandas-test
    import s3fs

    bucket = "pandas-test"

    # NOTE(review): the anonymous listing sits inside ``pytest.raises``, so if
    # it raises first the ``to_xml`` call is never reached -- confirm which
    # call is meant to trigger the error.
    with pytest.raises(PermissionError, match="Access Denied"):
        anon_fs = s3fs.S3FileSystem(anon=True)
        anon_fs.ls(bucket)

        geom_df.to_xml("s3://pandas-test/geom.xml", compression="zip", parser=parser)