Feature-Extraction/dist/client/pandas/tests/io/pytables/test_append.py

import datetime
from datetime import timedelta
import re
from warnings import catch_warnings

import numpy as np
import pytest

from pandas._libs.tslibs import Timestamp
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Series,
    _testing as tm,
    concat,
    date_range,
    read_hdf,
)
from pandas.tests.io.pytables.common import (
    _maybe_remove,
    ensure_clean_path,
    ensure_clean_store,
)

pytestmark = pytest.mark.single_cpu


@pytest.mark.filterwarnings("ignore:object name:tables.exceptions.NaturalNameWarning")
def test_append(setup_path):

    with ensure_clean_store(setup_path) as store:

        # this is allowed by almost always don't want to do it
        # tables.NaturalNameWarning):
        with catch_warnings(record=True):

            df = tm.makeTimeDataFrame()
            _maybe_remove(store, "df1")
            store.append("df1", df[:10])
            store.append("df1", df[10:])
            tm.assert_frame_equal(store["df1"], df)

            _maybe_remove(store, "df2")
            store.put("df2", df[:10], format="table")
            store.append("df2", df[10:])
            tm.assert_frame_equal(store["df2"], df)

            _maybe_remove(store, "df3")
            store.append("/df3", df[:10])
            store.append("/df3", df[10:])
            tm.assert_frame_equal(store["df3"], df)

            # this is allowed by almost always don't want to do it
            # tables.NaturalNameWarning
            _maybe_remove(store, "/df3 foo")
            store.append("/df3 foo", df[:10])
            store.append("/df3 foo", df[10:])
            tm.assert_frame_equal(store["df3 foo"], df)

            # dtype issues - mizxed type in a single object column
            df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
            df["mixed_column"] = "testing"
            df.loc[2, "mixed_column"] = np.nan
            _maybe_remove(store, "df")
            store.append("df", df)
            tm.assert_frame_equal(store["df"], df)

            # uints - test storage of uints
            uint_data = DataFrame(
                {
                    "u08": Series(
                        np.random.randint(0, high=255, size=5), dtype=np.uint8
                    ),
                    "u16": Series(
                        np.random.randint(0, high=65535, size=5), dtype=np.uint16
                    ),
                    "u32": Series(
                        np.random.randint(0, high=2**30, size=5), dtype=np.uint32
                    ),
                    "u64": Series(
                        [2**58, 2**59, 2**60, 2**61, 2**62],
                        dtype=np.uint64,
                    ),
                },
                index=np.arange(5),
            )
            _maybe_remove(store, "uints")
            store.append("uints", uint_data)
            tm.assert_frame_equal(store["uints"], uint_data)

            # uints - test storage of uints in indexable columns
            _maybe_remove(store, "uints")
            # 64-bit indices not yet supported
            store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
            tm.assert_frame_equal(store["uints"], uint_data)


def test_append_series(setup_path):

    with ensure_clean_store(setup_path) as store:

        # basic
        ss = tm.makeStringSeries()
        ts = tm.makeTimeSeries()
        ns = Series(np.arange(100))

        store.append("ss", ss)
        result = store["ss"]
        tm.assert_series_equal(result, ss)
        assert result.name is None

        store.append("ts", ts)
        result = store["ts"]
        tm.assert_series_equal(result, ts)
        assert result.name is None

        ns.name = "foo"
        store.append("ns", ns)
        result = store["ns"]
        tm.assert_series_equal(result, ns)
        assert result.name == ns.name

        # select on the values
        expected = ns[ns > 60]
        result = store.select("ns", "foo>60")
        tm.assert_series_equal(result, expected)

        # select on the index and values
        expected = ns[(ns > 70) & (ns.index < 90)]
        result = store.select("ns", "foo>70 and index<90")
        tm.assert_series_equal(result, expected)

        # multi-index
        mi = DataFrame(np.random.randn(5, 1), columns=["A"])
        mi["B"] = np.arange(len(mi))
        mi["C"] = "foo"
        mi.loc[3:5, "C"] = "bar"
        mi.set_index(["C", "B"], inplace=True)
        s = mi.stack()
        s.index = s.index.droplevel(2)
        store.append("mi", s)
        tm.assert_series_equal(store["mi"], s)


def test_append_some_nans(setup_path):

    with ensure_clean_store(setup_path) as store:
        df = DataFrame(
            {
                "A": Series(np.random.randn(20)).astype("int32"),
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )
        # some nans
        _maybe_remove(store, "df1")
        df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
        store.append("df1", df[:10])
        store.append("df1", df[10:])
        tm.assert_frame_equal(store["df1"], df)

        # first column
        df1 = df.copy()
        df1.loc[:, "A1"] = np.nan
        _maybe_remove(store, "df1")
        store.append("df1", df1[:10])
        store.append("df1", df1[10:])
        tm.assert_frame_equal(store["df1"], df1)

        # 2nd column
        df2 = df.copy()
        df2.loc[:, "A2"] = np.nan
        _maybe_remove(store, "df2")
        store.append("df2", df2[:10])
        store.append("df2", df2[10:])
        tm.assert_frame_equal(store["df2"], df2)

        # datetimes
        df3 = df.copy()
        df3.loc[:, "E"] = np.nan
        _maybe_remove(store, "df3")
        store.append("df3", df3[:10])
        store.append("df3", df3[10:])
        tm.assert_frame_equal(store["df3"], df3)


def test_append_all_nans(setup_path):

    with ensure_clean_store(setup_path) as store:

        df = DataFrame(
            {"A1": np.random.randn(20), "A2": np.random.randn(20)},
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        # nan some entire rows (dropna=True)
        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df[-4:])

        # nan some entire rows (dropna=False)
        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df)

        # tests the option io.hdf.dropna_table
        pd.set_option("io.hdf.dropna_table", False)
        _maybe_remove(store, "df3")
        store.append("df3", df[:10])
        store.append("df3", df[10:])
        tm.assert_frame_equal(store["df3"], df)

        pd.set_option("io.hdf.dropna_table", True)
        _maybe_remove(store, "df4")
        store.append("df4", df[:10])
        store.append("df4", df[10:])
        tm.assert_frame_equal(store["df4"], df[-4:])

        # nan some entire rows (string are still written!)
        df = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
            },
            index=np.arange(20),
        )

        df.loc[0:15, :] = np.nan

        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df)

        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df)

        # nan some entire rows (but since we have dates they are still
        # written!)
        df = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )

        df.loc[0:15, :] = np.nan

        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df)

        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df)


def test_append_frame_column_oriented(setup_path):
    with ensure_clean_store(setup_path) as store:

        # column oriented
        df = tm.makeTimeDataFrame()
        df.index = df.index._with_freq(None)  # freq doesn't round-trip

        _maybe_remove(store, "df1")
        store.append("df1", df.iloc[:, :2], axes=["columns"])
        store.append("df1", df.iloc[:, 2:])
        tm.assert_frame_equal(store["df1"], df)

        result = store.select("df1", "columns=A")
        expected = df.reindex(columns=["A"])
        tm.assert_frame_equal(expected, result)

        # selection on the non-indexable
        result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
        expected = df.reindex(columns=["A"], index=df.index[0:4])
        tm.assert_frame_equal(expected, result)

        # this isn't supported
        msg = re.escape(
            "passing a filterable condition to a non-table indexer "
            "[Filter: Not Initialized]"
        )
        with pytest.raises(TypeError, match=msg):
            store.select("df1", "columns=A and index>df.index[4]")


def test_append_with_different_block_ordering(setup_path):

    # GH 4096; using same frames, but different block orderings
    with ensure_clean_store(setup_path) as store:

        for i in range(10):

            df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
            df["index"] = range(10)
            df["index"] += i * 10
            df["int64"] = Series([1] * len(df), dtype="int64")
            df["int16"] = Series([1] * len(df), dtype="int16")

            if i % 2 == 0:
                del df["int64"]
                df["int64"] = Series([1] * len(df), dtype="int64")
            if i % 3 == 0:
                a = df.pop("A")
                df["A"] = a

            df.set_index("index", inplace=True)

            store.append("df", df)

    # test a different ordering but with more fields (like invalid
    # combinations)
    with ensure_clean_store(setup_path) as store:

        df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64")
        df["int64"] = Series([1] * len(df), dtype="int64")
        df["int16"] = Series([1] * len(df), dtype="int16")
        store.append("df", df)

        # store additional fields in different blocks
        df["int16_2"] = Series([1] * len(df), dtype="int16")
        msg = re.escape(
            "cannot match existing table structure for [int16] on appending data"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)

        # store multiple additional fields in different blocks
        df["float_3"] = Series([1.0] * len(df), dtype="float64")
        msg = re.escape(
            "cannot match existing table structure for [A,B] on appending data"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)


def test_append_with_strings(setup_path):

    with ensure_clean_store(setup_path) as store:
        with catch_warnings(record=True):

            def check_col(key, name, size):
                assert (
                    getattr(store.get_storer(key).table.description, name).itemsize
                    == size
                )

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big", df)
            tm.assert_frame_equal(store.select("df_big"), df)
            check_col("df_big", "values_block_1", 15)

            # appending smaller string ok
            df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
            store.append("df_big", df2)
            expected = concat([df, df2])
            tm.assert_frame_equal(store.select("df_big"), expected)
            check_col("df_big", "values_block_1", 15)

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big2", df, min_itemsize={"values": 50})
            tm.assert_frame_equal(store.select("df_big2"), df)
            check_col("df_big2", "values_block_1", 50)

            # bigger string on next append
            store.append("df_new", df)
            df_new = DataFrame(
                [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
            )
            msg = (
                r"Trying to store a string with len \[26\] in "
                r"\[values_block_1\] column but\n"
                r"this column has a limit of \[15\]!\n"
                "Consider using min_itemsize to preset the sizes on these "
                "columns"
            )
            with pytest.raises(ValueError, match=msg):
                store.append("df_new", df_new)

            # min_itemsize on Series index (GH 11412)
            df = tm.makeMixedDataFrame().set_index("C")
            store.append("ss", df["B"], min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss"), df["B"])

            # same as above, with data_columns=True
            store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss2"), df["B"])

            # min_itemsize in index without appending (GH 10381)
            store.put("ss3", df, format="table", min_itemsize={"index": 6})
            # just make sure there is a longer string:
            df2 = df.copy().reset_index().assign(C="longer").set_index("C")
            store.append("ss3", df2)
            tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))

            # same as above, with a Series
            store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
            store.append("ss4", df2["B"])
            tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]]))

            # with nans
            _maybe_remove(store, "df")
            df = tm.makeTimeDataFrame()
            df["string"] = "foo"
            df.loc[df.index[1:4], "string"] = np.nan
            df["string2"] = "bar"
            df.loc[df.index[4:8], "string2"] = np.nan
            df["string3"] = "bah"
            df.loc[df.index[1:], "string3"] = np.nan
            store.append("df", df)
            result = store.select("df")
            tm.assert_frame_equal(result, df)

    with ensure_clean_store(setup_path) as store:

        def check_col(key, name, size):
            assert getattr(store.get_storer(key).table.description, name).itemsize, size

        df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

        # a min_itemsize that creates a data_column
        _maybe_remove(store, "df")
        store.append("df", df, min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["B", "A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
        check_col("df", "B", 200)
        check_col("df", "values_block_0", 200)
        assert store.get_storer("df").data_columns == ["B"]

        # infer the .typ on subsequent appends
        _maybe_remove(store, "df")
        store.append("df", df[:5], min_itemsize=200)
        store.append("df", df[5:], min_itemsize=200)
        tm.assert_frame_equal(store["df"], df)

        # invalid min_itemsize keys
        df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
        _maybe_remove(store, "df")
        msg = re.escape(
            "min_itemsize has the key [foo] which is not an axis or data_column"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})


def test_append_with_empty_string(setup_path):

    with ensure_clean_store(setup_path) as store:

        # with all empty strings (GH 12242)
        df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
        store.append("df", df[:-1], min_itemsize={"x": 1})
        store.append("df", df[-1:], min_itemsize={"x": 1})
        tm.assert_frame_equal(store.select("df"), df)


def test_append_with_data_columns(setup_path):

    with ensure_clean_store(setup_path) as store:
        df = tm.makeTimeDataFrame()
        df.iloc[0, df.columns.get_loc("B")] = 1.0
        _maybe_remove(store, "df")
        store.append("df", df[:2], data_columns=["B"])
        store.append("df", df[2:])
        tm.assert_frame_equal(store["df"], df)

        # check that we have indices created
        assert store._handle.root.df.table.cols.index.is_indexed is True
        assert store._handle.root.df.table.cols.B.is_indexed is True

        # data column searching
        result = store.select("df", "B>0")
        expected = df[df.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column searching (with an indexable and a data_columns)
        result = store.select("df", "B>0 and index>df.index[3]")
        df_new = df.reindex(index=df.index[4:])
        expected = df_new[df_new.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column selection with a string data_column
        df_new = df.copy()
        df_new["string"] = "foo"
        df_new.loc[df_new.index[1:4], "string"] = np.nan
        df_new.loc[df_new.index[5:6], "string"] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"])
        result = store.select("df", "string='foo'")
        expected = df_new[df_new.string == "foo"]
        tm.assert_frame_equal(result, expected)

        # using min_itemsize and a data column
        def check_col(key, name, size):
            assert (
                getattr(store.get_storer(key).table.description, name).itemsize == size
            )

    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30})
        check_col("df", "string", 30)
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize=30)
        check_col("df", "string", 30)
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30})
        check_col("df", "string", 30)

    with ensure_clean_store(setup_path) as store:
        df_new["string2"] = "foobarbah"
        df_new["string_block1"] = "foobarbah1"
        df_new["string_block2"] = "foobarbah2"
        _maybe_remove(store, "df")
        store.append(
            "df",
            df_new,
            data_columns=["string", "string2"],
            min_itemsize={"string": 30, "string2": 40, "values": 50},
        )
        check_col("df", "string", 30)
        check_col("df", "string2", 40)
        check_col("df", "values_block_1", 50)

    with ensure_clean_store(setup_path) as store:
        # multiple data columns
        df_new = df.copy()
        df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
        df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
        df_new["string"] = "foo"

        sl = df_new.columns.get_loc("string")
        df_new.iloc[1:4, sl] = np.nan
        df_new.iloc[5:6, sl] = "bar"

        df_new["string2"] = "foo"
        sl = df_new.columns.get_loc("string2")
        df_new.iloc[2:5, sl] = np.nan
        df_new.iloc[7:8, sl] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
        result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0")
        expected = df_new[
            (df_new.string == "foo")
            & (df_new.string2 == "foo")
            & (df_new.A > 0)
            & (df_new.B < 0)
        ]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2020-05-07 freq check randomly fails in the CI

        # yield an empty frame
        result = store.select("df", "string='foo' and string2='cool'")
        expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
        tm.assert_frame_equal(result, expected)

    with ensure_clean_store(setup_path) as store:
        # doc example
        df_dc = df.copy()
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc["string2"] = "cool"
        df_dc["datetime"] = Timestamp("20010102")
        df_dc = df_dc._convert(datetime=True)
        df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan

        _maybe_remove(store, "df_dc")
        store.append(
            "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
        )
        result = store.select("df_dc", "B>0")

        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2020-12-07 intermittent build failures here with freq of
        #  None instead of BDay(4)

    with ensure_clean_store(setup_path) as store:
        # doc example part 2
        np.random.seed(1234)
        index = date_range("1/1/2000", periods=8)
        df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc.loc[:, ["B", "C"]] = df_dc.loc[:, ["B", "C"]].abs()
        df_dc["string2"] = "cool"

        # on-disk operations
        store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])

        result = store.select("df_dc", "B>0")
        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected)


def test_append_hierarchical(setup_path, multiindex_dataframe_random_data):
    df = multiindex_dataframe_random_data
    df.columns.name = None

    with ensure_clean_store(setup_path) as store:
        store.append("mi", df)
        result = store.select("mi")
        tm.assert_frame_equal(result, df)

        # GH 3748
        result = store.select("mi", columns=["A", "B"])
        expected = df.reindex(columns=["A", "B"])
        tm.assert_frame_equal(result, expected)

    with ensure_clean_path("test.hdf") as path:
        df.to_hdf(path, "df", format="table")
        result = read_hdf(path, "df", columns=["A", "B"])
        expected = df.reindex(columns=["A", "B"])
        tm.assert_frame_equal(result, expected)


def test_append_misc(setup_path):

    with ensure_clean_store(setup_path) as store:
        df = tm.makeDataFrame()
        store.append("df", df, chunksize=1)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        store.append("df1", df, expectedrows=10)
        result = store.select("df1")
        tm.assert_frame_equal(result, df)


@pytest.mark.parametrize("chunksize", [10, 200, 1000])
def test_append_misc_chunksize(setup_path, chunksize):
    # more chunksize in append tests
    df = tm.makeDataFrame()
    df["string"] = "foo"
    df["float322"] = 1.0
    df["float322"] = df["float322"].astype("float32")
    df["bool"] = df["float322"] > 0
    df["time1"] = Timestamp("20130101")
    df["time2"] = Timestamp("20130102")
    with ensure_clean_store(setup_path, mode="w") as store:
        store.append("obj", df, chunksize=chunksize)
        result = store.select("obj")
        tm.assert_frame_equal(result, df)


def test_append_misc_empty_frame(setup_path):
    # empty frame, GH4273
    with ensure_clean_store(setup_path) as store:

        # 0 len
        df_empty = DataFrame(columns=list("ABC"))
        store.append("df", df_empty)
        with pytest.raises(KeyError, match="'No object named df in the file'"):
            store.select("df")

        # repeated append of 0/non-zero frames
        df = DataFrame(np.random.rand(10, 3), columns=list("ABC"))
        store.append("df", df)
        tm.assert_frame_equal(store.select("df"), df)
        store.append("df", df_empty)
        tm.assert_frame_equal(store.select("df"), df)

        # store
        df = DataFrame(columns=list("ABC"))
        store.put("df2", df)
        tm.assert_frame_equal(store.select("df2"), df)


# TODO(ArrayManager) currently we rely on falling back to BlockManager, but
# the conversion from AM->BM converts the invalid object dtype column into
# a datetime64 column no longer raising an error
@td.skip_array_manager_not_yet_implemented
def test_append_raise(setup_path):

    with ensure_clean_store(setup_path) as store:

        # test append with invalid input to get good error messages

        # list in column
        df = tm.makeDataFrame()
        df["invalid"] = [["a"]] * len(df)
        assert df.dtypes["invalid"] == np.object_
        msg = re.escape(
            """Cannot serialize the column [invalid]
because its data contents are not [string] but [mixed] object dtype"""
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # multiple invalid columns
        df["invalid2"] = [["a"]] * len(df)
        df["invalid3"] = [["a"]] * len(df)
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # datetime with embedded nans as object
        df = tm.makeDataFrame()
        s = Series(datetime.datetime(2001, 1, 2), index=df.index)
        s = s.astype(object)
        s[0:5] = np.nan
        df["invalid"] = s
        assert df.dtypes["invalid"] == np.object_
        msg = "too many timezones in this block, create separate data columns"
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # directly ndarray
        msg = "value must be None, Series, or DataFrame"
        with pytest.raises(TypeError, match=msg):
            store.append("df", np.arange(10))

        # series directly
        msg = re.escape(
            "cannot properly create the storer for: "
            "[group->df,value-><class 'pandas.core.series.Series'>]"
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", Series(np.arange(10)))

        # appending an incompatible table
        df = tm.makeDataFrame()
        store.append("df", df)

        df["foo"] = "foo"
        msg = re.escape(
            "invalid combination of [non_index_axes] on appending data "
            "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
            "[(1, ['A', 'B', 'C', 'D'])]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)

        # incompatible type (GH 41897)
        _maybe_remove(store, "df")
        df["foo"] = Timestamp("20130101")
        store.append("df", df)
        df["foo"] = "bar"
        msg = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->bytes24,kind->string,shape->(1, 30)] "
            "vs current table "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->datetime64,kind->datetime64,shape->None]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)


def test_append_with_timedelta(setup_path):
    # GH 3577
    # append timedelta

    df = DataFrame(
        {
            "A": Timestamp("20130101"),
            "B": [
                Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10)
            ],
        }
    )
    df["C"] = df["A"] - df["B"]
    df.loc[3:5, "C"] = np.nan

    with ensure_clean_store(setup_path) as store:

        # table
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=True)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        result = store.select("df", where="C<100000")
        tm.assert_frame_equal(result, df)

        result = store.select("df", where="C<pd.Timedelta('-3D')")
        tm.assert_frame_equal(result, df.iloc[3:])

        result = store.select("df", "C<'-3D'")
        tm.assert_frame_equal(result, df.iloc[3:])

        # a bit hacky here as we don't really deal with the NaT properly

        result = store.select("df", "C<'-500000s'")
        result = result.dropna(subset=["C"])
        tm.assert_frame_equal(result, df.iloc[6:])

        result = store.select("df", "C<'-3.5D'")
        result = result.iloc[1:]
        tm.assert_frame_equal(result, df.iloc[4:])

        # fixed
        _maybe_remove(store, "df2")
        store.put("df2", df)
        result = store.select("df2")
        tm.assert_frame_equal(result, df)


def test_append_to_multiple(setup_path):
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    df2["foo"] = "bar"
    df = concat([df1, df2], axis=1)

    with ensure_clean_store(setup_path) as store:

        # exceptions
        msg = "append_to_multiple requires a selector that is in passed dict"
        with pytest.raises(ValueError, match=msg):
            store.append_to_multiple(
                {"df1": ["A", "B"], "df2": None}, df, selector="df3"
            )

        with pytest.raises(ValueError, match=msg):
            store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3")

        msg = (
            "append_to_multiple must have a dictionary specified as the way to "
            "split the value"
        )
        with pytest.raises(ValueError, match=msg):
            store.append_to_multiple("df1", df, "df1")

        # regular operation
        store.append_to_multiple({"df1": ["A", "B"], "df2": None}, df, selector="df1")
        result = store.select_as_multiple(
            ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
        )
        expected = df[(df.A > 0) & (df.B > 0)]
        tm.assert_frame_equal(result, expected)


def test_append_to_multiple_dropna(setup_path):
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    with ensure_clean_store(setup_path) as store:

        # dropna=True should guarantee rows are synchronized
        store.append_to_multiple(
            {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
        )
        result = store.select_as_multiple(["df1", "df2"])
        expected = df.dropna()
        tm.assert_frame_equal(result, expected)
        tm.assert_index_equal(store.select("df1").index, store.select("df2").index)


def test_append_to_multiple_dropna_false(setup_path):
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    with ensure_clean_store(setup_path) as store:

        # dropna=False shouldn't synchronize row indexes
        store.append_to_multiple(
            {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
        )

        msg = "all tables must have exactly the same nrows!"
        with pytest.raises(ValueError, match=msg):
            store.select_as_multiple(["df1a", "df2a"])

        assert not store.select("df1a").index.equals(store.select("df2a").index)


def test_append_to_multiple_min_itemsize(setup_path):
    # GH 11238
    df = DataFrame(
        {
            "IX": np.arange(1, 21),
            "Num": np.arange(1, 21),
            "BigNum": np.arange(1, 21) * 88,
            "Str": ["a" for _ in range(20)],
            "LongStr": ["abcde" for _ in range(20)],
        }
    )
    expected = df.iloc[[0]]

    with ensure_clean_store(setup_path) as store:
        store.append_to_multiple(
            {
                "index": ["IX"],
                "nums": ["Num", "BigNum"],
                "strs": ["Str", "LongStr"],
            },
            df.iloc[[0]],
            "index",
            min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
        )
        result = store.select_as_multiple(["index", "nums", "strs"])
        tm.assert_frame_equal(result, expected)