977 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			977 lines
		
	
	
		
			33 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from warnings import catch_warnings
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
from pandas._libs.tslibs import Timestamp
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    DataFrame,
 | 
						|
    HDFStore,
 | 
						|
    Index,
 | 
						|
    MultiIndex,
 | 
						|
    Series,
 | 
						|
    _testing as tm,
 | 
						|
    bdate_range,
 | 
						|
    concat,
 | 
						|
    date_range,
 | 
						|
    isna,
 | 
						|
    read_hdf,
 | 
						|
)
 | 
						|
from pandas.tests.io.pytables.common import (
 | 
						|
    _maybe_remove,
 | 
						|
    ensure_clean_path,
 | 
						|
    ensure_clean_store,
 | 
						|
)
 | 
						|
 | 
						|
from pandas.io.pytables import Term
 | 
						|
 | 
						|
pytestmark = pytest.mark.single_cpu
 | 
						|
 | 
						|
 | 
						|
def test_select_columns_in_where(setup_path):
 | 
						|
 | 
						|
    # GH 6169
 | 
						|
    # recreate multi-indexes when columns is passed
 | 
						|
    # in the `where` argument
 | 
						|
    index = MultiIndex(
 | 
						|
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
 | 
						|
        codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
 | 
						|
        names=["foo_name", "bar_name"],
 | 
						|
    )
 | 
						|
 | 
						|
    # With a DataFrame
 | 
						|
    df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.put("df", df, format="table")
 | 
						|
        expected = df[["A"]]
 | 
						|
 | 
						|
        tm.assert_frame_equal(store.select("df", columns=["A"]), expected)
 | 
						|
 | 
						|
        tm.assert_frame_equal(store.select("df", where="columns=['A']"), expected)
 | 
						|
 | 
						|
    # With a Series
 | 
						|
    s = Series(np.random.randn(10), index=index, name="A")
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.put("s", s, format="table")
 | 
						|
        tm.assert_series_equal(store.select("s", where="columns=['A']"), s)
 | 
						|
 | 
						|
 | 
						|
def test_select_with_dups(setup_path):
 | 
						|
 | 
						|
    # single dtypes
 | 
						|
    df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"])
 | 
						|
    df.index = date_range("20130101 9:30", periods=10, freq="T")
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.append("df", df)
 | 
						|
 | 
						|
        result = store.select("df")
 | 
						|
        expected = df
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
        result = store.select("df", columns=df.columns)
 | 
						|
        expected = df
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
        result = store.select("df", columns=["A"])
 | 
						|
        expected = df.loc[:, ["A"]]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    # dups across dtypes
 | 
						|
    df = concat(
 | 
						|
        [
 | 
						|
            DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
 | 
						|
            DataFrame(
 | 
						|
                np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
 | 
						|
            ),
 | 
						|
        ],
 | 
						|
        axis=1,
 | 
						|
    )
 | 
						|
    df.index = date_range("20130101 9:30", periods=10, freq="T")
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.append("df", df)
 | 
						|
 | 
						|
        result = store.select("df")
 | 
						|
        expected = df
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
        result = store.select("df", columns=df.columns)
 | 
						|
        expected = df
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
        expected = df.loc[:, ["A"]]
 | 
						|
        result = store.select("df", columns=["A"])
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
        expected = df.loc[:, ["B", "A"]]
 | 
						|
        result = store.select("df", columns=["B", "A"])
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
    # duplicates on both index and columns
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.append("df", df)
 | 
						|
        store.append("df", df)
 | 
						|
 | 
						|
        expected = df.loc[:, ["B", "A"]]
 | 
						|
        expected = concat([expected, expected])
 | 
						|
        result = store.select("df", columns=["B", "A"])
 | 
						|
        tm.assert_frame_equal(result, expected, by_blocks=True)
 | 
						|
 | 
						|
 | 
						|
def test_select(setup_path):
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        with catch_warnings(record=True):
 | 
						|
 | 
						|
            # select with columns=
 | 
						|
            df = tm.makeTimeDataFrame()
 | 
						|
            _maybe_remove(store, "df")
 | 
						|
            store.append("df", df)
 | 
						|
            result = store.select("df", columns=["A", "B"])
 | 
						|
            expected = df.reindex(columns=["A", "B"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
            # equivalently
 | 
						|
            result = store.select("df", [("columns=['A', 'B']")])
 | 
						|
            expected = df.reindex(columns=["A", "B"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
            # with a data column
 | 
						|
            _maybe_remove(store, "df")
 | 
						|
            store.append("df", df, data_columns=["A"])
 | 
						|
            result = store.select("df", ["A > 0"], columns=["A", "B"])
 | 
						|
            expected = df[df.A > 0].reindex(columns=["A", "B"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
            # all a data columns
 | 
						|
            _maybe_remove(store, "df")
 | 
						|
            store.append("df", df, data_columns=True)
 | 
						|
            result = store.select("df", ["A > 0"], columns=["A", "B"])
 | 
						|
            expected = df[df.A > 0].reindex(columns=["A", "B"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
            # with a data column, but different columns
 | 
						|
            _maybe_remove(store, "df")
 | 
						|
            store.append("df", df, data_columns=["A"])
 | 
						|
            result = store.select("df", ["A > 0"], columns=["C", "D"])
 | 
						|
            expected = df[df.A > 0].reindex(columns=["C", "D"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_select_dtypes(setup_path):
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        # with a Timestamp data column (GH #2637)
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "ts": bdate_range("2012-01-01", periods=300),
 | 
						|
                "A": np.random.randn(300),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", df, data_columns=["ts", "A"])
 | 
						|
 | 
						|
        result = store.select("df", "ts>=Timestamp('2012-02-01')")
 | 
						|
        expected = df[df.ts >= Timestamp("2012-02-01")]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # bool columns (GH #2849)
 | 
						|
        df = DataFrame(np.random.randn(5, 2), columns=["A", "B"])
 | 
						|
        df["object"] = "foo"
 | 
						|
        df.loc[4:5, "object"] = "bar"
 | 
						|
        df["boolv"] = df["A"] > 0
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", df, data_columns=True)
 | 
						|
 | 
						|
        expected = df[df.boolv == True].reindex(columns=["A", "boolv"])  # noqa:E712
 | 
						|
        for v in [True, "true", 1]:
 | 
						|
            result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        expected = df[df.boolv == False].reindex(columns=["A", "boolv"])  # noqa:E712
 | 
						|
        for v in [False, "false", 0]:
 | 
						|
            result = store.select("df", f"boolv == {v}", columns=["A", "boolv"])
 | 
						|
            tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # integer index
 | 
						|
        df = DataFrame({"A": np.random.rand(20), "B": np.random.rand(20)})
 | 
						|
        _maybe_remove(store, "df_int")
 | 
						|
        store.append("df_int", df)
 | 
						|
        result = store.select("df_int", "index<10 and columns=['A']")
 | 
						|
        expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # float index
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "A": np.random.rand(20),
 | 
						|
                "B": np.random.rand(20),
 | 
						|
                "index": np.arange(20, dtype="f8"),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        _maybe_remove(store, "df_float")
 | 
						|
        store.append("df_float", df)
 | 
						|
        result = store.select("df_float", "index<10.0 and columns=['A']")
 | 
						|
        expected = df.reindex(index=list(df.index)[0:10], columns=["A"])
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        # floats w/o NaN
 | 
						|
        df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
 | 
						|
        df["cols"] = (df["cols"] + 10).apply(str)
 | 
						|
 | 
						|
        store.append("df1", df, data_columns=True)
 | 
						|
        result = store.select("df1", where="values>2.0")
 | 
						|
        expected = df[df["values"] > 2.0]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # floats with NaN
 | 
						|
        df.iloc[0] = np.nan
 | 
						|
        expected = df[df["values"] > 2.0]
 | 
						|
 | 
						|
        store.append("df2", df, data_columns=True, index=False)
 | 
						|
        result = store.select("df2", where="values>2.0")
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # https://github.com/PyTables/PyTables/issues/282
 | 
						|
        # bug in selection when 0th row has a np.nan and an index
 | 
						|
        # store.append('df3',df,data_columns=True)
 | 
						|
        # result = store.select(
 | 
						|
        #    'df3', where='values>2.0')
 | 
						|
        # tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # not in first position float with NaN ok too
 | 
						|
        df = DataFrame({"cols": range(11), "values": range(11)}, dtype="float64")
 | 
						|
        df["cols"] = (df["cols"] + 10).apply(str)
 | 
						|
 | 
						|
        df.iloc[1] = np.nan
 | 
						|
        expected = df[df["values"] > 2.0]
 | 
						|
 | 
						|
        store.append("df4", df, data_columns=True)
 | 
						|
        result = store.select("df4", where="values>2.0")
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    # test selection with comparison against numpy scalar
 | 
						|
    # GH 11283
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        df = tm.makeDataFrame()
 | 
						|
 | 
						|
        expected = df[df["A"] > 0]
 | 
						|
 | 
						|
        store.append("df", df, data_columns=True)
 | 
						|
        np_zero = np.float64(0)  # noqa:F841
 | 
						|
        result = store.select("df", where=["A>np_zero"])
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_select_with_many_inputs(setup_path):
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "ts": bdate_range("2012-01-01", periods=300),
 | 
						|
                "A": np.random.randn(300),
 | 
						|
                "B": range(300),
 | 
						|
                "users": ["a"] * 50
 | 
						|
                + ["b"] * 50
 | 
						|
                + ["c"] * 100
 | 
						|
                + [f"a{i:03d}" for i in range(100)],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", df, data_columns=["ts", "A", "B", "users"])
 | 
						|
 | 
						|
        # regular select
 | 
						|
        result = store.select("df", "ts>=Timestamp('2012-02-01')")
 | 
						|
        expected = df[df.ts >= Timestamp("2012-02-01")]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # small selector
 | 
						|
        result = store.select("df", "ts>=Timestamp('2012-02-01') & users=['a','b','c']")
 | 
						|
        expected = df[
 | 
						|
            (df.ts >= Timestamp("2012-02-01")) & df.users.isin(["a", "b", "c"])
 | 
						|
        ]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # big selector along the columns
 | 
						|
        selector = ["a", "b", "c"] + [f"a{i:03d}" for i in range(60)]
 | 
						|
        result = store.select("df", "ts>=Timestamp('2012-02-01') and users=selector")
 | 
						|
        expected = df[(df.ts >= Timestamp("2012-02-01")) & df.users.isin(selector)]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        selector = range(100, 200)
 | 
						|
        result = store.select("df", "B=selector")
 | 
						|
        expected = df[df.B.isin(selector)]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
        assert len(result) == 100
 | 
						|
 | 
						|
        # big selector along the index
 | 
						|
        selector = Index(df.ts[0:100].values)
 | 
						|
        result = store.select("df", "ts=selector")
 | 
						|
        expected = df[df.ts.isin(selector.values)]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
        assert len(result) == 100
 | 
						|
 | 
						|
 | 
						|
def test_select_iterator(setup_path):
 | 
						|
 | 
						|
    # single table
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        df = tm.makeTimeDataFrame(500)
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", df)
 | 
						|
 | 
						|
        expected = store.select("df")
 | 
						|
 | 
						|
        results = list(store.select("df", iterator=True))
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        results = list(store.select("df", chunksize=100))
 | 
						|
        assert len(results) == 5
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        results = list(store.select("df", chunksize=150))
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    with ensure_clean_path(setup_path) as path:
 | 
						|
 | 
						|
        df = tm.makeTimeDataFrame(500)
 | 
						|
        df.to_hdf(path, "df_non_table")
 | 
						|
 | 
						|
        msg = "can only use an iterator or chunksize on a table"
 | 
						|
        with pytest.raises(TypeError, match=msg):
 | 
						|
            read_hdf(path, "df_non_table", chunksize=100)
 | 
						|
 | 
						|
        with pytest.raises(TypeError, match=msg):
 | 
						|
            read_hdf(path, "df_non_table", iterator=True)
 | 
						|
 | 
						|
    with ensure_clean_path(setup_path) as path:
 | 
						|
 | 
						|
        df = tm.makeTimeDataFrame(500)
 | 
						|
        df.to_hdf(path, "df", format="table")
 | 
						|
 | 
						|
        results = list(read_hdf(path, "df", chunksize=100))
 | 
						|
        result = concat(results)
 | 
						|
 | 
						|
        assert len(results) == 5
 | 
						|
        tm.assert_frame_equal(result, df)
 | 
						|
        tm.assert_frame_equal(result, read_hdf(path, "df"))
 | 
						|
 | 
						|
    # multiple
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        df1 = tm.makeTimeDataFrame(500)
 | 
						|
        store.append("df1", df1, data_columns=True)
 | 
						|
        df2 = tm.makeTimeDataFrame(500).rename(columns="{}_2".format)
 | 
						|
        df2["foo"] = "bar"
 | 
						|
        store.append("df2", df2)
 | 
						|
 | 
						|
        df = concat([df1, df2], axis=1)
 | 
						|
 | 
						|
        # full selection
 | 
						|
        expected = store.select_as_multiple(["df1", "df2"], selector="df1")
 | 
						|
        results = list(
 | 
						|
            store.select_as_multiple(["df1", "df2"], selector="df1", chunksize=150)
 | 
						|
        )
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_select_iterator_complete_8014(setup_path):
 | 
						|
 | 
						|
    # GH 8014
 | 
						|
    # using iterator and where clause
 | 
						|
    chunksize = 1e4
 | 
						|
 | 
						|
    # no iterator
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        expected = tm.makeTimeDataFrame(100064, "S")
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", expected)
 | 
						|
 | 
						|
        beg_dt = expected.index[0]
 | 
						|
        end_dt = expected.index[-1]
 | 
						|
 | 
						|
        # select w/o iteration and no where clause works
 | 
						|
        result = store.select("df")
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # select w/o iterator and where clause, single term, begin
 | 
						|
        # of range, works
 | 
						|
        where = f"index >= '{beg_dt}'"
 | 
						|
        result = store.select("df", where=where)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # select w/o iterator and where clause, single term, end
 | 
						|
        # of range, works
 | 
						|
        where = f"index <= '{end_dt}'"
 | 
						|
        result = store.select("df", where=where)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # select w/o iterator and where clause, inclusive range,
 | 
						|
        # works
 | 
						|
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
 | 
						|
        result = store.select("df", where=where)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    # with iterator, full range
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        expected = tm.makeTimeDataFrame(100064, "S")
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", expected)
 | 
						|
 | 
						|
        beg_dt = expected.index[0]
 | 
						|
        end_dt = expected.index[-1]
 | 
						|
 | 
						|
        # select w/iterator and no where clause works
 | 
						|
        results = list(store.select("df", chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, begin of range
 | 
						|
        where = f"index >= '{beg_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, end of range
 | 
						|
        where = f"index <= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, inclusive range
 | 
						|
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_select_iterator_non_complete_8014(setup_path):
 | 
						|
 | 
						|
    # GH 8014
 | 
						|
    # using iterator and where clause
 | 
						|
    chunksize = 1e4
 | 
						|
 | 
						|
    # with iterator, non complete range
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        expected = tm.makeTimeDataFrame(100064, "S")
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", expected)
 | 
						|
 | 
						|
        beg_dt = expected.index[1]
 | 
						|
        end_dt = expected.index[-2]
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, begin of range
 | 
						|
        where = f"index >= '{beg_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        rexpected = expected[expected.index >= beg_dt]
 | 
						|
        tm.assert_frame_equal(rexpected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, end of range
 | 
						|
        where = f"index <= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        rexpected = expected[expected.index <= end_dt]
 | 
						|
        tm.assert_frame_equal(rexpected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, inclusive range
 | 
						|
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
 | 
						|
        tm.assert_frame_equal(rexpected, result)
 | 
						|
 | 
						|
    # with iterator, empty where
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        expected = tm.makeTimeDataFrame(100064, "S")
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", expected)
 | 
						|
 | 
						|
        end_dt = expected.index[-1]
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, begin of range
 | 
						|
        where = f"index > '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        assert 0 == len(results)
 | 
						|
 | 
						|
 | 
						|
def test_select_iterator_many_empty_frames(setup_path):
 | 
						|
 | 
						|
    # GH 8014
 | 
						|
    # using iterator and where clause can return many empty
 | 
						|
    # frames.
 | 
						|
    chunksize = 10_000
 | 
						|
 | 
						|
    # with iterator, range limited to the first chunk
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        expected = tm.makeTimeDataFrame(100000, "S")
 | 
						|
        _maybe_remove(store, "df")
 | 
						|
        store.append("df", expected)
 | 
						|
 | 
						|
        beg_dt = expected.index[0]
 | 
						|
        end_dt = expected.index[chunksize - 1]
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, begin of range
 | 
						|
        where = f"index >= '{beg_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
        result = concat(results)
 | 
						|
        rexpected = expected[expected.index >= beg_dt]
 | 
						|
        tm.assert_frame_equal(rexpected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, single term, end of range
 | 
						|
        where = f"index <= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
 | 
						|
        assert len(results) == 1
 | 
						|
        result = concat(results)
 | 
						|
        rexpected = expected[expected.index <= end_dt]
 | 
						|
        tm.assert_frame_equal(rexpected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause, inclusive range
 | 
						|
        where = f"index >= '{beg_dt}' & index <= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
 | 
						|
        # should be 1, is 10
 | 
						|
        assert len(results) == 1
 | 
						|
        result = concat(results)
 | 
						|
        rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
 | 
						|
        tm.assert_frame_equal(rexpected, result)
 | 
						|
 | 
						|
        # select w/iterator and where clause which selects
 | 
						|
        # *nothing*.
 | 
						|
        #
 | 
						|
        # To be consistent with Python idiom I suggest this should
 | 
						|
        # return [] e.g. `for e in []: print True` never prints
 | 
						|
        # True.
 | 
						|
 | 
						|
        where = f"index <= '{beg_dt}' & index >= '{end_dt}'"
 | 
						|
        results = list(store.select("df", where=where, chunksize=chunksize))
 | 
						|
 | 
						|
        # should be []
 | 
						|
        assert len(results) == 0
 | 
						|
 | 
						|
 | 
						|
def test_frame_select(setup_path):
 | 
						|
 | 
						|
    df = tm.makeTimeDataFrame()
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.put("frame", df, format="table")
 | 
						|
        date = df.index[len(df) // 2]
 | 
						|
 | 
						|
        crit1 = Term("index>=date")
 | 
						|
        assert crit1.env.scope["date"] == date
 | 
						|
 | 
						|
        crit2 = "columns=['A', 'D']"
 | 
						|
        crit3 = "columns=A"
 | 
						|
 | 
						|
        result = store.select("frame", [crit1, crit2])
 | 
						|
        expected = df.loc[date:, ["A", "D"]]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("frame", [crit3])
 | 
						|
        expected = df.loc[:, ["A"]]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # invalid terms
 | 
						|
        df = tm.makeTimeDataFrame()
 | 
						|
        store.append("df_time", df)
 | 
						|
        msg = "could not convert string to Timestamp"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            store.select("df_time", "index>0")
 | 
						|
 | 
						|
        # can't select if not written as table
 | 
						|
        # store['frame'] = df
 | 
						|
        # with pytest.raises(ValueError):
 | 
						|
        #     store.select('frame', [crit1, crit2])
 | 
						|
 | 
						|
 | 
						|
def test_frame_select_complex(setup_path):
 | 
						|
    # select via complex criteria
 | 
						|
 | 
						|
    df = tm.makeTimeDataFrame()
 | 
						|
    df["string"] = "foo"
 | 
						|
    df.loc[df.index[0:4], "string"] = "bar"
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.put("df", df, format="table", data_columns=["string"])
 | 
						|
 | 
						|
        # empty
 | 
						|
        result = store.select("df", 'index>df.index[3] & string="bar"')
 | 
						|
        expected = df.loc[(df.index > df.index[3]) & (df.string == "bar")]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("df", 'index>df.index[3] & string="foo"')
 | 
						|
        expected = df.loc[(df.index > df.index[3]) & (df.string == "foo")]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # or
 | 
						|
        result = store.select("df", 'index>df.index[3] | string="bar"')
 | 
						|
        expected = df.loc[(df.index > df.index[3]) | (df.string == "bar")]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select(
 | 
						|
            "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"'
 | 
						|
        )
 | 
						|
        expected = df.loc[
 | 
						|
            ((df.index > df.index[3]) & (df.index <= df.index[6]))
 | 
						|
            | (df.string == "bar")
 | 
						|
        ]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # invert
 | 
						|
        result = store.select("df", 'string!="bar"')
 | 
						|
        expected = df.loc[df.string != "bar"]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # invert not implemented in numexpr :(
 | 
						|
        msg = "cannot use an invert condition when passing to numexpr"
 | 
						|
        with pytest.raises(NotImplementedError, match=msg):
 | 
						|
            store.select("df", '~(string="bar")')
 | 
						|
 | 
						|
        # invert ok for filters
 | 
						|
        result = store.select("df", "~(columns=['A','B'])")
 | 
						|
        expected = df.loc[:, df.columns.difference(["A", "B"])]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # in
 | 
						|
        result = store.select("df", "index>df.index[3] & columns in ['A','B']")
 | 
						|
        expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_frame_select_complex2(setup_path):
 | 
						|
 | 
						|
    with ensure_clean_path(["params.hdf", "hist.hdf"]) as paths:
 | 
						|
 | 
						|
        pp, hh = paths
 | 
						|
 | 
						|
        # use non-trivial selection criteria
 | 
						|
        params = DataFrame({"A": [1, 1, 2, 2, 3]})
 | 
						|
        params.to_hdf(pp, "df", mode="w", format="table", data_columns=["A"])
 | 
						|
 | 
						|
        selection = read_hdf(pp, "df", where="A=[2,3]")
 | 
						|
        hist = DataFrame(
 | 
						|
            np.random.randn(25, 1),
 | 
						|
            columns=["data"],
 | 
						|
            index=MultiIndex.from_tuples(
 | 
						|
                [(i, j) for i in range(5) for j in range(5)], names=["l1", "l2"]
 | 
						|
            ),
 | 
						|
        )
 | 
						|
 | 
						|
        hist.to_hdf(hh, "df", mode="w", format="table")
 | 
						|
 | 
						|
        expected = read_hdf(hh, "df", where="l1=[2, 3, 4]")
 | 
						|
 | 
						|
        # scope with list like
 | 
						|
        l0 = selection.index.tolist()  # noqa:F841
 | 
						|
        store = HDFStore(hh)
 | 
						|
        result = store.select("df", where="l1=l0")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        store.close()
 | 
						|
 | 
						|
        result = read_hdf(hh, "df", where="l1=l0")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # index
 | 
						|
        index = selection.index  # noqa:F841
 | 
						|
        result = read_hdf(hh, "df", where="l1=index")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = read_hdf(hh, "df", where="l1=selection.index")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = read_hdf(hh, "df", where="l1=selection.index.tolist()")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = read_hdf(hh, "df", where="l1=list(selection.index)")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # scope with index
 | 
						|
        store = HDFStore(hh)
 | 
						|
 | 
						|
        result = store.select("df", where="l1=index")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("df", where="l1=selection.index")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("df", where="l1=selection.index.tolist()")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("df", where="l1=list(selection.index)")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        store.close()
 | 
						|
 | 
						|
 | 
						|
def test_invalid_filtering(setup_path):
 | 
						|
 | 
						|
    # can't use more than one filter (atm)
 | 
						|
 | 
						|
    df = tm.makeTimeDataFrame()
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.put("df", df, format="table")
 | 
						|
 | 
						|
        msg = "unable to collapse Joint Filters"
 | 
						|
        # not implemented
 | 
						|
        with pytest.raises(NotImplementedError, match=msg):
 | 
						|
            store.select("df", "columns=['A'] | columns=['B']")
 | 
						|
 | 
						|
        # in theory we could deal with this
 | 
						|
        with pytest.raises(NotImplementedError, match=msg):
 | 
						|
            store.select("df", "columns=['A','B'] & columns=['C']")
 | 
						|
 | 
						|
 | 
						|
def test_string_select(setup_path):
 | 
						|
    # GH 2973
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        df = tm.makeTimeDataFrame()
 | 
						|
 | 
						|
        # test string ==/!=
 | 
						|
        df["x"] = "none"
 | 
						|
        df.loc[df.index[2:7], "x"] = ""
 | 
						|
 | 
						|
        store.append("df", df, data_columns=["x"])
 | 
						|
 | 
						|
        result = store.select("df", "x=none")
 | 
						|
        expected = df[df.x == "none"]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("df", "x!=none")
 | 
						|
        expected = df[df.x != "none"]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        df2 = df.copy()
 | 
						|
        df2.loc[df2.x == "", "x"] = np.nan
 | 
						|
 | 
						|
        store.append("df2", df2, data_columns=["x"])
 | 
						|
        result = store.select("df2", "x!=none")
 | 
						|
        expected = df2[isna(df2.x)]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # int ==/!=
 | 
						|
        df["int"] = 1
 | 
						|
        df.loc[df.index[2:7], "int"] = 2
 | 
						|
 | 
						|
        store.append("df3", df, data_columns=["int"])
 | 
						|
 | 
						|
        result = store.select("df3", "int=2")
 | 
						|
        expected = df[df.int == 2]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = store.select("df3", "int!=2")
 | 
						|
        expected = df[df.int != 2]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_select_as_multiple(setup_path):
 | 
						|
 | 
						|
    df1 = tm.makeTimeDataFrame()
 | 
						|
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
 | 
						|
    df2["foo"] = "bar"
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        msg = "keys must be a list/tuple"
 | 
						|
        # no tables stored
 | 
						|
        with pytest.raises(TypeError, match=msg):
 | 
						|
            store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
 | 
						|
 | 
						|
        store.append("df1", df1, data_columns=["A", "B"])
 | 
						|
        store.append("df2", df2)
 | 
						|
 | 
						|
        # exceptions
 | 
						|
        with pytest.raises(TypeError, match=msg):
 | 
						|
            store.select_as_multiple(None, where=["A>0", "B>0"], selector="df1")
 | 
						|
 | 
						|
        with pytest.raises(TypeError, match=msg):
 | 
						|
            store.select_as_multiple([None], where=["A>0", "B>0"], selector="df1")
 | 
						|
 | 
						|
        msg = "'No object named df3 in the file'"
 | 
						|
        with pytest.raises(KeyError, match=msg):
 | 
						|
            store.select_as_multiple(
 | 
						|
                ["df1", "df3"], where=["A>0", "B>0"], selector="df1"
 | 
						|
            )
 | 
						|
 | 
						|
        with pytest.raises(KeyError, match=msg):
 | 
						|
            store.select_as_multiple(["df3"], where=["A>0", "B>0"], selector="df1")
 | 
						|
 | 
						|
        with pytest.raises(KeyError, match="'No object named df4 in the file'"):
 | 
						|
            store.select_as_multiple(
 | 
						|
                ["df1", "df2"], where=["A>0", "B>0"], selector="df4"
 | 
						|
            )
 | 
						|
 | 
						|
        # default select
 | 
						|
        result = store.select("df1", ["A>0", "B>0"])
 | 
						|
        expected = store.select_as_multiple(
 | 
						|
            ["df1"], where=["A>0", "B>0"], selector="df1"
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        expected = store.select_as_multiple("df1", where=["A>0", "B>0"], selector="df1")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # multiple
 | 
						|
        result = store.select_as_multiple(
 | 
						|
            ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
 | 
						|
        )
 | 
						|
        expected = concat([df1, df2], axis=1)
 | 
						|
        expected = expected[(expected.A > 0) & (expected.B > 0)]
 | 
						|
        tm.assert_frame_equal(result, expected, check_freq=False)
 | 
						|
        # FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds
 | 
						|
 | 
						|
        # multiple (diff selector)
 | 
						|
        result = store.select_as_multiple(
 | 
						|
            ["df1", "df2"], where="index>df2.index[4]", selector="df2"
 | 
						|
        )
 | 
						|
        expected = concat([df1, df2], axis=1)
 | 
						|
        expected = expected[5:]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # test exception for diff rows
 | 
						|
        store.append("df3", tm.makeTimeDataFrame(nper=50))
 | 
						|
        msg = "all tables must have exactly the same nrows!"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            store.select_as_multiple(
 | 
						|
                ["df1", "df3"], where=["A>0", "B>0"], selector="df1"
 | 
						|
            )
 | 
						|
 | 
						|
 | 
						|
def test_nan_selection_bug_4858(setup_path):
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
 | 
						|
        df = DataFrame({"cols": range(6), "values": range(6)}, dtype="float64")
 | 
						|
        df["cols"] = (df["cols"] + 10).apply(str)
 | 
						|
        df.iloc[0] = np.nan
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {"cols": ["13.0", "14.0", "15.0"], "values": [3.0, 4.0, 5.0]},
 | 
						|
            index=[3, 4, 5],
 | 
						|
        )
 | 
						|
 | 
						|
        # write w/o the index on that particular column
 | 
						|
        store.append("df", df, data_columns=True, index=["cols"])
 | 
						|
        result = store.select("df", where="values>2.0")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_query_with_nested_special_character(setup_path):
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
 | 
						|
            "b": [1, 2, 3, 4, 5, 6, 7, 8],
 | 
						|
        }
 | 
						|
    )
 | 
						|
    expected = df[df.a == "test & test"]
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.append("test", df, format="table", data_columns=True)
 | 
						|
        result = store.select("test", 'a = "test & test"')
 | 
						|
    tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_query_long_float_literal(setup_path):
 | 
						|
    # GH 14241
 | 
						|
    df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.append("test", df, format="table", data_columns=True)
 | 
						|
 | 
						|
        cutoff = 1000000000.0006
 | 
						|
        result = store.select("test", f"A < {cutoff:.4f}")
 | 
						|
        assert result.empty
 | 
						|
 | 
						|
        cutoff = 1000000000.0010
 | 
						|
        result = store.select("test", f"A > {cutoff:.4f}")
 | 
						|
        expected = df.loc[[1, 2], :]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        exact = 1000000000.0011
 | 
						|
        result = store.select("test", f"A == {exact:.4f}")
 | 
						|
        expected = df.loc[[1], :]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_query_compare_column_type(setup_path):
 | 
						|
    # GH 15492
 | 
						|
    df = DataFrame(
 | 
						|
        {
 | 
						|
            "date": ["2014-01-01", "2014-01-02"],
 | 
						|
            "real_date": date_range("2014-01-01", periods=2),
 | 
						|
            "float": [1.1, 1.2],
 | 
						|
            "int": [1, 2],
 | 
						|
        },
 | 
						|
        columns=["date", "real_date", "float", "int"],
 | 
						|
    )
 | 
						|
 | 
						|
    with ensure_clean_store(setup_path) as store:
 | 
						|
        store.append("test", df, format="table", data_columns=True)
 | 
						|
 | 
						|
        ts = Timestamp("2014-01-01")  # noqa:F841
 | 
						|
        result = store.select("test", where="real_date > ts")
 | 
						|
        expected = df.loc[[1], :]
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
        for op in ["<", ">", "=="]:
 | 
						|
            # non strings to string column always fail
 | 
						|
            for v in [2.1, True, Timestamp("2014-01-01"), pd.Timedelta(1, "s")]:
 | 
						|
                query = f"date {op} v"
 | 
						|
                msg = f"Cannot compare {v} of type {type(v)} to string column"
 | 
						|
                with pytest.raises(TypeError, match=msg):
 | 
						|
                    store.select("test", where=query)
 | 
						|
 | 
						|
            # strings to other columns must be convertible to type
 | 
						|
            v = "a"
 | 
						|
            for col in ["int", "float", "real_date"]:
 | 
						|
                query = f"{col} {op} v"
 | 
						|
                msg = "could not convert string to "
 | 
						|
                with pytest.raises(ValueError, match=msg):
 | 
						|
                    store.select("test", where=query)
 | 
						|
 | 
						|
            for v, col in zip(
 | 
						|
                ["1", "1.1", "2014-01-01"], ["int", "float", "real_date"]
 | 
						|
            ):
 | 
						|
                query = f"{col} {op} v"
 | 
						|
                result = store.select("test", where=query)
 | 
						|
 | 
						|
                if op == "==":
 | 
						|
                    expected = df.loc[[0], :]
 | 
						|
                elif op == ">":
 | 
						|
                    expected = df.loc[[1], :]
 | 
						|
                else:
 | 
						|
                    expected = df.loc[[], :]
 | 
						|
                tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("where", ["", (), (None,), [], [None]])
 | 
						|
def test_select_empty_where(where):
 | 
						|
    # GH26610
 | 
						|
 | 
						|
    df = DataFrame([1, 2, 3])
 | 
						|
    with ensure_clean_path("empty_where.h5") as path:
 | 
						|
        with HDFStore(path) as store:
 | 
						|
            store.put("df", df, "t")
 | 
						|
            result = read_hdf(store, "df", where=where)
 | 
						|
            tm.assert_frame_equal(result, df)
 |