223 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			223 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
from pandas import (
 | 
						|
    Categorical,
 | 
						|
    DataFrame,
 | 
						|
    Series,
 | 
						|
    _testing as tm,
 | 
						|
    concat,
 | 
						|
    read_hdf,
 | 
						|
)
 | 
						|
from pandas.tests.io.pytables.common import (
 | 
						|
    _maybe_remove,
 | 
						|
    ensure_clean_path,
 | 
						|
    ensure_clean_store,
 | 
						|
)
 | 
						|
 | 
						|
# Marks listed here are picked up by pytest for every test in this module.
pytestmark = [
    pytest.mark.single_cpu,
    # Silence the "closed node found in the registry" UserWarning; tracked
    # upstream at https://github.com/PyTables/PyTables/issues/822
    pytest.mark.filterwarnings(
        "ignore:a closed node found in the registry:UserWarning"
    ),
]
 | 
						|
 | 
						|
 | 
						|
def test_categorical(setup_path):
    """Round-trip categorical Series and DataFrames through an HDFStore.

    Exercises, in order: plain append/select of unordered and ordered
    categoricals, numeric categoricals (with and without NaN), a frame with
    several categorical columns plus its stored metadata nodes, ``where``
    queries on a categorical data column, appending with matching and with
    mismatching categories, and removal of the metadata together with the
    parent node.
    """
    labels = ["a", "b", "b", "a", "a", "c"]
    levels = ["a", "b", "c", "d"]

    with ensure_clean_store(setup_path) as store:
        # Basic: the unordered categorical first, then the ordered one
        for key, is_ordered in (("s", False), ("s_ordered", True)):
            _maybe_remove(store, key)
            ser = Series(Categorical(labels, categories=levels, ordered=is_ordered))
            store.append(key, ser, format="table")
            roundtripped = store.select(key)
            tm.assert_series_equal(ser, roundtripped)

        # ``ser`` is the ordered categorical left over from the loop above
        _maybe_remove(store, "df")
        frame = DataFrame({"s": ser, "vals": [1, 2, 3, 4, 5, 6]})
        store.append("df", frame, format="table")
        roundtripped = store.select("df")
        tm.assert_frame_equal(roundtripped, frame)

        # Dtypes: integer-valued categories, then the same with a NaN entry
        numeric_cases = (
            ("si", [1, 1, 2, 2, 3, 4, 5]),
            ("si2", [1, 1, np.nan, 2, 3, 4, 5]),
        )
        for key, raw in numeric_cases:
            _maybe_remove(store, key)
            ser = Series(raw).astype("category")
            store.append(key, ser)
            roundtripped = store.select(key)
            tm.assert_series_equal(roundtripped, ser)

        # Multiple categorical columns in one frame
        _maybe_remove(store, "df2")
        wide = frame.copy()
        wide["s2"] = Series(list("abcdefg")).astype("category")
        store.append("df2", wide)
        roundtripped = store.select("df2")
        tm.assert_frame_equal(roundtripped, wide)

        # Make sure the metadata is OK
        # wide._mgr.blocks[0] and wide._mgr.blocks[2] are Categorical
        info = store.info()
        for node in (
            "/df2   ",
            "/df2/meta/values_block_0/meta",
            "/df2/meta/values_block_2/meta",
        ):
            assert node in info

        # unordered
        _maybe_remove(store, "s2")
        ser = Series(Categorical(labels, categories=levels, ordered=False))
        store.append("s2", ser, format="table")
        roundtripped = store.select("s2")
        tm.assert_series_equal(roundtripped, ser)

        # Query: each where-clause is paired with the values it should keep
        _maybe_remove(store, "df3")
        store.append("df3", frame, data_columns=["s"])
        query_cases = (
            ('s in ["b","c"]', ["b", "c"]),
            ('s = ["b","c"]', ["b", "c"]),
            ('s in ["d"]', ["d"]),
            ('s in ["f"]', ["f"]),
        )
        for clause, wanted in query_cases:
            expected = frame[frame.s.isin(wanted)]
            selected = store.select("df3", where=[clause])
            tm.assert_frame_equal(selected, expected)

        # Appending with same categories is ok
        store.append("df3", frame)

        frame = concat([frame, frame])
        expected = frame[frame.s.isin(["b", "c"])]
        selected = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(selected, expected)

        # Appending must have the same categories
        mismatched = frame.copy()
        mismatched["s"] = mismatched["s"].cat.remove_unused_categories()

        msg = "cannot append a categorical with different categories to the existing"
        with pytest.raises(ValueError, match=msg):
            store.append("df3", mismatched)

        # Remove, and make sure meta data is removed (it's a recursive
        # removal so should be).
        meta = store.select("df3/meta/s/meta")
        assert meta is not None
        store.remove("df3")

        with pytest.raises(
            KeyError, match="'No object named df3/meta/s/meta in the file'"
        ):
            store.select("df3/meta/s/meta")
 | 
						|
 | 
						|
 | 
						|
def test_categorical_conversion(setup_path):
    """Check that an unmatched ``where`` yields an empty frame (GH13322).

    The same two-row frame is written and queried twice: once with plain
    string columns and once with ``obsids``/``imgids`` converted to
    categoricals. In both cases ``read_hdf`` with ``where="obsids=B"`` is
    expected to equal the zero-row slice of the frame that was written.
    """
    # GH13322
    # Check that read_hdf with categorical columns doesn't return rows if
    # where criteria isn't met.
    obsids = ["ESP_012345_6789", "ESP_987654_3210"]
    imgids = ["APF00006np", "APF0001imm"]
    data = [4.3, 9.8]

    # Test without categories
    df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    with ensure_clean_path(setup_path) as path:
        # ``key`` is passed by keyword: newer pandas releases deprecate
        # positional arguments to ``to_hdf`` other than the path.
        df.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df", where="obsids=B")
        tm.assert_frame_equal(result, expected)

    # Test with categories
    df["obsids"] = df["obsids"].astype("category")
    df["imgids"] = df["imgids"].astype("category")

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df", where="obsids=B")
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_categorical_nan_only_columns(setup_path):
    """Check that NaN-only categorical columns survive an HDF round trip.

    Builds a frame with one ordinary categorical column, categorical columns
    holding only missing values, and an integer column; writes it in table
    format with every column as a data column; and asserts that ``read_hdf``
    returns an equal frame.
    """
    # GH18413
    # Check that read_hdf with categorical columns with NaN-only values can
    # be read back.
    df = DataFrame(
        {
            "a": ["a", "b", "c", np.nan],
            "b": [np.nan, np.nan, np.nan, np.nan],
            "c": [1, 2, 3, 4],
            "d": Series([None] * 4, dtype=object),
        }
    )
    df["a"] = df.a.astype("category")
    df["b"] = df.b.astype("category")
    # NOTE(review): "d" is overwritten with the categorical built from "b",
    # not from "d" itself. Kept exactly as written -- confirm whether this is
    # intended before changing it, since it affects the expected dtype.
    df["d"] = df.b.astype("category")
    expected = df
    with ensure_clean_path(setup_path) as path:
        # ``key`` is passed by keyword: newer pandas releases deprecate
        # positional arguments to ``to_hdf`` other than the path.
        df.to_hdf(path, key="df", format="table", data_columns=True)
        result = read_hdf(path, "df")
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
    "where, df, expected",
    [
        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
    ],
)
def test_convert_value(setup_path, where: str, df: DataFrame, expected: DataFrame):
    """Check that ``read_hdf`` can filter a categorical column (GH39420).

    Parameters
    ----------
    setup_path
        Fixture value handed to ``ensure_clean_path``.
    where : str
        Selection expression passed to ``read_hdf``.
    df : DataFrame
        Frame to store; its ``col`` column is converted to categorical.
    expected : DataFrame
        Rows expected back; ``col`` is converted to a categorical carrying
        the sorted unique values of ``df.col`` as its categories.
    """
    # GH39420
    # Check that read_hdf with categorical columns can filter by where condition.
    # Build new frames rather than assigning into the arguments: the objects
    # in the parametrize list are created once, when the module is imported.
    df = df.assign(col=df["col"].astype("category"))
    max_widths = {"col": 1}
    categorical_values = sorted(df.col.unique())
    expected = expected.assign(
        col=expected["col"].astype("category").cat.set_categories(categorical_values)
    )

    with ensure_clean_path(setup_path) as path:
        # ``key`` is passed by keyword: newer pandas releases deprecate
        # positional arguments to ``to_hdf`` other than the path.
        df.to_hdf(path, key="df", format="table", min_itemsize=max_widths)
        result = read_hdf(path, where=where)
        tm.assert_frame_equal(result, expected)
 |