Feature-Extraction/dist/client/pandas/tests/frame/conftest.py

import numpy as np
import pytest

from pandas import (
    DataFrame,
    NaT,
    date_range,
)
import pandas._testing as tm


@pytest.fixture
def float_frame_with_na():
    """
    Fixture for DataFrame of floats with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                       A         B         C         D
    ABwBzA0ljw -1.128865 -0.897161  0.046603  0.274997
    DJiRzmbyQF  0.728869  0.233502  0.722431 -0.890872
    neMgPD5UBF  0.486072 -1.027393 -0.031553  1.449522
    0yWA4n8VeX -1.937191 -1.142531  0.805215 -0.462018
    3slYUbbqU1  0.153260  1.164691  1.489795 -0.545826
    soujjZ0A08       NaN       NaN       NaN       NaN
    7W6NLGsjB9       NaN       NaN       NaN       NaN
    ...              ...       ...       ...       ...
    uhfeaNkCR1 -0.231210 -0.340472  0.244717 -0.901590
    n6p7GYuBIV -0.419052  1.922721 -0.125361 -0.727717
    ZhzAeY6p1y  1.234374 -1.425359 -0.827038 -0.633189
    uWdPsORyUh  0.046738 -0.980445 -1.102965  0.605503
    3DJA6aN590 -0.091018 -1.684734 -1.100900  0.215947
    2GBPAzdbMk -2.883405 -1.021071  1.209877  1.633083
    sHadBoyVHw -2.223032 -0.326384  0.258931  0.245517

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData())
    # set some NAs
    df.iloc[5:10] = np.nan
    df.iloc[15:20, -2:] = np.nan
    return df


@pytest.fixture
def bool_frame_with_na():
    """
    Fixture for DataFrame of booleans with index of unique strings

    Columns are ['A', 'B', 'C', 'D']; some entries are missing

                    A      B      C      D
    zBZxY2IDGd  False  False  False  False
    IhBWBMWllt  False   True   True   True
    ctjdvZSR6R   True  False   True   True
    AVTujptmxb  False   True  False   True
    G9lrImrSWq  False  False  False   True
    sFFwdIUfz2    NaN    NaN    NaN    NaN
    s15ptEJnRb    NaN    NaN    NaN    NaN
    ...           ...    ...    ...    ...
    UW41KkDyZ4   True   True  False  False
    l9l6XkOdqV   True  False  False  False
    X2MeZfzDYA  False   True  False  False
    xWkIKU7vfX  False   True  False   True
    QOhL6VmpGU  False  False  False   True
    22PwkRJdat  False   True  False  False
    kfboQ3VeIK   True  False   True  False

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData()) > 0
    df = df.astype(object)
    # set some NAs
    df.iloc[5:10] = np.nan
    df.iloc[15:20, -2:] = np.nan

    # For `any` tests we need to have at least one True before the first NaN
    #  in each column
    for i in range(4):
        df.iloc[i, i] = True
    return df


@pytest.fixture
def float_string_frame():
    """
    Fixture for DataFrame of floats and strings with index of unique strings

    Columns are ['A', 'B', 'C', 'D', 'foo'].

                       A         B         C         D  foo
    w3orJvq07g -1.594062 -1.084273 -1.252457  0.356460  bar
    PeukuVdmz2  0.109855 -0.955086 -0.809485  0.409747  bar
    ahp2KvwiM8 -1.533729 -0.142519 -0.154666  1.302623  bar
    3WSJ7BUCGd  2.484964  0.213829  0.034778 -2.327831  bar
    khdAmufk0U -0.193480 -0.743518 -0.077987  0.153646  bar
    LE2DZiFlrE -0.193566 -1.343194 -0.107321  0.959978  bar
    HJXSJhVn7b  0.142590  1.257603 -0.659409 -0.223844  bar
    ...              ...       ...       ...       ...  ...
    9a1Vypttgw -1.316394  1.601354  0.173596  1.213196  bar
    h5d1gVFbEy  0.609475  1.106738 -0.155271  0.294630  bar
    mK9LsTQG92  1.303613  0.857040 -1.019153  0.369468  bar
    oOLksd9gKH  0.558219 -0.134491 -0.289869 -0.951033  bar
    9jgoOjKyHg  0.058270 -0.496110 -0.413212 -0.852659  bar
    jZLDHclHAO  0.096298  1.267510  0.549206 -0.005235  bar
    lR0nxDp1C2 -2.119350 -0.794384  0.544118  0.145849  bar

    [30 rows x 5 columns]
    """
    df = DataFrame(tm.getSeriesData())
    df["foo"] = "bar"
    return df


@pytest.fixture
def mixed_float_frame():
    """
    Fixture for DataFrame of different float types with index of unique strings

    Columns are ['A', 'B', 'C', 'D'].

                       A         B         C         D
    GI7bbDaEZe -0.237908 -0.246225 -0.468506  0.752993
    KGp9mFepzA -1.140809 -0.644046 -1.225586  0.801588
    VeVYLAb1l2 -1.154013 -1.677615  0.690430 -0.003731
    kmPME4WKhO  0.979578  0.998274 -0.776367  0.897607
    CPyopdXTiz  0.048119 -0.257174  0.836426  0.111266
    0kJZQndAj0  0.274357 -0.281135 -0.344238  0.834541
    tqdwQsaHG8 -0.979716 -0.519897  0.582031  0.144710
    ...              ...       ...       ...       ...
    7FhZTWILQj -2.906357  1.261039 -0.780273 -0.537237
    4pUDPM4eGq -2.042512 -0.464382 -0.382080  1.132612
    B8dUgUzwTi -1.506637 -0.364435  1.087891  0.297653
    hErlVYjVv9  1.477453 -0.495515 -0.713867  1.438427
    1BKN3o7YLs  0.127535 -0.349812 -0.881836  0.489827
    9S4Ekn7zga  1.445518 -2.095149  0.031982  0.373204
    xN1dNn6OV6  1.425017 -0.983995 -0.363281 -0.224502

    [30 rows x 4 columns]
    """
    df = DataFrame(tm.getSeriesData())
    df.A = df.A.astype("float32")
    df.B = df.B.astype("float32")
    df.C = df.C.astype("float16")
    df.D = df.D.astype("float64")
    return df


@pytest.fixture
def mixed_int_frame():
    """
    Fixture for DataFrame of different int types with index of unique strings

    Columns are ['A', 'B', 'C', 'D'].

                A  B    C    D
    mUrCZ67juP  0  1    2    2
    rw99ACYaKS  0  1    0    0
    7QsEcpaaVU  0  1    1    1
    xkrimI2pcE  0  1    0    0
    dz01SuzoS8  0  1  255  255
    ccQkqOHX75 -1  1    0    0
    DN0iXaoDLd  0  1    0    0
    ...        .. ..  ...  ...
    Dfb141wAaQ  1  1  254  254
    IPD8eQOVu5  0  1    0    0
    CcaKulsCmv  0  1    0    0
    rIBa8gu7E5  0  1    0    0
    RP6peZmh5o  0  1    1    1
    NMb9pipQWQ  0  1    0    0
    PqgbJEzjib  0  1    3    3

    [30 rows x 4 columns]
    """
    df = DataFrame({k: v.astype(int) for k, v in tm.getSeriesData().items()})
    df.A = df.A.astype("int32")
    df.B = np.ones(len(df.B), dtype="uint64")
    df.C = df.C.astype("uint8")
    df.D = df.C.astype("int64")
    return df


@pytest.fixture
def timezone_frame():
    """
    Fixture for DataFrame of date_range Series with different time zones

    Columns are ['A', 'B', 'C']; some entries are missing

               A                         B                         C
    0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00
    1 2013-01-02                       NaT                       NaT
    2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00
    """
    df = DataFrame(
        {
            "A": date_range("20130101", periods=3),
            "B": date_range("20130101", periods=3, tz="US/Eastern"),
            "C": date_range("20130101", periods=3, tz="CET"),
        }
    )
    df.iloc[1, 1] = NaT
    df.iloc[1, 2] = NaT
    return df


@pytest.fixture
def uint64_frame():
    """
    Fixture for DataFrame with uint64 values

    Columns are ['A', 'B']
    """
    return DataFrame(
        {"A": np.arange(3), "B": [2**63, 2**63 + 5, 2**63 + 10]}, dtype=np.uint64
    )


@pytest.fixture
def simple_frame():
    """
    Fixture for simple 3x3 DataFrame

    Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].

       one  two  three
    a  1.0  2.0    3.0
    b  4.0  5.0    6.0
    c  7.0  8.0    9.0
    """
    arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])

    return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])


@pytest.fixture
def frame_of_index_cols():
    """
    Fixture for DataFrame of columns that can be used for indexing

    Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
    'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.

         A      B  C         D         E  (tuple, as, label)
    0  foo    one  a  0.608477 -0.012500           -1.664297
    1  foo    two  b -0.633460  0.249614           -0.364411
    2  foo  three  c  0.615256  2.154968           -0.834666
    3  bar    one  d  0.234246  1.085675            0.718445
    4  bar    two  e  0.533841 -0.005702           -3.533912
    """
    df = DataFrame(
        {
            "A": ["foo", "foo", "foo", "bar", "bar"],
            "B": ["one", "two", "three", "one", "two"],
            "C": ["a", "b", "c", "d", "e"],
            "D": np.random.randn(5),
            "E": np.random.randn(5),
            ("tuple", "as", "label"): np.random.randn(5),
        }
    )
    return df


@pytest.fixture(
    params=[
        "any",
        "all",
        "count",
        "sum",
        "prod",
        "max",
        "min",
        "mean",
        "median",
        "skew",
        "kurt",
        "sem",
        "var",
        "std",
        "mad",
    ]
)
def reduction_functions(request):
    return request.param