2654 lines
		
	
	
		
			90 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			2654 lines
		
	
	
		
			90 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from datetime import (
 | 
						|
    date,
 | 
						|
    datetime,
 | 
						|
    timedelta,
 | 
						|
)
 | 
						|
import random
 | 
						|
import re
 | 
						|
 | 
						|
import numpy as np
 | 
						|
import pytest
 | 
						|
 | 
						|
from pandas.core.dtypes.common import (
 | 
						|
    is_categorical_dtype,
 | 
						|
    is_object_dtype,
 | 
						|
)
 | 
						|
from pandas.core.dtypes.dtypes import CategoricalDtype
 | 
						|
 | 
						|
import pandas as pd
 | 
						|
from pandas import (
 | 
						|
    Categorical,
 | 
						|
    CategoricalIndex,
 | 
						|
    DataFrame,
 | 
						|
    DatetimeIndex,
 | 
						|
    IntervalIndex,
 | 
						|
    MultiIndex,
 | 
						|
    PeriodIndex,
 | 
						|
    RangeIndex,
 | 
						|
    Series,
 | 
						|
    TimedeltaIndex,
 | 
						|
)
 | 
						|
import pandas._testing as tm
 | 
						|
from pandas.api.types import CategoricalDtype as CDT
 | 
						|
from pandas.core.api import (
 | 
						|
    Float64Index,
 | 
						|
    Int64Index,
 | 
						|
    UInt64Index,
 | 
						|
)
 | 
						|
from pandas.core.reshape.concat import concat
 | 
						|
from pandas.core.reshape.merge import (
 | 
						|
    MergeError,
 | 
						|
    merge,
 | 
						|
)
 | 
						|
 | 
						|
N = 50
 | 
						|
NGROUPS = 8
 | 
						|
 | 
						|
 | 
						|
def get_test_data(ngroups=NGROUPS, n=N):
 | 
						|
    unique_groups = list(range(ngroups))
 | 
						|
    arr = np.asarray(np.tile(unique_groups, n // ngroups))
 | 
						|
 | 
						|
    if len(arr) < n:
 | 
						|
        arr = np.asarray(list(arr) + unique_groups[: n - len(arr)])
 | 
						|
 | 
						|
    random.shuffle(arr)
 | 
						|
    return arr
 | 
						|
 | 
						|
 | 
						|
def get_series():
 | 
						|
    return [
 | 
						|
        Series([1], dtype="int64"),
 | 
						|
        Series([1], dtype="Int64"),
 | 
						|
        Series([1.23]),
 | 
						|
        Series(["foo"]),
 | 
						|
        Series([True]),
 | 
						|
        Series([pd.Timestamp("2018-01-01")]),
 | 
						|
        Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]),
 | 
						|
    ]
 | 
						|
 | 
						|
 | 
						|
def get_series_na():
 | 
						|
    return [
 | 
						|
        Series([np.nan], dtype="Int64"),
 | 
						|
        Series([np.nan], dtype="float"),
 | 
						|
        Series([np.nan], dtype="object"),
 | 
						|
        Series([pd.NaT]),
 | 
						|
    ]
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
 | 
						|
def series_of_dtype(request):
 | 
						|
    """
 | 
						|
    A parametrized fixture returning a variety of Series of different
 | 
						|
    dtypes
 | 
						|
    """
 | 
						|
    return request.param
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
 | 
						|
def series_of_dtype2(request):
 | 
						|
    """
 | 
						|
    A duplicate of the series_of_dtype fixture, so that it can be used
 | 
						|
    twice by a single function
 | 
						|
    """
 | 
						|
    return request.param
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name)
 | 
						|
def series_of_dtype_all_na(request):
 | 
						|
    """
 | 
						|
    A parametrized fixture returning a variety of Series with all NA
 | 
						|
    values
 | 
						|
    """
 | 
						|
    return request.param
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def dfs_for_indicator():
 | 
						|
    df1 = DataFrame({"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]})
 | 
						|
    df2 = DataFrame(
 | 
						|
        {
 | 
						|
            "col1": [1, 2, 3, 4, 5],
 | 
						|
            "col_conflict": [1, 2, 3, 4, 5],
 | 
						|
            "col_right": [2, 2, 2, 2, 2],
 | 
						|
        }
 | 
						|
    )
 | 
						|
    return df1, df2
 | 
						|
 | 
						|
 | 
						|
class TestMerge:
 | 
						|
    def setup_method(self, method):
 | 
						|
        # aggregate multiple columns
 | 
						|
        self.df = DataFrame(
 | 
						|
            {
 | 
						|
                "key1": get_test_data(),
 | 
						|
                "key2": get_test_data(),
 | 
						|
                "data1": np.random.randn(N),
 | 
						|
                "data2": np.random.randn(N),
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        # exclude a couple keys for fun
 | 
						|
        self.df = self.df[self.df["key2"] > 1]
 | 
						|
 | 
						|
        self.df2 = DataFrame(
 | 
						|
            {
 | 
						|
                "key1": get_test_data(n=N // 5),
 | 
						|
                "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
 | 
						|
                "value": np.random.randn(N // 5),
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        self.left = DataFrame(
 | 
						|
            {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
 | 
						|
        )
 | 
						|
        self.right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
 | 
						|
 | 
						|
    def test_merge_inner_join_empty(self):
 | 
						|
        # GH 15328
 | 
						|
        df_empty = DataFrame()
 | 
						|
        df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
 | 
						|
        result = merge(df_empty, df_a, left_index=True, right_index=True)
 | 
						|
        expected = DataFrame({"a": []}, index=[], dtype="int64")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_common(self):
 | 
						|
        joined = merge(self.df, self.df2)
 | 
						|
        exp = merge(self.df, self.df2, on=["key1", "key2"])
 | 
						|
        tm.assert_frame_equal(joined, exp)
 | 
						|
 | 
						|
    def test_merge_non_string_columns(self):
 | 
						|
        # https://github.com/pandas-dev/pandas/issues/17962
 | 
						|
        # Checks that method runs for non string column names
 | 
						|
        left = DataFrame(
 | 
						|
            {0: [1, 0, 1, 0], 1: [0, 1, 0, 0], 2: [0, 0, 2, 0], 3: [1, 0, 0, 3]}
 | 
						|
        )
 | 
						|
 | 
						|
        right = left.astype(float)
 | 
						|
        expected = left
 | 
						|
        result = merge(left, right)
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    def test_merge_index_as_on_arg(self):
 | 
						|
        # GH14355
 | 
						|
 | 
						|
        left = self.df.set_index("key1")
 | 
						|
        right = self.df2.set_index("key1")
 | 
						|
        result = merge(left, right, on="key1")
 | 
						|
        expected = merge(self.df, self.df2, on="key1").set_index("key1")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_index_singlekey_right_vs_left(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
 | 
						|
        )
 | 
						|
        right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
 | 
						|
 | 
						|
        merged1 = merge(
 | 
						|
            left, right, left_on="key", right_index=True, how="left", sort=False
 | 
						|
        )
 | 
						|
        merged2 = merge(
 | 
						|
            right, left, right_on="key", left_index=True, how="right", sort=False
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
 | 
						|
 | 
						|
        merged1 = merge(
 | 
						|
            left, right, left_on="key", right_index=True, how="left", sort=True
 | 
						|
        )
 | 
						|
        merged2 = merge(
 | 
						|
            right, left, right_on="key", left_index=True, how="right", sort=True
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
 | 
						|
 | 
						|
    def test_merge_index_singlekey_inner(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
 | 
						|
        )
 | 
						|
        right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
 | 
						|
 | 
						|
        # inner join
 | 
						|
        result = merge(left, right, left_on="key", right_index=True, how="inner")
 | 
						|
        expected = left.join(right, on="key").loc[result.index]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = merge(right, left, right_on="key", left_index=True, how="inner")
 | 
						|
        expected = left.join(right, on="key").loc[result.index]
 | 
						|
        tm.assert_frame_equal(result, expected.loc[:, result.columns])
 | 
						|
 | 
						|
    def test_merge_misspecified(self):
 | 
						|
        msg = "Must pass right_on or right_index=True"
 | 
						|
        with pytest.raises(pd.errors.MergeError, match=msg):
 | 
						|
            merge(self.left, self.right, left_index=True)
 | 
						|
        msg = "Must pass left_on or left_index=True"
 | 
						|
        with pytest.raises(pd.errors.MergeError, match=msg):
 | 
						|
            merge(self.left, self.right, right_index=True)
 | 
						|
 | 
						|
        msg = (
 | 
						|
            'Can only pass argument "on" OR "left_on" and "right_on", not '
 | 
						|
            "a combination of both"
 | 
						|
        )
 | 
						|
        with pytest.raises(pd.errors.MergeError, match=msg):
 | 
						|
            merge(self.left, self.left, left_on="key", on="key")
 | 
						|
 | 
						|
        msg = r"len\(right_on\) must equal len\(left_on\)"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(self.df, self.df2, left_on=["key1"], right_on=["key1", "key2"])
 | 
						|
 | 
						|
    def test_index_and_on_parameters_confusion(self):
 | 
						|
        msg = "right_index parameter must be of type bool, not <class 'list'>"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(
 | 
						|
                self.df,
 | 
						|
                self.df2,
 | 
						|
                how="left",
 | 
						|
                left_index=False,
 | 
						|
                right_index=["key1", "key2"],
 | 
						|
            )
 | 
						|
        msg = "left_index parameter must be of type bool, not <class 'list'>"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(
 | 
						|
                self.df,
 | 
						|
                self.df2,
 | 
						|
                how="left",
 | 
						|
                left_index=["key1", "key2"],
 | 
						|
                right_index=False,
 | 
						|
            )
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(
 | 
						|
                self.df,
 | 
						|
                self.df2,
 | 
						|
                how="left",
 | 
						|
                left_index=["key1", "key2"],
 | 
						|
                right_index=["key1", "key2"],
 | 
						|
            )
 | 
						|
 | 
						|
    def test_merge_overlap(self):
 | 
						|
        merged = merge(self.left, self.left, on="key")
 | 
						|
        exp_len = (self.left["key"].value_counts() ** 2).sum()
 | 
						|
        assert len(merged) == exp_len
 | 
						|
        assert "v1_x" in merged
 | 
						|
        assert "v1_y" in merged
 | 
						|
 | 
						|
    def test_merge_different_column_key_names(self):
 | 
						|
        left = DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
 | 
						|
        right = DataFrame({"rkey": ["foo", "bar", "qux", "foo"], "value": [5, 6, 7, 8]})
 | 
						|
 | 
						|
        merged = left.merge(
 | 
						|
            right, left_on="lkey", right_on="rkey", how="outer", sort=True
 | 
						|
        )
 | 
						|
 | 
						|
        exp = Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey")
 | 
						|
        tm.assert_series_equal(merged["lkey"], exp)
 | 
						|
 | 
						|
        exp = Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey")
 | 
						|
        tm.assert_series_equal(merged["rkey"], exp)
 | 
						|
 | 
						|
        exp = Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x")
 | 
						|
        tm.assert_series_equal(merged["value_x"], exp)
 | 
						|
 | 
						|
        exp = Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y")
 | 
						|
        tm.assert_series_equal(merged["value_y"], exp)
 | 
						|
 | 
						|
    def test_merge_copy(self):
 | 
						|
        left = DataFrame({"a": 0, "b": 1}, index=range(10))
 | 
						|
        right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
 | 
						|
 | 
						|
        merged = merge(left, right, left_index=True, right_index=True, copy=True)
 | 
						|
 | 
						|
        merged["a"] = 6
 | 
						|
        assert (left["a"] == 0).all()
 | 
						|
 | 
						|
        merged["d"] = "peekaboo"
 | 
						|
        assert (right["d"] == "bar").all()
 | 
						|
 | 
						|
    def test_merge_nocopy(self, using_array_manager):
 | 
						|
        left = DataFrame({"a": 0, "b": 1}, index=range(10))
 | 
						|
        right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
 | 
						|
 | 
						|
        merged = merge(left, right, left_index=True, right_index=True, copy=False)
 | 
						|
 | 
						|
        assert np.shares_memory(merged["a"]._values, left["a"]._values)
 | 
						|
        assert np.shares_memory(merged["d"]._values, right["d"]._values)
 | 
						|
 | 
						|
    def test_intelligently_handle_join_key(self):
 | 
						|
        # #733, be a bit more 1337 about not returning unconsolidated DataFrame
 | 
						|
 | 
						|
        left = DataFrame(
 | 
						|
            {"key": [1, 1, 2, 2, 3], "value": list(range(5))}, columns=["value", "key"]
 | 
						|
        )
 | 
						|
        right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": list(range(6))})
 | 
						|
 | 
						|
        joined = merge(left, right, on="key", how="outer")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": [1, 1, 1, 1, 2, 2, 3, 4, 5],
 | 
						|
                "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]),
 | 
						|
                "rvalue": [0, 1, 0, 1, 2, 2, 3, 4, 5],
 | 
						|
            },
 | 
						|
            columns=["value", "key", "rvalue"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(joined, expected)
 | 
						|
 | 
						|
    def test_merge_join_key_dtype_cast(self):
 | 
						|
        # #8596
 | 
						|
 | 
						|
        df1 = DataFrame({"key": [1], "v1": [10]})
 | 
						|
        df2 = DataFrame({"key": [2], "v1": [20]})
 | 
						|
        df = merge(df1, df2, how="outer")
 | 
						|
        assert df["key"].dtype == "int64"
 | 
						|
 | 
						|
        df1 = DataFrame({"key": [True], "v1": [1]})
 | 
						|
        df2 = DataFrame({"key": [False], "v1": [0]})
 | 
						|
        df = merge(df1, df2, how="outer")
 | 
						|
 | 
						|
        # GH13169
 | 
						|
        # GH#40073
 | 
						|
        assert df["key"].dtype == "bool"
 | 
						|
 | 
						|
        df1 = DataFrame({"val": [1]})
 | 
						|
        df2 = DataFrame({"val": [2]})
 | 
						|
        lkey = np.array([1])
 | 
						|
        rkey = np.array([2])
 | 
						|
        df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer")
 | 
						|
        assert df["key_0"].dtype == "int64"
 | 
						|
 | 
						|
    def test_handle_join_key_pass_array(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {"key": [1, 1, 2, 2, 3], "value": np.arange(5)},
 | 
						|
            columns=["value", "key"],
 | 
						|
            dtype="int64",
 | 
						|
        )
 | 
						|
        right = DataFrame({"rvalue": np.arange(6)}, dtype="int64")
 | 
						|
        key = np.array([1, 1, 2, 3, 4, 5], dtype="int64")
 | 
						|
 | 
						|
        merged = merge(left, right, left_on="key", right_on=key, how="outer")
 | 
						|
        merged2 = merge(right, left, left_on=key, right_on="key", how="outer")
 | 
						|
 | 
						|
        tm.assert_series_equal(merged["key"], merged2["key"])
 | 
						|
        assert merged["key"].notna().all()
 | 
						|
        assert merged2["key"].notna().all()
 | 
						|
 | 
						|
        left = DataFrame({"value": np.arange(5)}, columns=["value"])
 | 
						|
        right = DataFrame({"rvalue": np.arange(6)})
 | 
						|
        lkey = np.array([1, 1, 2, 2, 3])
 | 
						|
        rkey = np.array([1, 1, 2, 3, 4, 5])
 | 
						|
 | 
						|
        merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer")
 | 
						|
        tm.assert_series_equal(
 | 
						|
            merged["key_0"], Series([1, 1, 1, 1, 2, 2, 3, 4, 5], name="key_0")
 | 
						|
        )
 | 
						|
 | 
						|
        left = DataFrame({"value": np.arange(3)})
 | 
						|
        right = DataFrame({"rvalue": np.arange(6)})
 | 
						|
 | 
						|
        key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
 | 
						|
        merged = merge(left, right, left_index=True, right_on=key, how="outer")
 | 
						|
        tm.assert_series_equal(merged["key_0"], Series(key, name="key_0"))
 | 
						|
 | 
						|
    def test_no_overlap_more_informative_error(self):
 | 
						|
        dt = datetime.now()
 | 
						|
        df1 = DataFrame({"x": ["a"]}, index=[dt])
 | 
						|
 | 
						|
        df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt])
 | 
						|
 | 
						|
        msg = (
 | 
						|
            "No common columns to perform merge on. "
 | 
						|
            f"Merge options: left_on={None}, right_on={None}, "
 | 
						|
            f"left_index={False}, right_index={False}"
 | 
						|
        )
 | 
						|
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(df1, df2)
 | 
						|
 | 
						|
    def test_merge_non_unique_indexes(self):
 | 
						|
 | 
						|
        dt = datetime(2012, 5, 1)
 | 
						|
        dt2 = datetime(2012, 5, 2)
 | 
						|
        dt3 = datetime(2012, 5, 3)
 | 
						|
        dt4 = datetime(2012, 5, 4)
 | 
						|
 | 
						|
        df1 = DataFrame({"x": ["a"]}, index=[dt])
 | 
						|
        df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt])
 | 
						|
        _check_merge(df1, df2)
 | 
						|
 | 
						|
        # Not monotonic
 | 
						|
        df1 = DataFrame({"x": ["a", "b", "q"]}, index=[dt2, dt, dt4])
 | 
						|
        df2 = DataFrame(
 | 
						|
            {"y": ["c", "d", "e", "f", "g", "h"]}, index=[dt3, dt3, dt2, dt2, dt, dt]
 | 
						|
        )
 | 
						|
        _check_merge(df1, df2)
 | 
						|
 | 
						|
        df1 = DataFrame({"x": ["a", "b"]}, index=[dt, dt])
 | 
						|
        df2 = DataFrame({"y": ["c", "d"]}, index=[dt, dt])
 | 
						|
        _check_merge(df1, df2)
 | 
						|
 | 
						|
    def test_merge_non_unique_index_many_to_many(self):
 | 
						|
        dt = datetime(2012, 5, 1)
 | 
						|
        dt2 = datetime(2012, 5, 2)
 | 
						|
        dt3 = datetime(2012, 5, 3)
 | 
						|
        df1 = DataFrame({"x": ["a", "b", "c", "d"]}, index=[dt2, dt2, dt, dt])
 | 
						|
        df2 = DataFrame(
 | 
						|
            {"y": ["e", "f", "g", " h", "i"]}, index=[dt2, dt2, dt3, dt, dt]
 | 
						|
        )
 | 
						|
        _check_merge(df1, df2)
 | 
						|
 | 
						|
    def test_left_merge_empty_dataframe(self):
 | 
						|
        left = DataFrame({"key": [1], "value": [2]})
 | 
						|
        right = DataFrame({"key": []})
 | 
						|
 | 
						|
        result = merge(left, right, on="key", how="left")
 | 
						|
        tm.assert_frame_equal(result, left)
 | 
						|
 | 
						|
        result = merge(right, left, on="key", how="right")
 | 
						|
        tm.assert_frame_equal(result, left)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "kwarg",
 | 
						|
        [
 | 
						|
            {"left_index": True, "right_index": True},
 | 
						|
            {"left_index": True, "right_on": "x"},
 | 
						|
            {"left_on": "a", "right_index": True},
 | 
						|
            {"left_on": "a", "right_on": "x"},
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_left_empty_right_empty(self, join_type, kwarg):
 | 
						|
        # GH 10824
 | 
						|
        left = DataFrame(columns=["a", "b", "c"])
 | 
						|
        right = DataFrame(columns=["x", "y", "z"])
 | 
						|
 | 
						|
        exp_in = DataFrame(
 | 
						|
            columns=["a", "b", "c", "x", "y", "z"],
 | 
						|
            index=pd.Index([], dtype=object),
 | 
						|
            dtype=object,
 | 
						|
        )
 | 
						|
 | 
						|
        result = merge(left, right, how=join_type, **kwarg)
 | 
						|
        tm.assert_frame_equal(result, exp_in)
 | 
						|
 | 
						|
    def test_merge_left_empty_right_notempty(self):
 | 
						|
        # GH 10824
 | 
						|
        left = DataFrame(columns=["a", "b", "c"])
 | 
						|
        right = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"])
 | 
						|
 | 
						|
        exp_out = DataFrame(
 | 
						|
            {
 | 
						|
                "a": np.array([np.nan] * 3, dtype=object),
 | 
						|
                "b": np.array([np.nan] * 3, dtype=object),
 | 
						|
                "c": np.array([np.nan] * 3, dtype=object),
 | 
						|
                "x": [1, 4, 7],
 | 
						|
                "y": [2, 5, 8],
 | 
						|
                "z": [3, 6, 9],
 | 
						|
            },
 | 
						|
            columns=["a", "b", "c", "x", "y", "z"],
 | 
						|
        )
 | 
						|
        exp_in = exp_out[0:0]  # make empty DataFrame keeping dtype
 | 
						|
        # result will have object dtype
 | 
						|
        exp_in.index = exp_in.index.astype(object)
 | 
						|
 | 
						|
        def check1(exp, kwarg):
 | 
						|
            result = merge(left, right, how="inner", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
            result = merge(left, right, how="left", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
        def check2(exp, kwarg):
 | 
						|
            result = merge(left, right, how="right", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
            result = merge(left, right, how="outer", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
        for kwarg in [
 | 
						|
            {"left_index": True, "right_index": True},
 | 
						|
            {"left_index": True, "right_on": "x"},
 | 
						|
        ]:
 | 
						|
            check1(exp_in, kwarg)
 | 
						|
            check2(exp_out, kwarg)
 | 
						|
 | 
						|
        kwarg = {"left_on": "a", "right_index": True}
 | 
						|
        check1(exp_in, kwarg)
 | 
						|
        exp_out["a"] = [0, 1, 2]
 | 
						|
        check2(exp_out, kwarg)
 | 
						|
 | 
						|
        kwarg = {"left_on": "a", "right_on": "x"}
 | 
						|
        check1(exp_in, kwarg)
 | 
						|
        exp_out["a"] = np.array([np.nan] * 3, dtype=object)
 | 
						|
        check2(exp_out, kwarg)
 | 
						|
 | 
						|
    def test_merge_left_notempty_right_empty(self):
 | 
						|
        # GH 10824
 | 
						|
        left = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
 | 
						|
        right = DataFrame(columns=["x", "y", "z"])
 | 
						|
 | 
						|
        exp_out = DataFrame(
 | 
						|
            {
 | 
						|
                "a": [1, 4, 7],
 | 
						|
                "b": [2, 5, 8],
 | 
						|
                "c": [3, 6, 9],
 | 
						|
                "x": np.array([np.nan] * 3, dtype=object),
 | 
						|
                "y": np.array([np.nan] * 3, dtype=object),
 | 
						|
                "z": np.array([np.nan] * 3, dtype=object),
 | 
						|
            },
 | 
						|
            columns=["a", "b", "c", "x", "y", "z"],
 | 
						|
        )
 | 
						|
        exp_in = exp_out[0:0]  # make empty DataFrame keeping dtype
 | 
						|
        # result will have object dtype
 | 
						|
        exp_in.index = exp_in.index.astype(object)
 | 
						|
 | 
						|
        def check1(exp, kwarg):
 | 
						|
            result = merge(left, right, how="inner", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
            result = merge(left, right, how="right", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
        def check2(exp, kwarg):
 | 
						|
            result = merge(left, right, how="left", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
            result = merge(left, right, how="outer", **kwarg)
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
            # TODO: should the next loop be un-indented? doing so breaks this test
 | 
						|
            for kwarg in [
 | 
						|
                {"left_index": True, "right_index": True},
 | 
						|
                {"left_index": True, "right_on": "x"},
 | 
						|
                {"left_on": "a", "right_index": True},
 | 
						|
                {"left_on": "a", "right_on": "x"},
 | 
						|
            ]:
 | 
						|
                check1(exp_in, kwarg)
 | 
						|
                check2(exp_out, kwarg)
 | 
						|
 | 
						|
    def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
 | 
						|
        # GH 25183
 | 
						|
        df = DataFrame(
 | 
						|
            {"key": series_of_dtype, "value": series_of_dtype2},
 | 
						|
            columns=["key", "value"],
 | 
						|
        )
 | 
						|
        df_empty = df[:0]
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "value_x": Series(dtype=df.dtypes["value"]),
 | 
						|
                "key": Series(dtype=df.dtypes["key"]),
 | 
						|
                "value_y": Series(dtype=df.dtypes["value"]),
 | 
						|
            },
 | 
						|
            columns=["value_x", "key", "value_y"],
 | 
						|
        )
 | 
						|
        actual = df_empty.merge(df, on="key")
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na):
 | 
						|
        # GH 25183
 | 
						|
        df_left = DataFrame(
 | 
						|
            {"key": series_of_dtype, "value": series_of_dtype_all_na},
 | 
						|
            columns=["key", "value"],
 | 
						|
        )
 | 
						|
        df_right = DataFrame(
 | 
						|
            {"key": series_of_dtype, "value": series_of_dtype_all_na},
 | 
						|
            columns=["key", "value"],
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": series_of_dtype,
 | 
						|
                "value_x": series_of_dtype_all_na,
 | 
						|
                "value_y": series_of_dtype_all_na,
 | 
						|
            },
 | 
						|
            columns=["key", "value_x", "value_y"],
 | 
						|
        )
 | 
						|
        actual = df_left.merge(df_right, on="key")
 | 
						|
        tm.assert_frame_equal(actual, expected)
 | 
						|
 | 
						|
    def test_merge_nosort(self):
 | 
						|
        # GH#2098
 | 
						|
 | 
						|
        d = {
 | 
						|
            "var1": np.random.randint(0, 10, size=10),
 | 
						|
            "var2": np.random.randint(0, 10, size=10),
 | 
						|
            "var3": [
 | 
						|
                datetime(2012, 1, 12),
 | 
						|
                datetime(2011, 2, 4),
 | 
						|
                datetime(2010, 2, 3),
 | 
						|
                datetime(2012, 1, 12),
 | 
						|
                datetime(2011, 2, 4),
 | 
						|
                datetime(2012, 4, 3),
 | 
						|
                datetime(2012, 3, 4),
 | 
						|
                datetime(2008, 5, 1),
 | 
						|
                datetime(2010, 2, 3),
 | 
						|
                datetime(2012, 2, 3),
 | 
						|
            ],
 | 
						|
        }
 | 
						|
        df = DataFrame.from_dict(d)
 | 
						|
        var3 = df.var3.unique()
 | 
						|
        var3.sort()
 | 
						|
        new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)})
 | 
						|
 | 
						|
        result = df.merge(new, on="var3", sort=False)
 | 
						|
        exp = merge(df, new, on="var3", sort=False)
 | 
						|
        tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
        assert (df.var3.unique() == result.var3.unique()).all()
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        ("sort", "values"), [(False, [1, 1, 0, 1, 1]), (True, [0, 1, 1, 1, 1])]
 | 
						|
    )
 | 
						|
    @pytest.mark.parametrize("how", ["left", "right"])
 | 
						|
    def test_merge_same_order_left_right(self, sort, values, how):
 | 
						|
        # GH#35382
 | 
						|
        df = DataFrame({"a": [1, 0, 1]})
 | 
						|
 | 
						|
        result = df.merge(df, on="a", how=how, sort=sort)
 | 
						|
        expected = DataFrame(values, columns=["a"])
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_nan_right(self):
 | 
						|
        df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
 | 
						|
        df2 = DataFrame({"i1": [0], "i3": [0]})
 | 
						|
        result = df1.join(df2, on="i1", rsuffix="_")
 | 
						|
        expected = (
 | 
						|
            DataFrame(
 | 
						|
                {
 | 
						|
                    "i1": {0: 0.0, 1: 1},
 | 
						|
                    "i2": {0: 0, 1: 1},
 | 
						|
                    "i1_": {0: 0, 1: np.nan},
 | 
						|
                    "i3": {0: 0.0, 1: np.nan},
 | 
						|
                    None: {0: 0, 1: 0},
 | 
						|
                }
 | 
						|
            )
 | 
						|
            .set_index(None)
 | 
						|
            .reset_index()[["i1", "i2", "i1_", "i3"]]
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected, check_dtype=False)
 | 
						|
 | 
						|
    def test_merge_nan_right2(self):
 | 
						|
        df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
 | 
						|
        df2 = DataFrame({"i1": [0], "i3": [0.7]})
 | 
						|
        result = df1.join(df2, rsuffix="_", on="i1")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "i1": {0: 0, 1: 1},
 | 
						|
                "i1_": {0: 0.0, 1: np.nan},
 | 
						|
                "i2": {0: 0.5, 1: 1.5},
 | 
						|
                "i3": {0: 0.69999999999999996, 1: np.nan},
 | 
						|
            }
 | 
						|
        )[["i1", "i2", "i1_", "i3"]]
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_type(self):
 | 
						|
        class NotADataFrame(DataFrame):
 | 
						|
            @property
 | 
						|
            def _constructor(self):
 | 
						|
                return NotADataFrame
 | 
						|
 | 
						|
        nad = NotADataFrame(self.df)
 | 
						|
        result = nad.merge(self.df2, on="key1")
 | 
						|
 | 
						|
        assert isinstance(result, NotADataFrame)
 | 
						|
 | 
						|
    def test_join_append_timedeltas(self, using_array_manager):
 | 
						|
        # timedelta64 issues with join/merge
 | 
						|
        # GH 5695
 | 
						|
 | 
						|
        d = DataFrame.from_dict(
 | 
						|
            {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
 | 
						|
        )
 | 
						|
        df = DataFrame(columns=list("dt"))
 | 
						|
        df = concat([df, d], ignore_index=True)
 | 
						|
        result = concat([df, d], ignore_index=True)
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
 | 
						|
                "t": [timedelta(0, 22500), timedelta(0, 22500)],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        if using_array_manager:
 | 
						|
            # TODO(ArrayManager) decide on exact casting rules in concat
 | 
						|
            expected = expected.astype(object)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_join_append_timedeltas2(self):
 | 
						|
        # timedelta64 issues with join/merge
 | 
						|
        # GH 5695
 | 
						|
        td = np.timedelta64(300000000)
 | 
						|
        lhs = DataFrame(Series([td, td], index=["A", "B"]))
 | 
						|
        rhs = DataFrame(Series([td], index=["A"]))
 | 
						|
 | 
						|
        result = lhs.join(rhs, rsuffix="r", how="left")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "0": Series([td, td], index=list("AB")),
 | 
						|
                "0r": Series([td, pd.NaT], index=list("AB")),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_other_datetime_unit(self):
 | 
						|
        # GH 13389
 | 
						|
        df1 = DataFrame({"entity_id": [101, 102]})
 | 
						|
        s = Series([None, None], index=[101, 102], name="days")
 | 
						|
 | 
						|
        for dtype in [
 | 
						|
            "datetime64[D]",
 | 
						|
            "datetime64[h]",
 | 
						|
            "datetime64[m]",
 | 
						|
            "datetime64[s]",
 | 
						|
            "datetime64[ms]",
 | 
						|
            "datetime64[us]",
 | 
						|
            "datetime64[ns]",
 | 
						|
        ]:
 | 
						|
 | 
						|
            df2 = s.astype(dtype).to_frame("days")
 | 
						|
            # coerces to datetime64[ns], thus should not be affected
 | 
						|
            assert df2["days"].dtype == "datetime64[ns]"
 | 
						|
 | 
						|
            result = df1.merge(df2, left_on="entity_id", right_index=True)
 | 
						|
 | 
						|
            exp = DataFrame(
 | 
						|
                {
 | 
						|
                    "entity_id": [101, 102],
 | 
						|
                    "days": np.array(["nat", "nat"], dtype="datetime64[ns]"),
 | 
						|
                },
 | 
						|
                columns=["entity_id", "days"],
 | 
						|
            )
 | 
						|
            tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])
 | 
						|
    def test_other_timedelta_unit(self, unit):
 | 
						|
        # GH 13389
 | 
						|
        df1 = DataFrame({"entity_id": [101, 102]})
 | 
						|
        s = Series([None, None], index=[101, 102], name="days")
 | 
						|
 | 
						|
        dtype = f"m8[{unit}]"
 | 
						|
        df2 = s.astype(dtype).to_frame("days")
 | 
						|
        assert df2["days"].dtype == "m8[ns]"
 | 
						|
 | 
						|
        result = df1.merge(df2, left_on="entity_id", right_index=True)
 | 
						|
 | 
						|
        exp = DataFrame(
 | 
						|
            {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)},
 | 
						|
            columns=["entity_id", "days"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, exp)
 | 
						|
 | 
						|
    def test_overlapping_columns_error_message(self):
 | 
						|
        df = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]})
 | 
						|
        df2 = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]})
 | 
						|
 | 
						|
        df.columns = ["key", "foo", "foo"]
 | 
						|
        df2.columns = ["key", "bar", "bar"]
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": [1, 2, 3],
 | 
						|
                "v1": [4, 5, 6],
 | 
						|
                "v2": [7, 8, 9],
 | 
						|
                "v3": [4, 5, 6],
 | 
						|
                "v4": [7, 8, 9],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        expected.columns = ["key", "foo", "foo", "bar", "bar"]
 | 
						|
        tm.assert_frame_equal(merge(df, df2), expected)
 | 
						|
 | 
						|
        # #2649, #10639
 | 
						|
        df2.columns = ["key1", "foo", "foo"]
 | 
						|
        msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(df, df2)
 | 
						|
 | 
						|
    def test_merge_on_datetime64tz(self):
 | 
						|
 | 
						|
        # GH11405
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "key": pd.date_range("20151010", periods=2, tz="US/Eastern"),
 | 
						|
                "value": [1, 2],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "key": pd.date_range("20151011", periods=3, tz="US/Eastern"),
 | 
						|
                "value": [1, 2, 3],
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": pd.date_range("20151010", periods=4, tz="US/Eastern"),
 | 
						|
                "value_x": [1, 2, np.nan, np.nan],
 | 
						|
                "value_y": [np.nan, 1, 2, 3],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = merge(left, right, on="key", how="outer")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_datetime64tz_values(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "key": [1, 2],
 | 
						|
                "value": pd.date_range("20151010", periods=2, tz="US/Eastern"),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "key": [2, 3],
 | 
						|
                "value": pd.date_range("20151011", periods=2, tz="US/Eastern"),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": [1, 2, 3],
 | 
						|
                "value_x": list(pd.date_range("20151010", periods=2, tz="US/Eastern"))
 | 
						|
                + [pd.NaT],
 | 
						|
                "value_y": [pd.NaT]
 | 
						|
                + list(pd.date_range("20151011", periods=2, tz="US/Eastern")),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = merge(left, right, on="key", how="outer")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        assert result["value_x"].dtype == "datetime64[ns, US/Eastern]"
 | 
						|
        assert result["value_y"].dtype == "datetime64[ns, US/Eastern]"
 | 
						|
 | 
						|
    def test_merge_on_datetime64tz_empty(self):
 | 
						|
        # https://github.com/pandas-dev/pandas/issues/25014
 | 
						|
        dtz = pd.DatetimeTZDtype(tz="UTC")
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "date": [pd.Timestamp("2018", tz=dtz.tz)],
 | 
						|
                "value": [4.0],
 | 
						|
                "date2": [pd.Timestamp("2019", tz=dtz.tz)],
 | 
						|
            },
 | 
						|
            columns=["date", "value", "date2"],
 | 
						|
        )
 | 
						|
        left = right[:0]
 | 
						|
        result = left.merge(right, on="date")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "value_x": Series(dtype=float),
 | 
						|
                "date2_x": Series(dtype=dtz),
 | 
						|
                "date": Series(dtype=dtz),
 | 
						|
                "value_y": Series(dtype=float),
 | 
						|
                "date2_y": Series(dtype=dtz),
 | 
						|
            },
 | 
						|
            columns=["value_x", "date2_x", "date", "value_y", "date2_y"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_datetime64tz_with_dst_transition(self):
 | 
						|
        # GH 18885
 | 
						|
        df1 = DataFrame(
 | 
						|
            pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"),
 | 
						|
            columns=["date"],
 | 
						|
        )
 | 
						|
        df1["value"] = 1
 | 
						|
        df2 = DataFrame(
 | 
						|
            {
 | 
						|
                "date": pd.to_datetime(
 | 
						|
                    [
 | 
						|
                        "2017-10-29 03:00:00",
 | 
						|
                        "2017-10-29 04:00:00",
 | 
						|
                        "2017-10-29 05:00:00",
 | 
						|
                    ]
 | 
						|
                ),
 | 
						|
                "value": 2,
 | 
						|
            }
 | 
						|
        )
 | 
						|
        df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid")
 | 
						|
        result = merge(df1, df2, how="outer", on="date")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "date": pd.date_range(
 | 
						|
                    "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid"
 | 
						|
                ),
 | 
						|
                "value_x": [1] * 4 + [np.nan] * 3,
 | 
						|
                "value_y": [np.nan] * 4 + [2] * 3,
 | 
						|
            }
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_non_unique_period_index(self):
 | 
						|
        # GH #16871
 | 
						|
        index = pd.period_range("2016-01-01", periods=16, freq="M")
 | 
						|
        df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
 | 
						|
        df2 = concat([df, df])
 | 
						|
        result = df.merge(df2, left_index=True, right_index=True, how="inner")
 | 
						|
        expected = DataFrame(
 | 
						|
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
 | 
						|
            columns=["pnum_x", "pnum_y"],
 | 
						|
            index=df2.sort_index().index,
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_on_periods(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]}
 | 
						|
        )
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "key": pd.period_range("20151011", periods=3, freq="D"),
 | 
						|
                "value": [1, 2, 3],
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": pd.period_range("20151010", periods=4, freq="D"),
 | 
						|
                "value_x": [1, 2, np.nan, np.nan],
 | 
						|
                "value_y": [np.nan, 1, 2, 3],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = merge(left, right, on="key", how="outer")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_period_values(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")}
 | 
						|
        )
 | 
						|
        right = DataFrame(
 | 
						|
            {"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")}
 | 
						|
        )
 | 
						|
 | 
						|
        exp_x = pd.period_range("20151010", periods=2, freq="D")
 | 
						|
        exp_y = pd.period_range("20151011", periods=2, freq="D")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "key": [1, 2, 3],
 | 
						|
                "value_x": list(exp_x) + [pd.NaT],
 | 
						|
                "value_y": [pd.NaT] + list(exp_y),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = merge(left, right, on="key", how="outer")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        assert result["value_x"].dtype == "Period[D]"
 | 
						|
        assert result["value_y"].dtype == "Period[D]"
 | 
						|
 | 
						|
    def test_indicator(self, dfs_for_indicator):
 | 
						|
        # PR #10054. xref #7412 and closes #8790.
 | 
						|
        df1, df2 = dfs_for_indicator
 | 
						|
        df1_copy = df1.copy()
 | 
						|
 | 
						|
        df2_copy = df2.copy()
 | 
						|
 | 
						|
        df_result = DataFrame(
 | 
						|
            {
 | 
						|
                "col1": [0, 1, 2, 3, 4, 5],
 | 
						|
                "col_conflict_x": [1, 2, np.nan, np.nan, np.nan, np.nan],
 | 
						|
                "col_left": ["a", "b", np.nan, np.nan, np.nan, np.nan],
 | 
						|
                "col_conflict_y": [np.nan, 1, 2, 3, 4, 5],
 | 
						|
                "col_right": [np.nan, 2, 2, 2, 2, 2],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        df_result["_merge"] = Categorical(
 | 
						|
            [
 | 
						|
                "left_only",
 | 
						|
                "both",
 | 
						|
                "right_only",
 | 
						|
                "right_only",
 | 
						|
                "right_only",
 | 
						|
                "right_only",
 | 
						|
            ],
 | 
						|
            categories=["left_only", "right_only", "both"],
 | 
						|
        )
 | 
						|
 | 
						|
        df_result = df_result[
 | 
						|
            [
 | 
						|
                "col1",
 | 
						|
                "col_conflict_x",
 | 
						|
                "col_left",
 | 
						|
                "col_conflict_y",
 | 
						|
                "col_right",
 | 
						|
                "_merge",
 | 
						|
            ]
 | 
						|
        ]
 | 
						|
 | 
						|
        test = merge(df1, df2, on="col1", how="outer", indicator=True)
 | 
						|
        tm.assert_frame_equal(test, df_result)
 | 
						|
        test = df1.merge(df2, on="col1", how="outer", indicator=True)
 | 
						|
        tm.assert_frame_equal(test, df_result)
 | 
						|
 | 
						|
        # No side effects
 | 
						|
        tm.assert_frame_equal(df1, df1_copy)
 | 
						|
        tm.assert_frame_equal(df2, df2_copy)
 | 
						|
 | 
						|
        # Check with custom name
 | 
						|
        df_result_custom_name = df_result
 | 
						|
        df_result_custom_name = df_result_custom_name.rename(
 | 
						|
            columns={"_merge": "custom_name"}
 | 
						|
        )
 | 
						|
 | 
						|
        test_custom_name = merge(
 | 
						|
            df1, df2, on="col1", how="outer", indicator="custom_name"
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(test_custom_name, df_result_custom_name)
 | 
						|
        test_custom_name = df1.merge(
 | 
						|
            df2, on="col1", how="outer", indicator="custom_name"
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(test_custom_name, df_result_custom_name)
 | 
						|
 | 
						|
    def test_merge_indicator_arg_validation(self, dfs_for_indicator):
 | 
						|
        # Check only accepts strings and booleans
 | 
						|
        df1, df2 = dfs_for_indicator
 | 
						|
 | 
						|
        msg = "indicator option can only accept boolean or string arguments"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(df1, df2, on="col1", how="outer", indicator=5)
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            df1.merge(df2, on="col1", how="outer", indicator=5)
 | 
						|
 | 
						|
    def test_merge_indicator_result_integrity(self, dfs_for_indicator):
 | 
						|
        # Check result integrity
 | 
						|
        df1, df2 = dfs_for_indicator
 | 
						|
 | 
						|
        test2 = merge(df1, df2, on="col1", how="left", indicator=True)
 | 
						|
        assert (test2._merge != "right_only").all()
 | 
						|
        test2 = df1.merge(df2, on="col1", how="left", indicator=True)
 | 
						|
        assert (test2._merge != "right_only").all()
 | 
						|
 | 
						|
        test3 = merge(df1, df2, on="col1", how="right", indicator=True)
 | 
						|
        assert (test3._merge != "left_only").all()
 | 
						|
        test3 = df1.merge(df2, on="col1", how="right", indicator=True)
 | 
						|
        assert (test3._merge != "left_only").all()
 | 
						|
 | 
						|
        test4 = merge(df1, df2, on="col1", how="inner", indicator=True)
 | 
						|
        assert (test4._merge == "both").all()
 | 
						|
        test4 = df1.merge(df2, on="col1", how="inner", indicator=True)
 | 
						|
        assert (test4._merge == "both").all()
 | 
						|
 | 
						|
    def test_merge_indicator_invalid(self, dfs_for_indicator):
 | 
						|
        # Check if working name in df
 | 
						|
        df1, _ = dfs_for_indicator
 | 
						|
 | 
						|
        for i in ["_right_indicator", "_left_indicator", "_merge"]:
 | 
						|
            df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]})
 | 
						|
 | 
						|
            msg = (
 | 
						|
                "Cannot use `indicator=True` option when data contains a "
 | 
						|
                f"column named {i}|"
 | 
						|
                "Cannot use name of an existing column for indicator column"
 | 
						|
            )
 | 
						|
            with pytest.raises(ValueError, match=msg):
 | 
						|
                merge(df1, df_badcolumn, on="col1", how="outer", indicator=True)
 | 
						|
            with pytest.raises(ValueError, match=msg):
 | 
						|
                df1.merge(df_badcolumn, on="col1", how="outer", indicator=True)
 | 
						|
 | 
						|
        # Check for name conflict with custom name
 | 
						|
        df_badcolumn = DataFrame({"col1": [1, 2], "custom_column_name": [2, 2]})
 | 
						|
 | 
						|
        msg = "Cannot use name of an existing column for indicator column"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(
 | 
						|
                df1,
 | 
						|
                df_badcolumn,
 | 
						|
                on="col1",
 | 
						|
                how="outer",
 | 
						|
                indicator="custom_column_name",
 | 
						|
            )
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            df1.merge(
 | 
						|
                df_badcolumn, on="col1", how="outer", indicator="custom_column_name"
 | 
						|
            )
 | 
						|
 | 
						|
    def test_merge_indicator_multiple_columns(self):
 | 
						|
        # Merge on multiple columns
 | 
						|
        df3 = DataFrame({"col1": [0, 1], "col2": ["a", "b"]})
 | 
						|
 | 
						|
        df4 = DataFrame({"col1": [1, 1, 3], "col2": ["b", "x", "y"]})
 | 
						|
 | 
						|
        hand_coded_result = DataFrame(
 | 
						|
            {"col1": [0, 1, 1, 3], "col2": ["a", "b", "x", "y"]}
 | 
						|
        )
 | 
						|
        hand_coded_result["_merge"] = Categorical(
 | 
						|
            ["left_only", "both", "right_only", "right_only"],
 | 
						|
            categories=["left_only", "right_only", "both"],
 | 
						|
        )
 | 
						|
 | 
						|
        test5 = merge(df3, df4, on=["col1", "col2"], how="outer", indicator=True)
 | 
						|
        tm.assert_frame_equal(test5, hand_coded_result)
 | 
						|
        test5 = df3.merge(df4, on=["col1", "col2"], how="outer", indicator=True)
 | 
						|
        tm.assert_frame_equal(test5, hand_coded_result)
 | 
						|
 | 
						|
    def test_validation(self):
 | 
						|
        left = DataFrame(
 | 
						|
            {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]},
 | 
						|
            index=range(4),
 | 
						|
        )
 | 
						|
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "a": ["a", "b", "c", "d", "e"],
 | 
						|
                "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"],
 | 
						|
            },
 | 
						|
            index=range(5),
 | 
						|
        )
 | 
						|
 | 
						|
        # Make sure no side effects.
 | 
						|
        left_copy = left.copy()
 | 
						|
        right_copy = right.copy()
 | 
						|
 | 
						|
        result = merge(left, right, left_index=True, right_index=True, validate="1:1")
 | 
						|
        tm.assert_frame_equal(left, left_copy)
 | 
						|
        tm.assert_frame_equal(right, right_copy)
 | 
						|
 | 
						|
        # make sure merge still correct
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "a_x": ["a", "b", "c", "d"],
 | 
						|
                "b": ["cat", "dog", "weasel", "horse"],
 | 
						|
                "a_y": ["a", "b", "c", "d"],
 | 
						|
                "c": ["meow", "bark", "um... weasel noise?", "nay"],
 | 
						|
            },
 | 
						|
            index=range(4),
 | 
						|
            columns=["a_x", "b", "a_y", "c"],
 | 
						|
        )
 | 
						|
 | 
						|
        result = merge(
 | 
						|
            left, right, left_index=True, right_index=True, validate="one_to_one"
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        expected_2 = DataFrame(
 | 
						|
            {
 | 
						|
                "a": ["a", "b", "c", "d"],
 | 
						|
                "b": ["cat", "dog", "weasel", "horse"],
 | 
						|
                "c": ["meow", "bark", "um... weasel noise?", "nay"],
 | 
						|
            },
 | 
						|
            index=range(4),
 | 
						|
        )
 | 
						|
 | 
						|
        result = merge(left, right, on="a", validate="1:1")
 | 
						|
        tm.assert_frame_equal(left, left_copy)
 | 
						|
        tm.assert_frame_equal(right, right_copy)
 | 
						|
        tm.assert_frame_equal(result, expected_2)
 | 
						|
 | 
						|
        result = merge(left, right, on="a", validate="one_to_one")
 | 
						|
        tm.assert_frame_equal(result, expected_2)
 | 
						|
 | 
						|
        # One index, one column
 | 
						|
        expected_3 = DataFrame(
 | 
						|
            {
 | 
						|
                "b": ["cat", "dog", "weasel", "horse"],
 | 
						|
                "a": ["a", "b", "c", "d"],
 | 
						|
                "c": ["meow", "bark", "um... weasel noise?", "nay"],
 | 
						|
            },
 | 
						|
            columns=["b", "a", "c"],
 | 
						|
            index=range(4),
 | 
						|
        )
 | 
						|
 | 
						|
        left_index_reset = left.set_index("a")
 | 
						|
        result = merge(
 | 
						|
            left_index_reset,
 | 
						|
            right,
 | 
						|
            left_index=True,
 | 
						|
            right_on="a",
 | 
						|
            validate="one_to_one",
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected_3)
 | 
						|
 | 
						|
        # Dups on right
 | 
						|
        right_w_dups = concat([right, DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])])
 | 
						|
        merge(
 | 
						|
            left,
 | 
						|
            right_w_dups,
 | 
						|
            left_index=True,
 | 
						|
            right_index=True,
 | 
						|
            validate="one_to_many",
 | 
						|
        )
 | 
						|
 | 
						|
        msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(
 | 
						|
                left,
 | 
						|
                right_w_dups,
 | 
						|
                left_index=True,
 | 
						|
                right_index=True,
 | 
						|
                validate="one_to_one",
 | 
						|
            )
 | 
						|
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(left, right_w_dups, on="a", validate="one_to_one")
 | 
						|
 | 
						|
        # Dups on left
 | 
						|
        left_w_dups = concat(
 | 
						|
            [left, DataFrame({"a": ["a"], "c": ["cow"]}, index=[3])], sort=True
 | 
						|
        )
 | 
						|
        merge(
 | 
						|
            left_w_dups,
 | 
						|
            right,
 | 
						|
            left_index=True,
 | 
						|
            right_index=True,
 | 
						|
            validate="many_to_one",
 | 
						|
        )
 | 
						|
 | 
						|
        msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(
 | 
						|
                left_w_dups,
 | 
						|
                right,
 | 
						|
                left_index=True,
 | 
						|
                right_index=True,
 | 
						|
                validate="one_to_one",
 | 
						|
            )
 | 
						|
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(left_w_dups, right, on="a", validate="one_to_one")
 | 
						|
 | 
						|
        # Dups on both
 | 
						|
        merge(left_w_dups, right_w_dups, on="a", validate="many_to_many")
 | 
						|
 | 
						|
        msg = "Merge keys are not unique in right dataset; not a many-to-one merge"
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(
 | 
						|
                left_w_dups,
 | 
						|
                right_w_dups,
 | 
						|
                left_index=True,
 | 
						|
                right_index=True,
 | 
						|
                validate="many_to_one",
 | 
						|
            )
 | 
						|
 | 
						|
        msg = "Merge keys are not unique in left dataset; not a one-to-many merge"
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(left_w_dups, right_w_dups, on="a", validate="one_to_many")
 | 
						|
 | 
						|
        # Check invalid arguments
 | 
						|
        msg = "Not a valid argument for validate"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(left, right, on="a", validate="jibberish")
 | 
						|
 | 
						|
        # Two column merge, dups in both, but jointly no dups.
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "a": ["a", "a", "b", "b"],
 | 
						|
                "b": [0, 1, 0, 1],
 | 
						|
                "c": ["cat", "dog", "weasel", "horse"],
 | 
						|
            },
 | 
						|
            index=range(4),
 | 
						|
        )
 | 
						|
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "a": ["a", "a", "b"],
 | 
						|
                "b": [0, 1, 0],
 | 
						|
                "d": ["meow", "bark", "um... weasel noise?"],
 | 
						|
            },
 | 
						|
            index=range(3),
 | 
						|
        )
 | 
						|
 | 
						|
        expected_multi = DataFrame(
 | 
						|
            {
 | 
						|
                "a": ["a", "a", "b"],
 | 
						|
                "b": [0, 1, 0],
 | 
						|
                "c": ["cat", "dog", "weasel"],
 | 
						|
                "d": ["meow", "bark", "um... weasel noise?"],
 | 
						|
            },
 | 
						|
            index=range(3),
 | 
						|
        )
 | 
						|
 | 
						|
        msg = (
 | 
						|
            "Merge keys are not unique in either left or right dataset; "
 | 
						|
            "not a one-to-one merge"
 | 
						|
        )
 | 
						|
        with pytest.raises(MergeError, match=msg):
 | 
						|
            merge(left, right, on="a", validate="1:1")
 | 
						|
 | 
						|
        result = merge(left, right, on=["a", "b"], validate="1:1")
 | 
						|
        tm.assert_frame_equal(result, expected_multi)
 | 
						|
 | 
						|
    def test_merge_two_empty_df_no_division_error(self):
 | 
						|
        # GH17776, PR #17846
 | 
						|
        a = DataFrame({"a": [], "b": [], "c": []})
 | 
						|
        with np.errstate(divide="raise"):
 | 
						|
            merge(a, a, on=("a", "b"))
 | 
						|
 | 
						|
    @pytest.mark.parametrize("how", ["right", "outer"])
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "index,expected_index",
 | 
						|
        [
 | 
						|
            (
 | 
						|
                CategoricalIndex([1, 2, 4]),
 | 
						|
                CategoricalIndex([1, 2, 4, None, None, None]),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]),
 | 
						|
                DatetimeIndex(
 | 
						|
                    ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT]
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (Float64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])),
 | 
						|
            (Int64Index([1, 2, 3]), Float64Index([1, 2, 3, None, None, None])),
 | 
						|
            (
 | 
						|
                IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
 | 
						|
                IntervalIndex.from_tuples(
 | 
						|
                    [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan]
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"),
 | 
						|
                PeriodIndex(
 | 
						|
                    ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT],
 | 
						|
                    freq="D",
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                TimedeltaIndex(["1d", "2d", "3d"]),
 | 
						|
                TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]),
 | 
						|
            ),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_on_index_with_more_values(self, how, index, expected_index):
 | 
						|
        # GH 24212
 | 
						|
        # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
 | 
						|
        # -1 is interpreted as a missing value instead of the last element
 | 
						|
        df1 = DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
 | 
						|
        df2 = DataFrame({"b": [0, 1, 2, 3, 4, 5]})
 | 
						|
        result = df1.merge(df2, left_on="key", right_index=True, how=how)
 | 
						|
        expected = DataFrame(
 | 
						|
            [
 | 
						|
                [0, 0, 0],
 | 
						|
                [1, 1, 1],
 | 
						|
                [2, 2, 2],
 | 
						|
                [np.nan, 3, 3],
 | 
						|
                [np.nan, 4, 4],
 | 
						|
                [np.nan, 5, 5],
 | 
						|
            ],
 | 
						|
            columns=["a", "key", "b"],
 | 
						|
        )
 | 
						|
        expected.set_index(expected_index, inplace=True)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_right_index_right(self):
 | 
						|
        # Note: the expected output here is probably incorrect.
 | 
						|
        # See https://github.com/pandas-dev/pandas/issues/17257 for more.
 | 
						|
        # We include this as a regression test for GH-24897.
 | 
						|
        left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]})
 | 
						|
        right = DataFrame({"b": [1, 2, 3]})
 | 
						|
 | 
						|
        expected = DataFrame(
 | 
						|
            {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]},
 | 
						|
            columns=["a", "key", "b"],
 | 
						|
            index=[0, 1, 2, np.nan],
 | 
						|
        )
 | 
						|
        result = left.merge(right, left_on="key", right_index=True, how="right")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("how", ["left", "right"])
 | 
						|
    def test_merge_preserves_row_order(self, how):
 | 
						|
        # GH 27453
 | 
						|
        left_df = DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
 | 
						|
        right_df = DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
 | 
						|
        result = left_df.merge(right_df, on=["animal", "max_speed"], how=how)
 | 
						|
        if how == "right":
 | 
						|
            expected = DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
 | 
						|
        else:
 | 
						|
            expected = DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_take_missing_values_from_index_of_other_dtype(self):
 | 
						|
        # GH 24212
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "a": [1, 2, 3],
 | 
						|
                "key": Categorical(["a", "a", "b"], categories=list("abc")),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        right = DataFrame({"b": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"]))
 | 
						|
        result = left.merge(right, left_on="key", right_index=True, how="right")
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "a": [1, 2, 3, None],
 | 
						|
                "key": Categorical(["a", "a", "b", "c"]),
 | 
						|
                "b": [1, 1, 2, 3],
 | 
						|
            },
 | 
						|
            index=[0, 1, 2, np.nan],
 | 
						|
        )
 | 
						|
        expected = expected.reindex(columns=["a", "key", "b"])
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_readonly(self):
 | 
						|
        # https://github.com/pandas-dev/pandas/issues/27943
 | 
						|
        data1 = DataFrame(
 | 
						|
            np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
 | 
						|
        )
 | 
						|
        data2 = DataFrame(
 | 
						|
            np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
 | 
						|
        )
 | 
						|
 | 
						|
        # make each underlying block array / column array read-only
 | 
						|
        for arr in data1._mgr.arrays:
 | 
						|
            arr.flags.writeable = False
 | 
						|
 | 
						|
        data1.merge(data2)  # no error
 | 
						|
 | 
						|
 | 
						|
def _check_merge(x, y):
 | 
						|
    for how in ["inner", "left", "outer"]:
 | 
						|
        result = x.join(y, how=how)
 | 
						|
 | 
						|
        expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True)
 | 
						|
        expected = expected.set_index("index")
 | 
						|
 | 
						|
        # TODO check_names on merge?
 | 
						|
        tm.assert_frame_equal(result, expected, check_names=False)
 | 
						|
 | 
						|
 | 
						|
class TestMergeDtypes:
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")]
 | 
						|
    )
 | 
						|
    def test_different(self, right_vals):
 | 
						|
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "A": ["foo", "bar"],
 | 
						|
                "B": Series(["foo", "bar"]).astype("category"),
 | 
						|
                "C": [1, 2],
 | 
						|
                "D": [1.0, 2.0],
 | 
						|
                "E": Series([1, 2], dtype="uint64"),
 | 
						|
                "F": Series([1, 2], dtype="int32"),
 | 
						|
            }
 | 
						|
        )
 | 
						|
        right = DataFrame({"A": right_vals})
 | 
						|
 | 
						|
        # GH 9780
 | 
						|
        # We allow merging on object and categorical cols and cast
 | 
						|
        # categorical cols to object
 | 
						|
        result = merge(left, right, on="A")
 | 
						|
        assert is_object_dtype(result.A.dtype)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("d1", [np.int64, np.int32, np.int16, np.int8, np.uint8])
 | 
						|
    @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16])
 | 
						|
    def test_join_multi_dtypes(self, d1, d2):
 | 
						|
 | 
						|
        dtype1 = np.dtype(d1)
 | 
						|
        dtype2 = np.dtype(d2)
 | 
						|
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "k1": np.array([0, 1, 2] * 8, dtype=dtype1),
 | 
						|
                "k2": ["foo", "bar"] * 12,
 | 
						|
                "v": np.array(np.arange(24), dtype=np.int64),
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
 | 
						|
        right = DataFrame({"v2": np.array([5, 7], dtype=dtype2)}, index=index)
 | 
						|
 | 
						|
        result = left.join(right, on=["k1", "k2"])
 | 
						|
 | 
						|
        expected = left.copy()
 | 
						|
 | 
						|
        if dtype2.kind == "i":
 | 
						|
            dtype2 = np.dtype("float64")
 | 
						|
        expected["v2"] = np.array(np.nan, dtype=dtype2)
 | 
						|
        expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
 | 
						|
        expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
 | 
						|
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = left.join(right, on=["k1", "k2"], sort=True)
 | 
						|
        expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "int_vals, float_vals, exp_vals",
 | 
						|
        [
 | 
						|
            ([1, 2, 3], [1.0, 2.0, 3.0], {"X": [1, 2, 3], "Y": [1.0, 2.0, 3.0]}),
 | 
						|
            ([1, 2, 3], [1.0, 3.0], {"X": [1, 3], "Y": [1.0, 3.0]}),
 | 
						|
            ([1, 2], [1.0, 2.0, 3.0], {"X": [1, 2], "Y": [1.0, 2.0]}),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
 | 
						|
        # GH 16572
 | 
						|
        # Check that float column is not cast to object if
 | 
						|
        # merging on float and int columns
 | 
						|
        A = DataFrame({"X": int_vals})
 | 
						|
        B = DataFrame({"Y": float_vals})
 | 
						|
        expected = DataFrame(exp_vals)
 | 
						|
 | 
						|
        result = A.merge(B, left_on="X", right_on="Y")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        result = B.merge(A, left_on="Y", right_on="X")
 | 
						|
        tm.assert_frame_equal(result, expected[["Y", "X"]])
 | 
						|
 | 
						|
    def test_merge_key_dtype_cast(self):
 | 
						|
        # GH 17044
 | 
						|
        df1 = DataFrame({"key": [1.0, 2.0], "v1": [10, 20]}, columns=["key", "v1"])
 | 
						|
        df2 = DataFrame({"key": [2], "v2": [200]}, columns=["key", "v2"])
 | 
						|
        result = df1.merge(df2, on="key", how="left")
 | 
						|
        expected = DataFrame(
 | 
						|
            {"key": [1.0, 2.0], "v1": [10, 20], "v2": [np.nan, 200.0]},
 | 
						|
            columns=["key", "v1", "v2"],
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_on_ints_floats_warning(self):
 | 
						|
        # GH 16572
 | 
						|
        # merge will produce a warning when merging on int and
 | 
						|
        # float columns where the float values are not exactly
 | 
						|
        # equal to their int representation
 | 
						|
        A = DataFrame({"X": [1, 2, 3]})
 | 
						|
        B = DataFrame({"Y": [1.1, 2.5, 3.0]})
 | 
						|
        expected = DataFrame({"X": [3], "Y": [3.0]})
 | 
						|
 | 
						|
        with tm.assert_produces_warning(UserWarning):
 | 
						|
            result = A.merge(B, left_on="X", right_on="Y")
 | 
						|
            tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        with tm.assert_produces_warning(UserWarning):
 | 
						|
            result = B.merge(A, left_on="Y", right_on="X")
 | 
						|
            tm.assert_frame_equal(result, expected[["Y", "X"]])
 | 
						|
 | 
						|
        # test no warning if float has NaNs
 | 
						|
        B = DataFrame({"Y": [np.nan, np.nan, 3.0]})
 | 
						|
 | 
						|
        with tm.assert_produces_warning(None):
 | 
						|
            result = B.merge(A, left_on="Y", right_on="X")
 | 
						|
            tm.assert_frame_equal(result, expected[["Y", "X"]])
 | 
						|
 | 
						|
    def test_merge_incompat_infer_boolean_object(self):
 | 
						|
        # GH21119: bool + object bool merge OK
 | 
						|
        df1 = DataFrame({"key": Series([True, False], dtype=object)})
 | 
						|
        df2 = DataFrame({"key": [True, False]})
 | 
						|
 | 
						|
        expected = DataFrame({"key": [True, False]}, dtype=object)
 | 
						|
        result = merge(df1, df2, on="key")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        result = merge(df2, df1, on="key")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_incompat_infer_boolean_object_with_missing(self):
 | 
						|
        # GH21119: bool + object bool merge OK
 | 
						|
        # with missing value
 | 
						|
        df1 = DataFrame({"key": Series([True, False, np.nan], dtype=object)})
 | 
						|
        df2 = DataFrame({"key": [True, False]})
 | 
						|
 | 
						|
        expected = DataFrame({"key": [True, False]}, dtype=object)
 | 
						|
        result = merge(df1, df2, on="key")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
        result = merge(df2, df1, on="key")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "df1_vals, df2_vals",
 | 
						|
        [
 | 
						|
            # merge on category coerces to object
 | 
						|
            ([0, 1, 2], Series(["a", "b", "a"]).astype("category")),
 | 
						|
            ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")),
 | 
						|
            # no not infer
 | 
						|
            ([0, 1], Series([False, True], dtype=object)),
 | 
						|
            ([0, 1], Series([False, True], dtype=bool)),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
 | 
						|
        # these are explicitly allowed incompat merges, that pass thru
 | 
						|
        # the result type is dependent on if the values on the rhs are
 | 
						|
        # inferred, otherwise these will be coerced to object
 | 
						|
 | 
						|
        df1 = DataFrame({"A": df1_vals})
 | 
						|
        df2 = DataFrame({"A": df2_vals})
 | 
						|
 | 
						|
        result = merge(df1, df2, on=["A"])
 | 
						|
        assert is_object_dtype(result.A.dtype)
 | 
						|
        result = merge(df2, df1, on=["A"])
 | 
						|
        assert is_object_dtype(result.A.dtype)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "df1_vals, df2_vals",
 | 
						|
        [
 | 
						|
            # do not infer to numeric
 | 
						|
            (Series([1, 2], dtype="uint64"), ["a", "b", "c"]),
 | 
						|
            (Series([1, 2], dtype="int32"), ["a", "b", "c"]),
 | 
						|
            ([0, 1, 2], ["0", "1", "2"]),
 | 
						|
            ([0.0, 1.0, 2.0], ["0", "1", "2"]),
 | 
						|
            ([0, 1, 2], ["0", "1", "2"]),
 | 
						|
            (
 | 
						|
                pd.date_range("1/1/2011", periods=2, freq="D"),
 | 
						|
                ["2011-01-01", "2011-01-02"],
 | 
						|
            ),
 | 
						|
            (pd.date_range("1/1/2011", periods=2, freq="D"), [0, 1]),
 | 
						|
            (pd.date_range("1/1/2011", periods=2, freq="D"), [0.0, 1.0]),
 | 
						|
            (
 | 
						|
                pd.date_range("20130101", periods=3),
 | 
						|
                pd.date_range("20130101", periods=3, tz="US/Eastern"),
 | 
						|
            ),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
 | 
						|
        # GH 9780, GH 15800
 | 
						|
        # Raise a ValueError when a user tries to merge on
 | 
						|
        # dtypes that are incompatible (e.g., obj and int/float)
 | 
						|
 | 
						|
        df1 = DataFrame({"A": df1_vals})
 | 
						|
        df2 = DataFrame({"A": df2_vals})
 | 
						|
 | 
						|
        msg = (
 | 
						|
            f"You are trying to merge on {df1['A'].dtype} and "
 | 
						|
            f"{df2['A'].dtype} columns. If you wish to proceed "
 | 
						|
            "you should use pd.concat"
 | 
						|
        )
 | 
						|
        msg = re.escape(msg)
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(df1, df2, on=["A"])
 | 
						|
 | 
						|
        # Check that error still raised when swapping order of dataframes
 | 
						|
        msg = (
 | 
						|
            f"You are trying to merge on {df2['A'].dtype} and "
 | 
						|
            f"{df1['A'].dtype} columns. If you wish to proceed "
 | 
						|
            "you should use pd.concat"
 | 
						|
        )
 | 
						|
        msg = re.escape(msg)
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            merge(df2, df1, on=["A"])
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "expected_data, how",
 | 
						|
        [
 | 
						|
            ([1, 2], "outer"),
 | 
						|
            ([], "inner"),
 | 
						|
            ([2], "right"),
 | 
						|
            ([1], "left"),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_EA_dtype(self, any_numeric_ea_dtype, how, expected_data):
 | 
						|
        # GH#40073
 | 
						|
        d1 = DataFrame([(1,)], columns=["id"], dtype=any_numeric_ea_dtype)
 | 
						|
        d2 = DataFrame([(2,)], columns=["id"], dtype=any_numeric_ea_dtype)
 | 
						|
        result = merge(d1, d2, how=how)
 | 
						|
        expected = DataFrame(expected_data, columns=["id"], dtype=any_numeric_ea_dtype)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "expected_data, how",
 | 
						|
        [
 | 
						|
            (["a", "b"], "outer"),
 | 
						|
            ([], "inner"),
 | 
						|
            (["b"], "right"),
 | 
						|
            (["a"], "left"),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
 | 
						|
        # GH#40073
 | 
						|
        d1 = DataFrame([("a",)], columns=["id"], dtype=any_string_dtype)
 | 
						|
        d2 = DataFrame([("b",)], columns=["id"], dtype=any_string_dtype)
 | 
						|
        result = merge(d1, d2, how=how)
 | 
						|
        expected = DataFrame(expected_data, columns=["id"], dtype=any_string_dtype)
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "how, expected_data",
 | 
						|
        [
 | 
						|
            ("inner", [[True, 1, 4], [False, 5, 3]]),
 | 
						|
            ("outer", [[True, 1, 4], [False, 5, 3]]),
 | 
						|
            ("left", [[True, 1, 4], [False, 5, 3]]),
 | 
						|
            ("right", [[False, 5, 3], [True, 1, 4]]),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_bool_dtype(self, how, expected_data):
 | 
						|
        # GH#40073
 | 
						|
        df1 = DataFrame({"A": [True, False], "B": [1, 5]})
 | 
						|
        df2 = DataFrame({"A": [False, True], "C": [3, 4]})
 | 
						|
        result = merge(df1, df2, how=how)
 | 
						|
        expected = DataFrame(expected_data, columns=["A", "B", "C"])
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_ea_with_string(self, join_type, string_dtype):
 | 
						|
        # GH 43734 Avoid the use of `assign` with multi-index
 | 
						|
        df1 = DataFrame(
 | 
						|
            data={
 | 
						|
                ("lvl0", "lvl1-a"): ["1", "2", "3", "4", None],
 | 
						|
                ("lvl0", "lvl1-b"): ["4", "5", "6", "7", "8"],
 | 
						|
            },
 | 
						|
            dtype=pd.StringDtype(),
 | 
						|
        )
 | 
						|
        df1_copy = df1.copy()
 | 
						|
        df2 = DataFrame(
 | 
						|
            data={
 | 
						|
                ("lvl0", "lvl1-a"): ["1", "2", "3", pd.NA, "5"],
 | 
						|
                ("lvl0", "lvl1-c"): ["7", "8", "9", pd.NA, "11"],
 | 
						|
            },
 | 
						|
            dtype=string_dtype,
 | 
						|
        )
 | 
						|
        df2_copy = df2.copy()
 | 
						|
        merged = merge(left=df1, right=df2, on=[("lvl0", "lvl1-a")], how=join_type)
 | 
						|
 | 
						|
        # No change in df1 and df2
 | 
						|
        tm.assert_frame_equal(df1, df1_copy)
 | 
						|
        tm.assert_frame_equal(df2, df2_copy)
 | 
						|
 | 
						|
        # Check the expected types for the merged data frame
 | 
						|
        expected = Series(
 | 
						|
            [np.dtype("O"), pd.StringDtype(), np.dtype("O")],
 | 
						|
            index=MultiIndex.from_tuples(
 | 
						|
                [("lvl0", "lvl1-a"), ("lvl0", "lvl1-b"), ("lvl0", "lvl1-c")]
 | 
						|
            ),
 | 
						|
        )
 | 
						|
        tm.assert_series_equal(merged.dtypes, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def left():
 | 
						|
    np.random.seed(1234)
 | 
						|
    return DataFrame(
 | 
						|
        {
 | 
						|
            "X": Series(np.random.choice(["foo", "bar"], size=(10,))).astype(
 | 
						|
                CDT(["foo", "bar"])
 | 
						|
            ),
 | 
						|
            "Y": np.random.choice(["one", "two", "three"], size=(10,)),
 | 
						|
        }
 | 
						|
    )
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def right():
 | 
						|
    np.random.seed(1234)
 | 
						|
    return DataFrame(
 | 
						|
        {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]}
 | 
						|
    )
 | 
						|
 | 
						|
 | 
						|
class TestMergeCategorical:
 | 
						|
    def test_identical(self, left):
 | 
						|
        # merging on the same, should preserve dtypes
 | 
						|
        merged = merge(left, left, on="X")
 | 
						|
        result = merged.dtypes.sort_index()
 | 
						|
        expected = Series(
 | 
						|
            [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")],
 | 
						|
            index=["X", "Y_x", "Y_y"],
 | 
						|
        )
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    def test_basic(self, left, right):
 | 
						|
        # we have matching Categorical dtypes in X
 | 
						|
        # so should preserve the merged column
 | 
						|
        merged = merge(left, right, on="X")
 | 
						|
        result = merged.dtypes.sort_index()
 | 
						|
        expected = Series(
 | 
						|
            [
 | 
						|
                CategoricalDtype(categories=["foo", "bar"]),
 | 
						|
                np.dtype("O"),
 | 
						|
                np.dtype("int64"),
 | 
						|
            ],
 | 
						|
            index=["X", "Y", "Z"],
 | 
						|
        )
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    def test_merge_categorical(self):
 | 
						|
        # GH 9426
 | 
						|
 | 
						|
        right = DataFrame(
 | 
						|
            {
 | 
						|
                "c": {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"},
 | 
						|
                "d": {0: "null", 1: "null", 2: "null", 3: "null", 4: "null"},
 | 
						|
            }
 | 
						|
        )
 | 
						|
        left = DataFrame(
 | 
						|
            {
 | 
						|
                "a": {0: "f", 1: "f", 2: "f", 3: "f", 4: "f"},
 | 
						|
                "b": {0: "g", 1: "g", 2: "g", 3: "g", 4: "g"},
 | 
						|
            }
 | 
						|
        )
 | 
						|
        df = merge(left, right, how="left", left_on="b", right_on="c")
 | 
						|
 | 
						|
        # object-object
 | 
						|
        expected = df.copy()
 | 
						|
 | 
						|
        # object-cat
 | 
						|
        # note that we propagate the category
 | 
						|
        # because we don't have any matching rows
 | 
						|
        cright = right.copy()
 | 
						|
        cright["d"] = cright["d"].astype("category")
 | 
						|
        result = merge(left, cright, how="left", left_on="b", right_on="c")
 | 
						|
        expected["d"] = expected["d"].astype(CategoricalDtype(["null"]))
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # cat-object
 | 
						|
        cleft = left.copy()
 | 
						|
        cleft["b"] = cleft["b"].astype("category")
 | 
						|
        result = merge(cleft, cright, how="left", left_on="b", right_on="c")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
        # cat-cat
 | 
						|
        cright = right.copy()
 | 
						|
        cright["d"] = cright["d"].astype("category")
 | 
						|
        cleft = left.copy()
 | 
						|
        cleft["b"] = cleft["b"].astype("category")
 | 
						|
        result = merge(cleft, cright, how="left", left_on="b", right_on="c")
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def tests_merge_categorical_unordered_equal(self):
 | 
						|
        # GH-19551
 | 
						|
        df1 = DataFrame(
 | 
						|
            {
 | 
						|
                "Foo": Categorical(["A", "B", "C"], categories=["A", "B", "C"]),
 | 
						|
                "Left": ["A0", "B0", "C0"],
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        df2 = DataFrame(
 | 
						|
            {
 | 
						|
                "Foo": Categorical(["C", "B", "A"], categories=["C", "B", "A"]),
 | 
						|
                "Right": ["C1", "B1", "A1"],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        result = merge(df1, df2, on=["Foo"])
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "Foo": Categorical(["A", "B", "C"]),
 | 
						|
                "Left": ["A0", "B0", "C0"],
 | 
						|
                "Right": ["A1", "B1", "C1"],
 | 
						|
            }
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("ordered", [True, False])
 | 
						|
    def test_multiindex_merge_with_unordered_categoricalindex(self, ordered):
 | 
						|
        # GH 36973
 | 
						|
        pcat = CategoricalDtype(categories=["P2", "P1"], ordered=ordered)
 | 
						|
        df1 = DataFrame(
 | 
						|
            {
 | 
						|
                "id": ["C", "C", "D"],
 | 
						|
                "p": Categorical(["P2", "P1", "P2"], dtype=pcat),
 | 
						|
                "a": [0, 1, 2],
 | 
						|
            }
 | 
						|
        ).set_index(["id", "p"])
 | 
						|
        df2 = DataFrame(
 | 
						|
            {
 | 
						|
                "id": ["A", "C", "C"],
 | 
						|
                "p": Categorical(["P2", "P2", "P1"], dtype=pcat),
 | 
						|
                "d1": [10, 11, 12],
 | 
						|
            }
 | 
						|
        ).set_index(["id", "p"])
 | 
						|
        result = merge(df1, df2, how="left", left_index=True, right_index=True)
 | 
						|
        expected = DataFrame(
 | 
						|
            {
 | 
						|
                "id": ["C", "C", "D"],
 | 
						|
                "p": Categorical(["P2", "P1", "P2"], dtype=pcat),
 | 
						|
                "a": [0, 1, 2],
 | 
						|
                "d1": [11.0, 12.0, np.nan],
 | 
						|
            }
 | 
						|
        ).set_index(["id", "p"])
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    def test_other_columns(self, left, right):
 | 
						|
        # non-merge columns should preserve if possible
 | 
						|
        right = right.assign(Z=right.Z.astype("category"))
 | 
						|
 | 
						|
        merged = merge(left, right, on="X")
 | 
						|
        result = merged.dtypes.sort_index()
 | 
						|
        expected = Series(
 | 
						|
            [
 | 
						|
                CategoricalDtype(categories=["foo", "bar"]),
 | 
						|
                np.dtype("O"),
 | 
						|
                CategoricalDtype(categories=[1, 2]),
 | 
						|
            ],
 | 
						|
            index=["X", "Y", "Z"],
 | 
						|
        )
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
        # categories are preserved
 | 
						|
        assert left.X.values._categories_match_up_to_permutation(merged.X.values)
 | 
						|
        assert right.Z.values._categories_match_up_to_permutation(merged.Z.values)
 | 
						|
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "change",
 | 
						|
        [
 | 
						|
            lambda x: x,
 | 
						|
            lambda x: x.astype(CDT(["foo", "bar", "bah"])),
 | 
						|
            lambda x: x.astype(CDT(ordered=True)),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_dtype_on_merged_different(self, change, join_type, left, right):
 | 
						|
        # our merging columns, X now has 2 different dtypes
 | 
						|
        # so we must be object as a result
 | 
						|
 | 
						|
        X = change(right.X.astype("object"))
 | 
						|
        right = right.assign(X=X)
 | 
						|
        assert is_categorical_dtype(left.X.values.dtype)
 | 
						|
        # assert not left.X.values._categories_match_up_to_permutation(right.X.values)
 | 
						|
 | 
						|
        merged = merge(left, right, on="X", how=join_type)
 | 
						|
 | 
						|
        result = merged.dtypes.sort_index()
 | 
						|
        expected = Series(
 | 
						|
            [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"]
 | 
						|
        )
 | 
						|
        tm.assert_series_equal(result, expected)
 | 
						|
 | 
						|
    def test_self_join_multiple_categories(self):
 | 
						|
        # GH 16767
 | 
						|
        # non-duplicates should work with multiple categories
 | 
						|
        m = 5
 | 
						|
        df = DataFrame(
 | 
						|
            {
 | 
						|
                "a": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] * m,
 | 
						|
                "b": ["t", "w", "x", "y", "z"] * 2 * m,
 | 
						|
                "c": [
 | 
						|
                    letter
 | 
						|
                    for each in ["m", "n", "u", "p", "o"]
 | 
						|
                    for letter in [each] * 2 * m
 | 
						|
                ],
 | 
						|
                "d": [
 | 
						|
                    letter
 | 
						|
                    for each in [
 | 
						|
                        "aa",
 | 
						|
                        "bb",
 | 
						|
                        "cc",
 | 
						|
                        "dd",
 | 
						|
                        "ee",
 | 
						|
                        "ff",
 | 
						|
                        "gg",
 | 
						|
                        "hh",
 | 
						|
                        "ii",
 | 
						|
                        "jj",
 | 
						|
                    ]
 | 
						|
                    for letter in [each] * m
 | 
						|
                ],
 | 
						|
            }
 | 
						|
        )
 | 
						|
 | 
						|
        # change them all to categorical variables
 | 
						|
        df = df.apply(lambda x: x.astype("category"))
 | 
						|
 | 
						|
        # self-join should equal ourselves
 | 
						|
        result = merge(df, df, on=list(df.columns))
 | 
						|
 | 
						|
        tm.assert_frame_equal(result, df)
 | 
						|
 | 
						|
    def test_dtype_on_categorical_dates(self):
 | 
						|
        # GH 16900
 | 
						|
        # dates should not be coerced to ints
 | 
						|
 | 
						|
        df = DataFrame(
 | 
						|
            [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"]
 | 
						|
        )
 | 
						|
        df["date"] = df["date"].astype("category")
 | 
						|
 | 
						|
        df2 = DataFrame(
 | 
						|
            [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"]
 | 
						|
        )
 | 
						|
        df2["date"] = df2["date"].astype("category")
 | 
						|
 | 
						|
        expected_outer = DataFrame(
 | 
						|
            [
 | 
						|
                [pd.Timestamp("2001-01-01").date(), 1.1, 1.3],
 | 
						|
                [pd.Timestamp("2001-01-02").date(), 1.3, np.nan],
 | 
						|
                [pd.Timestamp("2001-01-03").date(), np.nan, 1.4],
 | 
						|
            ],
 | 
						|
            columns=["date", "num2", "num4"],
 | 
						|
        )
 | 
						|
        result_outer = merge(df, df2, how="outer", on=["date"])
 | 
						|
        tm.assert_frame_equal(result_outer, expected_outer)
 | 
						|
 | 
						|
        expected_inner = DataFrame(
 | 
						|
            [[pd.Timestamp("2001-01-01").date(), 1.1, 1.3]],
 | 
						|
            columns=["date", "num2", "num4"],
 | 
						|
        )
 | 
						|
        result_inner = merge(df, df2, how="inner", on=["date"])
 | 
						|
        tm.assert_frame_equal(result_inner, expected_inner)
 | 
						|
 | 
						|
    @pytest.mark.parametrize("ordered", [True, False])
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "category_column,categories,expected_categories",
 | 
						|
        [
 | 
						|
            ([False, True, True, False], [True, False], [True, False]),
 | 
						|
            ([2, 1, 1, 2], [1, 2], [1, 2]),
 | 
						|
            (["False", "True", "True", "False"], ["True", "False"], ["True", "False"]),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merging_with_bool_or_int_cateorical_column(
 | 
						|
        self, category_column, categories, expected_categories, ordered
 | 
						|
    ):
 | 
						|
        # GH 17187
 | 
						|
        # merging with a boolean/int categorical column
 | 
						|
        df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column})
 | 
						|
        df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered))
 | 
						|
        df2 = DataFrame({"id": [2, 4], "num": [1, 9]})
 | 
						|
        result = df1.merge(df2)
 | 
						|
        expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]})
 | 
						|
        expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered))
 | 
						|
        tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    def test_merge_on_int_array(self):
 | 
						|
        # GH 23020
 | 
						|
        df = DataFrame({"A": Series([1, 2, np.nan], dtype="Int64"), "B": 1})
 | 
						|
        result = merge(df, df, on="A")
 | 
						|
        expected = DataFrame(
 | 
						|
            {"A": Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1}
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def left_df():
 | 
						|
    return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
 | 
						|
def right_df():
 | 
						|
    return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
 | 
						|
 | 
						|
 | 
						|
class TestMergeOnIndexes:
 | 
						|
    @pytest.mark.parametrize(
 | 
						|
        "how, sort, expected",
 | 
						|
        [
 | 
						|
            ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
 | 
						|
            ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
 | 
						|
            (
 | 
						|
                "left",
 | 
						|
                False,
 | 
						|
                DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                "left",
 | 
						|
                True,
 | 
						|
                DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                "right",
 | 
						|
                False,
 | 
						|
                DataFrame(
 | 
						|
                    {"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                "right",
 | 
						|
                True,
 | 
						|
                DataFrame(
 | 
						|
                    {"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                "outer",
 | 
						|
                False,
 | 
						|
                DataFrame(
 | 
						|
                    {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
 | 
						|
                    index=[0, 1, 2, 3],
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
            (
 | 
						|
                "outer",
 | 
						|
                True,
 | 
						|
                DataFrame(
 | 
						|
                    {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
 | 
						|
                    index=[0, 1, 2, 3],
 | 
						|
                ),
 | 
						|
            ),
 | 
						|
        ],
 | 
						|
    )
 | 
						|
    def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
 | 
						|
        result = merge(
 | 
						|
            left_df, right_df, left_index=True, right_index=True, how=how, sort=sort
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "index",
 | 
						|
    [
 | 
						|
        CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"),
 | 
						|
        Float64Index([1.0, 2.0], name="index_col"),
 | 
						|
        Int64Index([1, 2], name="index_col"),
 | 
						|
        UInt64Index([1, 2], name="index_col"),
 | 
						|
        RangeIndex(start=0, stop=2, name="index_col"),
 | 
						|
        DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"),
 | 
						|
    ],
 | 
						|
    ids=lambda x: type(x).__name__,
 | 
						|
)
 | 
						|
def test_merge_index_types(index):
 | 
						|
    # gh-20777
 | 
						|
    # assert key access is consistent across index types
 | 
						|
    left = DataFrame({"left_data": [1, 2]}, index=index)
 | 
						|
    right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
 | 
						|
 | 
						|
    result = left.merge(right, on=["index_col"])
 | 
						|
 | 
						|
    expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "on,left_on,right_on,left_index,right_index,nm",
 | 
						|
    [
 | 
						|
        (["outer", "inner"], None, None, False, False, "B"),
 | 
						|
        (None, None, None, True, True, "B"),
 | 
						|
        (None, ["outer", "inner"], None, False, True, "B"),
 | 
						|
        (None, None, ["outer", "inner"], True, False, "B"),
 | 
						|
        (["outer", "inner"], None, None, False, False, None),
 | 
						|
        (None, None, None, True, True, None),
 | 
						|
        (None, ["outer", "inner"], None, False, True, None),
 | 
						|
        (None, None, ["outer", "inner"], True, False, None),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
 | 
						|
    # GH 21220
 | 
						|
    a = DataFrame(
 | 
						|
        {"A": [1, 2, 3, 4]},
 | 
						|
        index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
 | 
						|
    )
 | 
						|
    b = Series(
 | 
						|
        [1, 2, 3, 4],
 | 
						|
        index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
 | 
						|
        name=nm,
 | 
						|
    )
 | 
						|
    expected = DataFrame(
 | 
						|
        {"A": [2, 4], "B": [1, 3]},
 | 
						|
        index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
 | 
						|
    )
 | 
						|
    if nm is not None:
 | 
						|
        result = merge(
 | 
						|
            a,
 | 
						|
            b,
 | 
						|
            on=on,
 | 
						|
            left_on=left_on,
 | 
						|
            right_on=right_on,
 | 
						|
            left_index=left_index,
 | 
						|
            right_index=right_index,
 | 
						|
        )
 | 
						|
        tm.assert_frame_equal(result, expected)
 | 
						|
    else:
 | 
						|
        msg = "Cannot merge a Series without a name"
 | 
						|
        with pytest.raises(ValueError, match=msg):
 | 
						|
            result = merge(
 | 
						|
                a,
 | 
						|
                b,
 | 
						|
                on=on,
 | 
						|
                left_on=left_on,
 | 
						|
                right_on=right_on,
 | 
						|
                left_index=left_index,
 | 
						|
                right_index=right_index,
 | 
						|
            )
 | 
						|
 | 
						|
 | 
						|
def test_merge_series_multilevel():
 | 
						|
    # GH#47946
 | 
						|
    a = DataFrame(
 | 
						|
        {"A": [1, 2, 3, 4]},
 | 
						|
        index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
 | 
						|
    )
 | 
						|
    b = Series(
 | 
						|
        [1, 2, 3, 4],
 | 
						|
        index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
 | 
						|
        name=("B", "C"),
 | 
						|
    )
 | 
						|
    expected = DataFrame(
 | 
						|
        {"A": [2, 4], ("B", "C"): [1, 3]},
 | 
						|
        index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
 | 
						|
    )
 | 
						|
    with tm.assert_produces_warning(FutureWarning):
 | 
						|
        result = merge(a, b, on=["outer", "inner"])
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "col1, col2, kwargs, expected_cols",
 | 
						|
    [
 | 
						|
        (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]),
 | 
						|
        (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]),
 | 
						|
        (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]),
 | 
						|
        (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]),
 | 
						|
        ("a", 0, {"suffixes": (None, "_y")}, ["a", 0]),
 | 
						|
        (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]),
 | 
						|
        ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]),
 | 
						|
        ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]),
 | 
						|
        ("a", "b", {"suffixes": ("_x", None)}, ["a", "b"]),
 | 
						|
        ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]),
 | 
						|
        (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
 | 
						|
        ("a", "a", {}, ["a_x", "a_y"]),
 | 
						|
        (0, 0, {}, ["0_x", "0_y"]),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_suffix(col1, col2, kwargs, expected_cols):
 | 
						|
    # issue: 24782
 | 
						|
    a = DataFrame({col1: [1, 2, 3]})
 | 
						|
    b = DataFrame({col2: [4, 5, 6]})
 | 
						|
 | 
						|
    expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols)
 | 
						|
 | 
						|
    result = a.merge(b, left_index=True, right_index=True, **kwargs)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    result = merge(a, b, left_index=True, right_index=True, **kwargs)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "how,expected",
 | 
						|
    [
 | 
						|
        (
 | 
						|
            "right",
 | 
						|
            DataFrame(
 | 
						|
                {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]}
 | 
						|
            ),
 | 
						|
        ),
 | 
						|
        (
 | 
						|
            "outer",
 | 
						|
            DataFrame(
 | 
						|
                {
 | 
						|
                    "A": [100, 200, 1, 300],
 | 
						|
                    "B1": [60, 70, 80, np.nan],
 | 
						|
                    "B2": [600, 700, np.nan, 800],
 | 
						|
                }
 | 
						|
            ),
 | 
						|
        ),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_duplicate_suffix(how, expected):
 | 
						|
    left_df = DataFrame({"A": [100, 200, 1], "B": [60, 70, 80]})
 | 
						|
    right_df = DataFrame({"A": [100, 200, 300], "B": [600, 700, 800]})
 | 
						|
    result = merge(left_df, right_df, on="A", how=how, suffixes=("_x", "_x"))
 | 
						|
    expected.columns = ["A", "B_x", "B_x"]
 | 
						|
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "col1, col2, suffixes",
 | 
						|
    [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
 | 
						|
)
 | 
						|
def test_merge_suffix_error(col1, col2, suffixes):
 | 
						|
    # issue: 24782
 | 
						|
    a = DataFrame({col1: [1, 2, 3]})
 | 
						|
    b = DataFrame({col2: [3, 4, 5]})
 | 
						|
 | 
						|
    # TODO: might reconsider current raise behaviour, see issue 24782
 | 
						|
    msg = "columns overlap but no suffix specified"
 | 
						|
    with pytest.raises(ValueError, match=msg):
 | 
						|
        merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
 | 
						|
def test_merge_suffix_warns(suffixes):
 | 
						|
    a = DataFrame({"a": [1, 2, 3]})
 | 
						|
    b = DataFrame({"b": [3, 4, 5]})
 | 
						|
 | 
						|
    with tm.assert_produces_warning(FutureWarning):
 | 
						|
        merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"})
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "col1, col2, suffixes, msg",
 | 
						|
    [
 | 
						|
        ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"),
 | 
						|
        ("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_suffix_length_error(col1, col2, suffixes, msg):
 | 
						|
    a = DataFrame({col1: [1, 2, 3]})
 | 
						|
    b = DataFrame({col2: [3, 4, 5]})
 | 
						|
 | 
						|
    with pytest.raises(ValueError, match=msg):
 | 
						|
        merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("cat_dtype", ["one", "two"])
 | 
						|
@pytest.mark.parametrize("reverse", [True, False])
 | 
						|
def test_merge_equal_cat_dtypes(cat_dtype, reverse):
 | 
						|
    # see gh-22501
 | 
						|
    cat_dtypes = {
 | 
						|
        "one": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
 | 
						|
        "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
 | 
						|
    }
 | 
						|
 | 
						|
    df1 = DataFrame(
 | 
						|
        {"foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), "left": [1, 2, 3]}
 | 
						|
    ).set_index("foo")
 | 
						|
 | 
						|
    data_foo = ["a", "b", "c"]
 | 
						|
    data_right = [1, 2, 3]
 | 
						|
 | 
						|
    if reverse:
 | 
						|
        data_foo.reverse()
 | 
						|
        data_right.reverse()
 | 
						|
 | 
						|
    df2 = DataFrame(
 | 
						|
        {"foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), "right": data_right}
 | 
						|
    ).set_index("foo")
 | 
						|
 | 
						|
    result = df1.merge(df2, left_index=True, right_index=True)
 | 
						|
 | 
						|
    expected = DataFrame(
 | 
						|
        {
 | 
						|
            "left": [1, 2, 3],
 | 
						|
            "right": [1, 2, 3],
 | 
						|
            "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]),
 | 
						|
        }
 | 
						|
    ).set_index("foo")
 | 
						|
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_equal_cat_dtypes2():
 | 
						|
    # see gh-22501
 | 
						|
    cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False)
 | 
						|
 | 
						|
    # Test Data
 | 
						|
    df1 = DataFrame(
 | 
						|
        {"foo": Series(["a", "b"]).astype(cat_dtype), "left": [1, 2]}
 | 
						|
    ).set_index("foo")
 | 
						|
 | 
						|
    df2 = DataFrame(
 | 
						|
        {"foo": Series(["a", "b", "c"]).astype(cat_dtype), "right": [3, 2, 1]}
 | 
						|
    ).set_index("foo")
 | 
						|
 | 
						|
    result = df1.merge(df2, left_index=True, right_index=True)
 | 
						|
 | 
						|
    expected = DataFrame(
 | 
						|
        {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)}
 | 
						|
    ).set_index("foo")
 | 
						|
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_on_cat_and_ext_array():
 | 
						|
    # GH 28668
 | 
						|
    right = DataFrame(
 | 
						|
        {"a": Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval")}
 | 
						|
    )
 | 
						|
    left = right.copy()
 | 
						|
    left["a"] = left["a"].astype("category")
 | 
						|
 | 
						|
    result = merge(left, right, how="inner", on="a")
 | 
						|
    expected = right.copy()
 | 
						|
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_multiindex_columns():
 | 
						|
    # Issue #28518
 | 
						|
    # Verify that merging two dataframes give the expected labels
 | 
						|
    # The original cause of this issue come from a bug lexsort_depth and is tested in
 | 
						|
    # test_lexsort_depth
 | 
						|
 | 
						|
    letters = ["a", "b", "c", "d"]
 | 
						|
    numbers = ["1", "2", "3"]
 | 
						|
    index = MultiIndex.from_product((letters, numbers), names=["outer", "inner"])
 | 
						|
 | 
						|
    frame_x = DataFrame(columns=index)
 | 
						|
    frame_x["id"] = ""
 | 
						|
    frame_y = DataFrame(columns=index)
 | 
						|
    frame_y["id"] = ""
 | 
						|
 | 
						|
    l_suf = "_x"
 | 
						|
    r_suf = "_y"
 | 
						|
    result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf)))
 | 
						|
 | 
						|
    # Constructing the expected results
 | 
						|
    expected_labels = [letter + l_suf for letter in letters] + [
 | 
						|
        letter + r_suf for letter in letters
 | 
						|
    ]
 | 
						|
    expected_index = MultiIndex.from_product(
 | 
						|
        [expected_labels, numbers], names=["outer", "inner"]
 | 
						|
    )
 | 
						|
    expected = DataFrame(columns=expected_index)
 | 
						|
    expected["id"] = ""
 | 
						|
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_datetime_upcast_dtype():
 | 
						|
    # https://github.com/pandas-dev/pandas/issues/31208
 | 
						|
    df1 = DataFrame({"x": ["a", "b", "c"], "y": ["1", "2", "4"]})
 | 
						|
    df2 = DataFrame(
 | 
						|
        {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])}
 | 
						|
    )
 | 
						|
    result = merge(df1, df2, how="left", on="y")
 | 
						|
    expected = DataFrame(
 | 
						|
        {
 | 
						|
            "x": ["a", "b", "c"],
 | 
						|
            "y": ["1", "2", "4"],
 | 
						|
            "z": pd.to_datetime(["2000", "2001", "NaT"]),
 | 
						|
        }
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("n_categories", [5, 128])
 | 
						|
def test_categorical_non_unique_monotonic(n_categories):
 | 
						|
    # GH 28189
 | 
						|
    # With n_categories as 5, we test the int8 case is hit in libjoin,
 | 
						|
    # with n_categories as 128 we test the int16 case.
 | 
						|
    left_index = CategoricalIndex([0] + list(range(n_categories)))
 | 
						|
    df1 = DataFrame(range(n_categories + 1), columns=["value"], index=left_index)
 | 
						|
    df2 = DataFrame(
 | 
						|
        [[6]],
 | 
						|
        columns=["value"],
 | 
						|
        index=CategoricalIndex([0], categories=np.arange(n_categories)),
 | 
						|
    )
 | 
						|
 | 
						|
    result = merge(df1, df2, how="left", left_index=True, right_index=True)
 | 
						|
    expected = DataFrame(
 | 
						|
        [[i, 6.0] if i < 2 else [i, np.nan] for i in range(n_categories + 1)],
 | 
						|
        columns=["value_x", "value_y"],
 | 
						|
        index=left_index,
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
def test_merge_join_categorical_multiindex():
 | 
						|
    # From issue 16627
 | 
						|
    a = {
 | 
						|
        "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]),
 | 
						|
        "Int1": [0, 1, 0, 1, 0, 0],
 | 
						|
    }
 | 
						|
    a = DataFrame(a)
 | 
						|
 | 
						|
    b = {
 | 
						|
        "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]),
 | 
						|
        "Int": [0, 0, 0, 1, 1, 1],
 | 
						|
        "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],
 | 
						|
    }
 | 
						|
    b = DataFrame(b).set_index(["Cat", "Int"])["Factor"]
 | 
						|
 | 
						|
    expected = merge(
 | 
						|
        a,
 | 
						|
        b.reset_index(),
 | 
						|
        left_on=["Cat1", "Int1"],
 | 
						|
        right_on=["Cat", "Int"],
 | 
						|
        how="left",
 | 
						|
    )
 | 
						|
    expected = expected.drop(["Cat", "Int"], axis=1)
 | 
						|
    result = a.join(b, on=["Cat1", "Int1"])
 | 
						|
    tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
    # Same test, but with ordered categorical
 | 
						|
    a = {
 | 
						|
        "Cat1": Categorical(
 | 
						|
            ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True
 | 
						|
        ),
 | 
						|
        "Int1": [0, 1, 0, 1, 0, 0],
 | 
						|
    }
 | 
						|
    a = DataFrame(a)
 | 
						|
 | 
						|
    b = {
 | 
						|
        "Cat": Categorical(
 | 
						|
            ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True
 | 
						|
        ),
 | 
						|
        "Int": [0, 0, 0, 1, 1, 1],
 | 
						|
        "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],
 | 
						|
    }
 | 
						|
    b = DataFrame(b).set_index(["Cat", "Int"])["Factor"]
 | 
						|
 | 
						|
    expected = merge(
 | 
						|
        a,
 | 
						|
        b.reset_index(),
 | 
						|
        left_on=["Cat1", "Int1"],
 | 
						|
        right_on=["Cat", "Int"],
 | 
						|
        how="left",
 | 
						|
    )
 | 
						|
    expected = expected.drop(["Cat", "Int"], axis=1)
 | 
						|
    result = a.join(b, on=["Cat1", "Int1"])
 | 
						|
    tm.assert_frame_equal(expected, result)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("func", ["merge", "merge_asof"])
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    ("kwargs", "err_msg"),
 | 
						|
    [
 | 
						|
        ({"left_on": "a", "left_index": True}, ["left_on", "left_index"]),
 | 
						|
        ({"right_on": "a", "right_index": True}, ["right_on", "right_index"]),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_join_cols_error_reporting_duplicates(func, kwargs, err_msg):
 | 
						|
    # GH: 16228
 | 
						|
    left = DataFrame({"a": [1, 2], "b": [3, 4]})
 | 
						|
    right = DataFrame({"a": [1, 1], "c": [5, 6]})
 | 
						|
    msg = rf'Can only pass argument "{err_msg[0]}" OR "{err_msg[1]}" not both\.'
 | 
						|
    with pytest.raises(MergeError, match=msg):
 | 
						|
        getattr(pd, func)(left, right, **kwargs)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("func", ["merge", "merge_asof"])
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    ("kwargs", "err_msg"),
 | 
						|
    [
 | 
						|
        ({"left_on": "a"}, ["right_on", "right_index"]),
 | 
						|
        ({"right_on": "a"}, ["left_on", "left_index"]),
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_join_cols_error_reporting_missing(func, kwargs, err_msg):
 | 
						|
    # GH: 16228
 | 
						|
    left = DataFrame({"a": [1, 2], "b": [3, 4]})
 | 
						|
    right = DataFrame({"a": [1, 1], "c": [5, 6]})
 | 
						|
    msg = rf'Must pass "{err_msg[0]}" OR "{err_msg[1]}"\.'
 | 
						|
    with pytest.raises(MergeError, match=msg):
 | 
						|
        getattr(pd, func)(left, right, **kwargs)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("func", ["merge", "merge_asof"])
 | 
						|
@pytest.mark.parametrize(
 | 
						|
    "kwargs",
 | 
						|
    [
 | 
						|
        {"right_index": True},
 | 
						|
        {"left_index": True},
 | 
						|
    ],
 | 
						|
)
 | 
						|
def test_merge_join_cols_error_reporting_on_and_index(func, kwargs):
 | 
						|
    # GH: 16228
 | 
						|
    left = DataFrame({"a": [1, 2], "b": [3, 4]})
 | 
						|
    right = DataFrame({"a": [1, 1], "c": [5, 6]})
 | 
						|
    msg = (
 | 
						|
        r'Can only pass argument "on" OR "left_index" '
 | 
						|
        r'and "right_index", not a combination of both\.'
 | 
						|
    )
 | 
						|
    with pytest.raises(MergeError, match=msg):
 | 
						|
        getattr(pd, func)(left, right, on="a", **kwargs)
 | 
						|
 | 
						|
 | 
						|
def test_merge_right_left_index():
 | 
						|
    # GH#38616
 | 
						|
    left = DataFrame({"x": [1, 1], "z": ["foo", "foo"]})
 | 
						|
    right = DataFrame({"x": [1, 1], "z": ["foo", "foo"]})
 | 
						|
    result = merge(left, right, how="right", left_index=True, right_on="x")
 | 
						|
    expected = DataFrame(
 | 
						|
        {
 | 
						|
            "x": [1, 1],
 | 
						|
            "x_x": [1, 1],
 | 
						|
            "z_x": ["foo", "foo"],
 | 
						|
            "x_y": [1, 1],
 | 
						|
            "z_y": ["foo", "foo"],
 | 
						|
        }
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_result_empty_index_and_on():
 | 
						|
    # GH#33814
 | 
						|
    df1 = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"])
 | 
						|
    df2 = DataFrame({"b": [1]}).set_index(["b"])
 | 
						|
    expected = DataFrame({"a": [], "b": []}, dtype=np.int64).set_index(["a", "b"])
 | 
						|
    result = merge(df1, df2, left_on=["b"], right_index=True)
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    result = merge(df2, df1, left_index=True, right_on=["b"])
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_suffixes_produce_dup_columns_warns():
 | 
						|
    # GH#22818
 | 
						|
    left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
 | 
						|
    right = DataFrame({"a": [1, 2, 3], "b": 2})
 | 
						|
    expected = DataFrame(
 | 
						|
        [[1, 1, 2, 2], [2, 1, 2, 2], [3, 1, 2, 2]], columns=["a", "b_x", "b_x", "b_y"]
 | 
						|
    )
 | 
						|
    with tm.assert_produces_warning(FutureWarning):
 | 
						|
        result = merge(left, right, on="a")
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    with tm.assert_produces_warning(FutureWarning):
 | 
						|
        merge(right, left, on="a", suffixes=("_y", "_x"))
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_duplicate_columns_with_suffix_no_warning():
 | 
						|
    # GH#22818
 | 
						|
    # Do not raise warning when duplicates are caused by duplicates in origin
 | 
						|
    left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"])
 | 
						|
    right = DataFrame({"a": [1, 3], "b": 2})
 | 
						|
    result = merge(left, right, on="a")
 | 
						|
    expected = DataFrame([[1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_y"])
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_duplicate_columns_with_suffix_causing_another_duplicate():
 | 
						|
    # GH#22818
 | 
						|
    # This should raise warning because suffixes cause another collision
 | 
						|
    left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
 | 
						|
    right = DataFrame({"a": [1, 3], "b": 2})
 | 
						|
    with tm.assert_produces_warning(FutureWarning):
 | 
						|
        result = merge(left, right, on="a")
 | 
						|
    expected = DataFrame([[1, 1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_x", "b_y"])
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_string_float_column_result():
 | 
						|
    # GH 13353
 | 
						|
    df1 = DataFrame([[1, 2], [3, 4]], columns=pd.Index(["a", 114.0]))
 | 
						|
    df2 = DataFrame([[9, 10], [11, 12]], columns=["x", "y"])
 | 
						|
    result = merge(df2, df1, how="inner", left_index=True, right_index=True)
 | 
						|
    expected = DataFrame(
 | 
						|
        [[9, 10, 1, 2], [11, 12, 3, 4]], columns=pd.Index(["x", "y", "a", 114.0])
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_mergeerror_on_left_index_mismatched_dtypes():
 | 
						|
    # GH 22449
 | 
						|
    df_1 = DataFrame(data=["X"], columns=["C"], index=[22])
 | 
						|
    df_2 = DataFrame(data=["X"], columns=["C"], index=[999])
 | 
						|
    with pytest.raises(MergeError, match="Can only pass argument"):
 | 
						|
        merge(df_1, df_2, on=["C"], left_index=True)
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.parametrize("dtype", [None, "Int64"])
 | 
						|
def test_merge_outer_with_NaN(dtype):
 | 
						|
    # GH#43550
 | 
						|
    left = DataFrame({"key": [1, 2], "col1": [1, 2]}, dtype=dtype)
 | 
						|
    right = DataFrame({"key": [np.nan, np.nan], "col2": [3, 4]}, dtype=dtype)
 | 
						|
    result = merge(left, right, on="key", how="outer")
 | 
						|
    expected = DataFrame(
 | 
						|
        {
 | 
						|
            "key": [1, 2, np.nan, np.nan],
 | 
						|
            "col1": [1, 2, np.nan, np.nan],
 | 
						|
            "col2": [np.nan, np.nan, 3, 4],
 | 
						|
        },
 | 
						|
        dtype=dtype,
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
    # switch left and right
 | 
						|
    result = merge(right, left, on="key", how="outer")
 | 
						|
    expected = DataFrame(
 | 
						|
        {
 | 
						|
            "key": [np.nan, np.nan, 1, 2],
 | 
						|
            "col2": [3, 4, np.nan, np.nan],
 | 
						|
            "col1": [np.nan, np.nan, 1, 2],
 | 
						|
        },
 | 
						|
        dtype=dtype,
 | 
						|
    )
 | 
						|
    tm.assert_frame_equal(result, expected)
 | 
						|
 | 
						|
 | 
						|
def test_merge_different_index_names():
 | 
						|
    # GH#45094
 | 
						|
    left = DataFrame({"a": [1]}, index=pd.Index([1], name="c"))
 | 
						|
    right = DataFrame({"a": [1]}, index=pd.Index([1], name="d"))
 | 
						|
    result = merge(left, right, left_on="c", right_on="d")
 | 
						|
    expected = DataFrame({"a_x": [1], "a_y": 1})
 | 
						|
    tm.assert_frame_equal(result, expected)
 |