476 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			476 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from __future__ import annotations
 | 
						|
 | 
						|
import random
 | 
						|
from typing import (
 | 
						|
    TYPE_CHECKING,
 | 
						|
    Hashable,
 | 
						|
)
 | 
						|
 | 
						|
import matplotlib.lines as mlines
 | 
						|
import matplotlib.patches as patches
 | 
						|
import numpy as np
 | 
						|
 | 
						|
from pandas.core.dtypes.missing import notna
 | 
						|
 | 
						|
from pandas.io.formats.printing import pprint_thing
 | 
						|
from pandas.plotting._matplotlib.style import get_standard_colors
 | 
						|
from pandas.plotting._matplotlib.tools import (
 | 
						|
    create_subplots,
 | 
						|
    do_adjust_figure,
 | 
						|
    maybe_adjust_figure,
 | 
						|
    set_ticks_props,
 | 
						|
)
 | 
						|
 | 
						|
if TYPE_CHECKING:
 | 
						|
    from matplotlib.axes import Axes
 | 
						|
    from matplotlib.figure import Figure
 | 
						|
 | 
						|
    from pandas import (
 | 
						|
        DataFrame,
 | 
						|
        Series,
 | 
						|
    )
 | 
						|
 | 
						|
 | 
						|
def scatter_matrix(
    frame: DataFrame,
    alpha=0.5,
    figsize=None,
    ax=None,
    grid=False,
    diagonal="hist",
    marker=".",
    density_kwds=None,
    hist_kwds=None,
    range_padding=0.05,
    **kwds,
):
    """
    Draw an n-by-n grid of pairwise plots for the numeric columns of ``frame``.

    Off-diagonal cell ``(i, j)`` is a scatter plot of column ``j`` (x) against
    column ``i`` (y), restricted to rows where both values are non-missing.
    Diagonal cells show the distribution of a single column.

    Parameters
    ----------
    frame : DataFrame
        Source data; only ``frame._get_numeric_data()`` is plotted.
    alpha : float, default 0.5
        Transparency forwarded to ``scatter``.
    figsize, ax
        Forwarded to ``create_subplots``.
    grid : bool, default False
        Accepted for API compatibility; not used by this implementation.
    diagonal : {"hist", "kde", "density"}, default "hist"
        What to draw on the diagonal. Any other value leaves the diagonal
        cells empty (only their x-limits are set).
    marker : str, default "."
        Scatter marker; replaced by ``"o"`` if it is not a known line marker.
    density_kwds, hist_kwds : dict, optional
        Extra keyword arguments for the KDE line / histogram on the diagonal.
    range_padding : float, default 0.05
        Fraction of each column's value range added to the axis limits
        (half on each side).
    **kwds
        Forwarded to every ``scatter`` call. ``edgecolors`` defaults to
        ``"none"``.

    Returns
    -------
    The 2-D collection of axes produced by ``create_subplots``, indexed as
    ``axes[row, column]``.
    """
    numeric = frame._get_numeric_data()
    columns = numeric.columns
    n = columns.size
    fig, axes = create_subplots(naxes=n * n, figsize=figsize, ax=ax, squeeze=False)

    # no gaps between subplots
    maybe_adjust_figure(fig, wspace=0, hspace=0)

    valid = notna(numeric)
    marker = _get_marker_compat(marker)

    if not hist_kwds:
        hist_kwds = {}
    if not density_kwds:
        density_kwds = {}

    # GH 14855
    kwds.setdefault("edgecolors", "none")

    def observed(name):
        # Non-missing values of a single column, as an ndarray.
        return numeric[name].values[valid[name].values]

    # Padded (low, high) axis limits, one pair per column.
    limits = []
    for name in columns:
        data = observed(name)
        low, high = np.min(data), np.max(data)
        pad = (high - low) * range_padding / 2
        limits.append((low - pad, high + pad))

    for row_idx, row_name in enumerate(columns):
        for col_idx, col_name in enumerate(columns):
            cell = axes[row_idx, col_idx]

            if row_idx != col_idx:
                # Only rows where both columns have a value can be plotted.
                both = (valid[row_name] & valid[col_name]).values
                cell.scatter(
                    numeric[col_name][both],
                    numeric[row_name][both],
                    marker=marker,
                    alpha=alpha,
                    **kwds,
                )
                cell.set_xlim(limits[col_idx])
                cell.set_ylim(limits[row_idx])
            else:
                data = observed(row_name)

                # The diagonal shows the distribution of one column.
                if diagonal == "hist":
                    cell.hist(data, **hist_kwds)
                elif diagonal in ("kde", "density"):
                    from scipy.stats import gaussian_kde

                    estimator = gaussian_kde(data)
                    support = np.linspace(data.min(), data.max(), 1000)
                    cell.plot(support, estimator.evaluate(support), **density_kwds)

                cell.set_xlim(limits[row_idx])

            cell.set_xlabel(col_name)
            cell.set_ylabel(row_name)

            # Only the left column keeps its y axis and only the bottom row
            # keeps its x axis.
            if col_idx != 0:
                cell.yaxis.set_visible(False)
            if row_idx != n - 1:
                cell.xaxis.set_visible(False)

    if len(columns) > 1:
        # The top-left cell is a diagonal plot, so its own y scale is not the
        # data scale of the first column. Borrow the tick locations of the
        # neighbouring scatter cell, map them into the top-left cell's y
        # range, and label them with the original data values.
        low, high = limits[0][0], limits[0][1]
        ticks = axes[0][1].yaxis.get_majorticklocs()
        ticks = ticks[(low <= ticks) & (ticks <= high)]
        fraction = (ticks - low) / (high - low)

        corner_ylim = axes[0][0].get_ylim()
        positions = fraction * (corner_ylim[1] - corner_ylim[0]) + corner_ylim[0]
        axes[0][0].yaxis.set_ticks(positions)

        as_int = ticks.astype(int)
        if np.all(ticks == as_int):
            # if all ticks are int
            ticks = as_int
        axes[0][0].yaxis.set_ticklabels(ticks)

    set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)

    return axes
 | 
						|
 | 
						|
 | 
						|
def _get_marker_compat(marker):
    """Return ``marker`` if it is a key of ``mlines.lineMarkers``, else ``"o"``."""
    return marker if marker in mlines.lineMarkers else "o"
 | 
						|
 | 
						|
 | 
						|
def radviz(
    frame: DataFrame,
    class_column,
    ax: Axes | None = None,
    color=None,
    colormap=None,
    **kwds,
) -> Axes:
    """
    Draw a RadViz plot of ``frame`` coloured by ``class_column``.

    Every column except ``class_column`` is rescaled to ``[0, 1]`` and given
    an anchor evenly spaced on the unit circle. Each row is drawn at the
    average of the anchors weighted by its rescaled values.

    Parameters
    ----------
    frame : DataFrame
        Data to plot; all columns other than ``class_column`` are used as
        dimensions.
    class_column
        Label of the column holding the class of each row.
    ax : Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used and
        their limits are set to ``[-1, 1]`` in both directions.
    color, colormap
        Forwarded to ``get_standard_colors`` to pick one colour per class.
    **kwds
        Forwarded to every ``ax.scatter`` call.

    Returns
    -------
    Axes
        The axes that were drawn on.
    """
    import matplotlib.pyplot as plt

    def rescale(column):
        # Min-max scaling of one column to the unit interval.
        lowest = min(column)
        highest = max(column)
        return (column - lowest) / (highest - lowest)

    n = len(frame)
    classes = frame[class_column].drop_duplicates()
    class_col = frame[class_column]
    df = frame.drop(class_column, axis=1).apply(rescale)

    if ax is None:
        ax = plt.gca()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)

    colors = get_standard_colors(
        num_colors=len(classes), colormap=colormap, color_type="random", color=color
    )

    # Per class: [x coordinates, y coordinates] of the projected rows.
    to_plot: dict[Hashable, list[list]] = {kls: [[], []] for kls in classes}

    # One anchor per dimension, evenly spaced around the unit circle.
    m = len(frame.columns) - 1
    angles = [2 * np.pi * (i / m) for i in range(m)]
    s = np.array([(np.cos(angle), np.sin(angle)) for angle in angles])

    for position in range(n):
        weights = df.iloc[position].values
        # Duplicate the weights into two columns so they line up with the
        # (x, y) columns of the anchor array.
        stacked = np.repeat(np.expand_dims(weights, axis=1), 2, axis=1)
        point = (s * stacked).sum(axis=0) / weights.sum()
        xs, ys = to_plot[class_col.iat[position]]
        xs.append(point[0])
        ys.append(point[1])

    for i, kls in enumerate(classes):
        xs, ys = to_plot[kls]
        ax.scatter(xs, ys, color=colors[i], label=pprint_thing(kls), **kwds)
    ax.legend()

    ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none"))

    for xy, name in zip(s, df.columns):
        ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray"))

        # Push each label outwards from its anchor: left of it on the left
        # half of the circle, below it on the lower half, and so on.
        if xy[0] < 0.0:
            dx, ha = -0.025, "right"
        else:
            dx, ha = 0.025, "left"
        if xy[1] < 0.0:
            dy, va = -0.025, "top"
        else:
            dy, va = 0.025, "bottom"

        ax.text(xy[0] + dx, xy[1] + dy, name, ha=ha, va=va, size="small")

    ax.axis("equal")
    return ax
 | 
						|
 | 
						|
 | 
						|
def andrews_curves(
    frame: DataFrame,
    class_column,
    ax: Axes | None = None,
    samples: int = 200,
    color=None,
    colormap=None,
    **kwds,
) -> Axes:
    """
    Draw one Andrews curve per row of ``frame``, coloured by ``class_column``.

    A row ``(x1, x2, x3, ...)`` is mapped to the finite Fourier series

        f(t) = x1 / sqrt(2) + x2 sin(t) + x3 cos(t) + x4 sin(2t) + x5 cos(2t) + ...

    evaluated at ``samples`` evenly spaced points on ``[-pi, pi]``.

    Parameters
    ----------
    frame : DataFrame
        Data to plot; all columns other than ``class_column`` are used as
        coefficients, in column order.
    class_column
        Label of the column holding the class of each row.
    ax : Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used and
        their x-limits are set to ``[-pi, pi]``.
    samples : int, default 200
        Number of points at which each curve is evaluated.
    color, colormap
        Forwarded to ``get_standard_colors`` to pick one colour per class.
    **kwds
        Forwarded to every ``ax.plot`` call.

    Returns
    -------
    Axes
        The axes that were drawn on.
    """
    import matplotlib.pyplot as plt

    def function(amplitudes):
        # Build the curve f(t) for one row of coefficients.
        x1 = amplitudes[0]

        # Pair up the remaining coefficients as (sin, cos) per harmonic.
        # With an odd number of them, the last cosine term has no
        # coefficient and must be 0. np.resize would instead fill the gap by
        # repeating the data, silently reusing the first sine coefficient.
        rest = np.asarray(amplitudes)[1:]
        if rest.size % 2:
            rest = np.append(rest, 0)
        coeffs = rest.reshape(rest.size // 2, 2)

        # Harmonic numbers 1, 2, ..., one per (sin, cos) pair.
        harmonics = np.arange(0, coeffs.shape[0]) + 1

        def f(t):
            # Row k of trig_args holds (k + 1) * t for every sample point.
            trig_args = np.outer(harmonics, t)
            return x1 / np.sqrt(2.0) + np.sum(
                coeffs[:, 0, np.newaxis] * np.sin(trig_args)
                + coeffs[:, 1, np.newaxis] * np.cos(trig_args),
                axis=0,
            )

        return f

    n = len(frame)
    class_col = frame[class_column]
    classes = frame[class_column].drop_duplicates()
    df = frame.drop(class_column, axis=1)
    t = np.linspace(-np.pi, np.pi, samples)
    used_legends: set[str] = set()

    color_values = get_standard_colors(
        num_colors=len(classes), colormap=colormap, color_type="random", color=color
    )
    colors = dict(zip(classes, color_values))
    if ax is None:
        ax = plt.gca()
        ax.set_xlim(-np.pi, np.pi)
    for i in range(n):
        row = df.iloc[i].values
        y = function(row)(t)
        kls = class_col.iat[i]
        label = pprint_thing(kls)
        # Only the first curve of each class carries a label, so the legend
        # shows one entry per class.
        if label not in used_legends:
            used_legends.add(label)
            ax.plot(t, y, color=colors[kls], label=label, **kwds)
        else:
            ax.plot(t, y, color=colors[kls], **kwds)

    ax.legend(loc="upper right")
    ax.grid()
    return ax
 | 
						|
 | 
						|
 | 
						|
def bootstrap_plot(
    series: Series,
    fig: Figure | None = None,
    size: int = 50,
    samples: int = 500,
    **kwds,
) -> Figure:
    """
    Plot the mean, median and midrange of random subsets of ``series``.

    ``samples`` subsets of ``size`` elements each are drawn with
    ``random.sample``. The figure gets a 2-by-3 grid: the top row plots each
    statistic against the sample number, the bottom row shows a histogram of
    each statistic.

    Parameters
    ----------
    series : Series
        Data to sample from.
    fig : Figure, optional
        Figure to draw on; a new pyplot figure is created when omitted.
    size : int, default 50
        Number of elements in each random subset.
    samples : int, default 500
        Number of random subsets to draw.
    **kwds
        Forwarded to every ``plot`` and ``hist`` call.

    Returns
    -------
    Figure
        The figure that was drawn on.
    """
    import matplotlib.pyplot as plt

    # random.sample is given a plain list rather than the underlying ndarray.
    data = list(series.values)
    samplings = [random.sample(data, size) for _ in range(samples)]

    means = np.array([np.mean(sampling) for sampling in samplings])
    medians = np.array([np.median(sampling) for sampling in samplings])
    midranges = np.array(
        [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings]
    )
    if fig is None:
        fig = plt.figure()
    x = list(range(samples))
    statistics = (("Mean", means), ("Median", medians), ("Midrange", midranges))
    axes = []

    # Top row (subplots 1-3): each statistic against the sample number.
    for position, (_, values) in enumerate(statistics, start=1):
        axis = fig.add_subplot(2, 3, position)
        axis.set_xlabel("Sample")
        axes.append(axis)
        axis.plot(x, values, **kwds)

    # Bottom row (subplots 4-6): distribution of each statistic.
    for position, (name, values) in enumerate(statistics, start=4):
        axis = fig.add_subplot(2, 3, position)
        axis.set_xlabel(name)
        axes.append(axis)
        axis.hist(values, **kwds)

    for axis in axes:
        plt.setp(axis.get_xticklabels(), fontsize=8)
        plt.setp(axis.get_yticklabels(), fontsize=8)
    if do_adjust_figure(fig):
        # Lay out the figure we drew on; pyplot's current figure may be a
        # different one when the caller supplied ``fig``.
        fig.tight_layout()
    return fig
 | 
						|
 | 
						|
 | 
						|
def parallel_coordinates(
    frame: DataFrame,
    class_column,
    cols=None,
    ax: Axes | None = None,
    color=None,
    use_columns=False,
    xticks=None,
    colormap=None,
    axvlines: bool = True,
    axvlines_kwds=None,
    sort_labels: bool = False,
    **kwds,
) -> Axes:
    """
    Draw a parallel-coordinates plot: one polyline per row of ``frame``.

    Parameters
    ----------
    frame : DataFrame
        Data to plot.
    class_column
        Label of the column holding the class of each row.
    cols : optional
        Columns to plot. When omitted, every column except ``class_column``
        is used.
    ax : Axes, optional
        Axes to draw on; defaults to the current pyplot axes.
    color, colormap
        Forwarded to ``get_standard_colors`` to pick one colour per class.
    use_columns : bool, default False
        When exactly ``True``, the column labels themselves are used as x
        positions and must be numeric.
    xticks : optional
        Explicit x positions, one per plotted column. Only consulted when
        ``use_columns`` is not ``True``.
    axvlines : bool, default True
        Whether to draw a vertical line at every x position.
    axvlines_kwds : dict, optional
        Keyword arguments for those vertical lines. Defaults to a black line
        of width 1.
    sort_labels : bool, default False
        When set, both the class labels and the colour values are sorted
        before being paired up.
    **kwds
        Forwarded to every ``ax.plot`` call.

    Returns
    -------
    Axes
        The axes that were drawn on.

    Raises
    ------
    ValueError
        If the column labels (with ``use_columns``) or ``xticks`` are not
        numeric, or if ``xticks`` has a different length than the number of
        plotted columns.
    """
    import matplotlib.pyplot as plt

    if axvlines_kwds is None:
        axvlines_kwds = {"linewidth": 1, "color": "black"}

    n = len(frame)
    class_col = frame[class_column]
    classes = class_col.drop_duplicates()

    df = frame.drop(class_column, axis=1) if cols is None else frame[cols]

    ncols = len(df.columns)

    # determine values to use for xticks
    if use_columns is True:
        if not np.all(np.isreal(list(df.columns))):
            raise ValueError("Columns must be numeric to be used as xticks")
        x = df.columns
    elif xticks is None:
        x = list(range(ncols))
    else:
        if not np.all(np.isreal(xticks)):
            raise ValueError("xticks specified must be numeric")
        if len(xticks) != ncols:
            raise ValueError("Length of xticks must match number of columns")
        x = xticks

    if ax is None:
        ax = plt.gca()

    color_values = get_standard_colors(
        num_colors=len(classes), colormap=colormap, color_type="random", color=color
    )

    if sort_labels:
        classes = sorted(classes)
        color_values = sorted(color_values)
    colors = dict(zip(classes, color_values))

    # Labels already attached to a line; only the first line of each class
    # carries a label so the legend has one entry per class.
    used_legends: set[str] = set()

    for row_idx in range(n):
        y = df.iloc[row_idx].values
        kls = class_col.iat[row_idx]
        label = pprint_thing(kls)
        if label in used_legends:
            ax.plot(x, y, color=colors[kls], **kwds)
        else:
            used_legends.add(label)
            ax.plot(x, y, color=colors[kls], label=label, **kwds)

    if axvlines:
        for position in x:
            ax.axvline(position, **axvlines_kwds)

    ax.set_xticks(x)
    ax.set_xticklabels(df.columns)
    ax.set_xlim(x[0], x[-1])
    ax.legend(loc="upper right")
    ax.grid()
    return ax
 | 
						|
 | 
						|
 | 
						|
def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes:
    """
    Scatter ``series`` against itself shifted by ``lag`` positions.

    Parameters
    ----------
    series : Series
        Data to plot.
    lag : int, default 1
        Offset between the x value ``y(t)`` and the y value ``y(t + lag)``.
        NOTE: with ``lag=0`` the slice ``values[:-0]`` is empty while
        ``values[0:]`` is the full array, so the two arrays passed to
        ``scatter`` differ in length.
    ax : Axes, optional
        Axes to draw on; defaults to the current pyplot axes.
    **kwds
        Forwarded to ``ax.scatter``. ``c`` defaults to the
        ``patch.facecolor`` rcParam.

    Returns
    -------
    Axes
        The axes that were drawn on.
    """
    # workaround because `c='b'` is hardcoded in matplotlib's scatter method
    import matplotlib.pyplot as plt

    default_colour = plt.rcParams["patch.facecolor"]
    if "c" not in kwds:
        kwds["c"] = default_colour

    values = series.values
    current, lagged = values[:-lag], values[lag:]

    if ax is None:
        ax = plt.gca()
    ax.set_xlabel("y(t)")
    ax.set_ylabel(f"y(t + {lag})")
    ax.scatter(current, lagged, **kwds)
    return ax
 | 
						|
 | 
						|
 | 
						|
def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwds) -> Axes:
    """
    Plot the sample autocorrelation of ``series`` for lags ``1 .. len(series)``.

    Horizontal reference lines are drawn at ``0`` and at ``+/- z / sqrt(n)``
    for the two constants ``z95`` (solid) and ``z99`` (dashed).

    Parameters
    ----------
    series : Series
        Data whose autocorrelation is plotted.
    ax : Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used with
        x-limits ``[1, n]`` and y-limits ``[-1, 1]``.
    **kwds
        Forwarded to ``ax.plot``. A legend is added only when ``label`` is
        among them.

    Returns
    -------
    Axes
        The axes that were drawn on.
    """
    import matplotlib.pyplot as plt

    n = len(series)
    data = np.asarray(series)
    if ax is None:
        ax = plt.gca()
        ax.set_xlim(1, n)
        ax.set_ylim(-1.0, 1.0)

    mean = np.mean(data)
    centered = data - mean
    # Lag-0 autocovariance, used to normalise every other lag.
    c0 = np.sum(centered**2) / n

    def r(h):
        # Autocorrelation at lag h: overlap of the series with itself
        # shifted by h, normalised by n and by c0.
        return (centered[: n - h] * centered[h:]).sum() / n / c0

    x = np.arange(1, n + 1)
    y = [r(lag) for lag in x]

    z95 = 1.959963984540054
    z99 = 2.5758293035489004
    root_n = np.sqrt(n)

    # Reference lines, drawn from the top band down to the bottom band.
    reference_lines = (
        (z99 / root_n, {"linestyle": "--", "color": "grey"}),
        (z95 / root_n, {"color": "grey"}),
        (0.0, {"color": "black"}),
        (-z95 / root_n, {"color": "grey"}),
        (-z99 / root_n, {"linestyle": "--", "color": "grey"}),
    )
    for level, line_style in reference_lines:
        ax.axhline(y=level, **line_style)

    ax.set_xlabel("Lag")
    ax.set_ylabel("Autocorrelation")
    ax.plot(x, y, **kwds)
    if "label" in kwds:
        ax.legend()
    ax.grid()
    return ax
 |