553 lines
18 KiB
Python
553 lines
18 KiB
Python
"""Some utility functions for rank estimation."""
|
|
|
|
# Authors: The MNE-Python contributors.
|
|
# License: BSD-3-Clause
|
|
# Copyright the MNE-Python contributors.
|
|
|
|
import numpy as np
|
|
from scipy import linalg
|
|
|
|
from ._fiff.meas_info import Info, _simplify_info
|
|
from ._fiff.pick import _picks_by_type, _picks_to_idx, pick_channels_cov, pick_info
|
|
from ._fiff.proj import make_projector
|
|
from .defaults import _handle_default
|
|
from .utils import (
|
|
_apply_scaling_cov,
|
|
_check_on_missing,
|
|
_check_rank,
|
|
_compute_row_norms,
|
|
_on_missing,
|
|
_pl,
|
|
_scaled_array,
|
|
_undo_scaling_cov,
|
|
_validate_type,
|
|
fill_doc,
|
|
logger,
|
|
verbose,
|
|
warn,
|
|
)
|
|
|
|
|
|
@verbose
def estimate_rank(
    data,
    tol="auto",
    return_singular=False,
    norm=True,
    tol_kind="absolute",
    verbose=None,
):
    """Estimate the rank of data.

    The rows of the data (typically channels or vertices) are normalized
    so that the non-zero singular values end up close to one before the
    tolerance threshold is applied.

    Parameters
    ----------
    data : array
        Data to estimate the rank of (should be 2-dimensional).
    %(tol_rank)s
    return_singular : bool
        If True, also return the singular values that were used
        to determine the rank.
    norm : bool
        If True, data will be scaled by their estimated row-wise norm.
        Else data are assumed to be scaled. Defaults to True.
    %(tol_kind_rank)s

    Returns
    -------
    rank : int
        Estimated rank of the data.
    s : array
        If return_singular is True, the singular values that were
        thresholded to determine the rank are also returned.
    """
    if norm:
        # Normalize a copy so the caller's array is left untouched.
        data = data.copy()
        data /= _compute_row_norms(data)[:, np.newaxis]
    singular_values = linalg.svdvals(data)
    rank = _estimate_rank_from_s(singular_values, tol, tol_kind)
    if return_singular is True:
        return rank, singular_values
    return rank
|
|
|
|
|
|
def _estimate_rank_from_s(s, tol="auto", tol_kind="absolute"):
|
|
"""Estimate the rank of a matrix from its singular values.
|
|
|
|
Parameters
|
|
----------
|
|
s : ndarray, shape (..., ndim)
|
|
The singular values of the matrix.
|
|
tol : float | ``'auto'``
|
|
Tolerance for singular values to consider non-zero in calculating the
|
|
rank. Can be 'auto' to use the same thresholding as
|
|
``scipy.linalg.orth`` (assuming np.float64 datatype) adjusted
|
|
by a factor of 2.
|
|
tol_kind : str
|
|
Can be ``"absolute"`` or ``"relative"``.
|
|
|
|
Returns
|
|
-------
|
|
rank : ndarray, shape (...)
|
|
The estimated rank.
|
|
"""
|
|
s = np.array(s, float)
|
|
max_s = np.amax(s, axis=-1)
|
|
if isinstance(tol, str):
|
|
if tol not in ("auto", "float32"):
|
|
raise ValueError(f'tol must be "auto" or float, got {repr(tol)}')
|
|
# XXX this should be float32 probably due to how we save and
|
|
# load data, but it breaks test_make_inverse_operator (!)
|
|
# The factor of 2 gets test_compute_covariance_auto_reg[None]
|
|
# to pass without breaking minimum norm tests. :(
|
|
# Passing 'float32' is a hack workaround for test_maxfilter_get_rank :(
|
|
if tol == "float32":
|
|
eps = np.finfo(np.float32).eps
|
|
else:
|
|
eps = np.finfo(np.float64).eps
|
|
tol = s.shape[-1] * max_s * eps
|
|
if s.ndim == 1: # typical
|
|
logger.info(
|
|
" Using tolerance %0.2g (%0.2g eps * %d dim * %0.2g"
|
|
" max singular value)" % (tol, eps, len(s), max_s)
|
|
)
|
|
elif not (isinstance(tol, np.ndarray) and tol.dtype.kind == "f"):
|
|
tol = float(tol)
|
|
if tol_kind == "relative":
|
|
tol = tol * max_s
|
|
|
|
rank = np.sum(s > tol, axis=-1)
|
|
return rank
|
|
|
|
|
|
def _estimate_rank_raw(
    raw, picks=None, tol=1e-4, scalings="norm", with_ref_meg=False, tol_kind="absolute"
):
    """Aid the transition away from raw.estimate_rank."""
    # Conveniency wrapper exposing the expert "tol" and scalings options.
    if picks is None:
        picks = _picks_to_idx(raw.info, picks, with_ref_meg=with_ref_meg)
    data = raw[picks][0]
    info = pick_info(raw.info, picks)
    return _estimate_rank_meeg_signals(data, info, scalings, tol, False, tol_kind)
|
|
|
|
|
|
@fill_doc
def _estimate_rank_meeg_signals(
    data,
    info,
    scalings,
    tol="auto",
    return_singular=False,
    tol_kind="absolute",
    log_ch_type=None,
):
    """Estimate rank for M/EEG data.

    Parameters
    ----------
    data : np.ndarray of float, shape(n_channels, n_samples)
        The M/EEG signals.
    %(info_not_none)s
    scalings : dict | ``'norm'`` | np.ndarray | None
        The rescaling method to be applied. If dict, it will override the
        following default dict:

            dict(mag=1e15, grad=1e13, eeg=1e6)

        If ``'norm'`` data will be scaled by channel-wise norms. If array,
        pre-specified norms will be used. If None, no scaling will be applied.
    tol : float | str
        Tolerance. See ``estimate_rank``.
    return_singular : bool
        If True, also return the singular values that were used
        to determine the rank.
    tol_kind : str
        Tolerance kind. See ``estimate_rank``.

    Returns
    -------
    rank : int
        Estimated rank of the data.
    s : array
        If return_singular is True, the singular values that were
        thresholded to determine the rank are also returned.
    """
    picks_list = _picks_by_type(info)
    if data.shape[1] < data.shape[0]:
        # Fix: this advisory previously instantiated a ValueError without
        # raising it, making the check a silent no-op. The message is
        # informational rather than fatal, so emit a warning instead.
        warn(
            "You've got fewer samples than channels, your "
            "rank estimate might be inaccurate."
        )
    # Scale channel types to comparable magnitudes while estimating, then
    # restore (handled by the context manager).
    with _scaled_array(data, picks_list, scalings):
        out = estimate_rank(
            data,
            tol=tol,
            norm=False,
            return_singular=return_singular,
            tol_kind=tol_kind,
        )
    rank = out[0] if isinstance(out, tuple) else out
    if log_ch_type is None:
        ch_type = " + ".join(list(zip(*picks_list))[0])
    else:
        ch_type = log_ch_type
    logger.info("    Estimated rank (%s): %d" % (ch_type, rank))
    return out
|
|
|
|
|
|
@verbose
def _estimate_rank_meeg_cov(
    data,
    info,
    scalings,
    tol="auto",
    return_singular=False,
    *,
    log_ch_type=None,
    verbose=None,
):
    """Estimate rank of M/EEG covariance data, given the covariance.

    Parameters
    ----------
    data : np.ndarray of float, shape (n_channels, n_channels)
        The M/EEG covariance.
    %(info_not_none)s
    scalings : dict | 'norm' | np.ndarray | None
        The rescaling method to be applied. If dict, it will override the
        following default dict:

            dict(mag=1e12, grad=1e11, eeg=1e5)

        If 'norm' data will be scaled by channel-wise norms. If array,
        pre-specified norms will be used. If None, no scaling will be applied.
    tol : float | str
        Tolerance. See ``estimate_rank``.
    return_singular : bool
        If True, also return the singular values that were used
        to determine the rank.

    Returns
    -------
    rank : int
        Estimated rank of the data.
    s : array
        If return_singular is True, the singular values that were
        thresholded to determine the rank are also returned.
    """
    picks_list = _picks_by_type(info, exclude=[])
    scalings = _handle_default("scalings_cov_rank", scalings)
    # Scale in place; undone below after the estimate.
    _apply_scaling_cov(data, picks_list, scalings)
    if data.shape[1] < data.shape[0]:
        # Fix: this advisory previously instantiated a ValueError without
        # raising it, making the check a silent no-op. The message is
        # informational rather than fatal, so emit a warning instead.
        # (NOTE: a covariance is square, so this only fires on bad input.)
        warn(
            "You've got fewer samples than channels, your "
            "rank estimate might be inaccurate."
        )
    out = estimate_rank(data, tol=tol, norm=False, return_singular=return_singular)
    rank = out[0] if isinstance(out, tuple) else out
    if log_ch_type is None:
        ch_type_ = " + ".join(list(zip(*picks_list))[0])
    else:
        ch_type_ = log_ch_type
    logger.info(f"    Estimated rank ({ch_type_}): {rank}")
    _undo_scaling_cov(data, picks_list, scalings)
    return out
|
|
|
|
|
|
@verbose
def _get_rank_sss(
    inst, msg="You should use data-based rank estimate instead", verbose=None
):
    """Look up rank from SSS data.

    .. note::
        Throws an error if SSS has not been applied.

    Parameters
    ----------
    inst : instance of Raw, Epochs or Evoked, or Info
        Any MNE object with an .info attribute

    Returns
    -------
    rank : int
        The numerical rank as predicted by the number of SSS
        components.
    """
    # XXX this is too basic for movement compensated data
    # https://github.com/mne-tools/mne-python/issues/4676
    info = inst if isinstance(inst, Info) else inst.info
    del inst

    proc_info = info.get("proc_history", [])
    if len(proc_info) > 1:
        logger.info("Found multiple SSS records. Using the first.")
    # SSS info is usable only if the first record carries a Maxfilter
    # expansion order ("in_order").
    has_sss = (
        len(proc_info) > 0
        and "max_info" in proc_info[0]
        and "in_order" in proc_info[0]["max_info"]["sss_info"]
    )
    if not has_sss:
        raise ValueError(
            f'Could not find Maxfilter information in info["proc_history"]. {msg}'
        )
    sss_info = proc_info[0]["max_info"]["sss_info"]
    in_order = sss_info["in_order"]
    # Number of internal multipole moments for expansion order L.
    nfree = (in_order + 1) ** 2 - 1
    # Subtract the internal components Maxfilter dropped (flagged 0).
    comps = sss_info["components"][:nfree]
    nfree -= len(comps) - comps.sum()
    return nfree
|
|
|
|
|
|
def _info_rank(info, ch_type, picks, rank):
|
|
if ch_type in ["meg", "mag", "grad"] and rank != "full":
|
|
try:
|
|
return _get_rank_sss(info)
|
|
except ValueError:
|
|
pass
|
|
return len(picks)
|
|
|
|
|
|
def _compute_rank_int(inst, *args, **kwargs):
    """Wrap compute_rank but yield an int."""
    # XXX eventually we should unify how channel types are handled
    # so that we don't need to do this, or we do it everywhere.
    # Using pca=True in compute_whitener might help.
    ranks = compute_rank(inst, *args, **kwargs)
    return sum(ranks.values())
|
|
|
|
|
|
@verbose
def compute_rank(
    inst,
    rank=None,
    scalings=None,
    info=None,
    tol="auto",
    proj=True,
    tol_kind="absolute",
    on_rank_mismatch="ignore",
    verbose=None,
):
    """Compute the rank of data or noise covariance.

    Rows of the data (typically channels or vertices) are normalized so
    that non-zero singular values should be close to one. Only
    :term:`data channels` are considered.

    Parameters
    ----------
    inst : instance of Raw, Epochs, or Covariance
        Raw measurements to compute the rank from or the covariance.
    %(rank_none)s
    scalings : dict | None (default None)
        Defaults to ``dict(mag=1e15, grad=1e13, eeg=1e6)``.
        These defaults will scale different channel types
        to comparable values.
    %(info)s Only necessary if ``inst`` is a :class:`mne.Covariance`
        object (since this does not provide ``inst.info``).
    %(tol_rank)s
    proj : bool
        If True, all projs in ``inst`` and ``info`` will be applied or
        considered when ``rank=None`` or ``rank='info'``.
    %(tol_kind_rank)s
    %(on_rank_mismatch)s
    %(verbose)s

    Returns
    -------
    rank : dict
        Estimated rank of the data for each channel type.
        To get the total rank, you can use ``sum(rank.values())``.

    Notes
    -----
    .. versionadded:: 0.18
    """
    # Thin public wrapper: the actual work happens in _compute_rank.
    return _compute_rank(
        inst,
        rank,
        scalings,
        info,
        tol=tol,
        proj=proj,
        tol_kind=tol_kind,
        on_rank_mismatch=on_rank_mismatch,
    )
|
|
|
|
|
|
@verbose
def _compute_rank(
    inst,
    rank=None,
    scalings=None,
    info=None,
    *,
    tol="auto",
    proj=True,
    tol_kind="absolute",
    on_rank_mismatch="ignore",
    log_ch_type=None,
    verbose=None,
):
    # Private workhorse behind the public compute_rank(): returns a dict
    # mapping channel type -> estimated (or info-derived) integer rank.
    # Lazy imports to avoid circular imports at module load time.
    from .cov import Covariance
    from .epochs import BaseEpochs
    from .io import BaseRaw

    rank = _check_rank(rank)
    scalings = _handle_default("scalings_cov_rank", scalings)
    _check_on_missing(on_rank_mismatch, "on_rank_mismatch")

    if isinstance(inst, Covariance):
        inst_type = "covariance"
        if info is None:
            raise ValueError("info cannot be None if inst is a Covariance.")
        # Reset bads as it's already taken into account in inst['names']
        info = info.copy()
        info["bads"] = []
        # Restrict the covariance to channels present in info; the
        # covariance's own bads are still excluded.
        inst = pick_channels_cov(
            inst,
            set(inst["names"]) & set(info["ch_names"]),
            exclude=info["bads"] + inst["bads"],
            ordered=False,
        )
        # Re-order / sub-select info so it lines up with the covariance.
        if info["ch_names"] != inst["names"]:
            info = pick_info(
                info, [info["ch_names"].index(name) for name in inst["names"]]
            )
    else:
        # Raw or Epochs: the instance carries its own measurement info.
        info = inst.info
        inst_type = "data"
    logger.info(f"Computing rank from {inst_type} with rank={repr(rank)}")

    _validate_type(rank, (str, dict, None), "rank")
    if isinstance(rank, str):  # string, either 'info' or 'full'
        rank_type = "info"
        info_type = rank
        rank = dict()
    else:  # None or dict
        rank_type = "estimated"
        if rank is None:
            rank = dict()

    simple_info = _simplify_info(info)
    picks_list = _picks_by_type(info, meg_combined=True, ref_meg=False, exclude="bads")
    for ch_type, picks in picks_list:
        est_verbose = None
        if ch_type in rank:
            # raise an error of user-supplied rank exceeds number of channels
            if rank[ch_type] > len(picks):
                raise ValueError(
                    f"rank[{repr(ch_type)}]={rank[ch_type]} exceeds the number"
                    f" of channels ({len(picks)})"
                )
            # special case: if whitening a covariance, check the passed rank
            # against the estimated one
            est_verbose = False
            if not (
                on_rank_mismatch != "ignore"
                and rank_type == "estimated"
                and ch_type == "meg"
                and isinstance(inst, Covariance)
                and not inst["diag"]
            ):
                continue
        ch_names = [info["ch_names"][pick] for pick in picks]
        n_chan = len(ch_names)
        if proj:
            proj_op, n_proj, _ = make_projector(info["projs"], ch_names)
        else:
            proj_op, n_proj = None, 0
        # Channel-type label used in log messages below.
        if log_ch_type is None:
            ch_type_ = ch_type.upper()
        else:
            ch_type_ = log_ch_type
        if rank_type == "info":
            # use info
            this_rank = _info_rank(info, ch_type, picks, info_type)
            if info_type != "full":
                # Each applied projector removes one degree of freedom.
                this_rank -= n_proj
                logger.info(
                    f"    {ch_type_}: rank {this_rank} after "
                    f"{n_proj} projector{_pl(n_proj)} applied to "
                    f"{n_chan} channel{_pl(n_chan)}"
                )
            else:
                logger.info(f"    {ch_type_}: rank {this_rank} from info")
        else:
            # Use empirical estimation
            assert rank_type == "estimated"
            if isinstance(inst, (BaseRaw, BaseEpochs)):
                if isinstance(inst, BaseRaw):
                    data = inst.get_data(picks, reject_by_annotation="omit")
                else:  # isinstance(inst, BaseEpochs):
                    data = np.concatenate(inst.get_data(picks), axis=1)
                if proj:
                    data = np.dot(proj_op, data)
                this_rank = _estimate_rank_meeg_signals(
                    data,
                    pick_info(simple_info, picks),
                    scalings,
                    tol,
                    False,
                    tol_kind,
                    log_ch_type=log_ch_type,
                )
            else:
                assert isinstance(inst, Covariance)
                if inst["diag"]:
                    # Diagonal covariance: rank is the count of positive
                    # variances, minus the projector count.
                    this_rank = (inst["data"][picks] > 0).sum() - n_proj
                else:
                    data = inst["data"][picks][:, picks]
                    if proj:
                        data = np.dot(np.dot(proj_op, data), proj_op.T)

                    this_rank, sing = _estimate_rank_meeg_cov(
                        data,
                        pick_info(simple_info, picks),
                        scalings,
                        tol,
                        return_singular=True,
                        log_ch_type=log_ch_type,
                        verbose=est_verbose,
                    )
                    if ch_type in rank:
                        # Compare the user-supplied rank against the
                        # estimate via the singular-value ratio; warn (or
                        # per on_rank_mismatch) when whitening would blow
                        # up the noise, then keep the user-supplied rank.
                        ratio = sing[this_rank - 1] / sing[rank[ch_type] - 1]
                        if ratio > 100:
                            msg = (
                                f"The passed rank[{repr(ch_type)}]="
                                f"{rank[ch_type]} exceeds the estimated rank "
                                f"of the noise covariance ({this_rank}) "
                                f"leading to a potential increase in "
                                f"noise during whitening by a factor "
                                f"of {np.sqrt(ratio):0.1g}. Ensure that the "
                                f"rank correctly corresponds to that of the "
                                f"given noise covariance matrix."
                            )
                            _on_missing(on_rank_mismatch, msg, "on_rank_mismatch")
                        continue
            # Sanity-check the empirical estimate against the info-derived
            # (theoretical) rank.
            this_info_rank = _info_rank(info, ch_type, picks, "info")
            logger.info(
                f"    {ch_type_}: rank {this_rank} computed from "
                f"{n_chan} data channel{_pl(n_chan)} with "
                f"{n_proj} projector{_pl(n_proj)}"
            )
            if this_rank > this_info_rank:
                warn(
                    "Something went wrong in the data-driven estimation of "
                    "the data rank as it exceeds the theoretical rank from "
                    'the info (%d > %d). Consider setting rank to "auto" or '
                    "setting it explicitly as an integer." % (this_rank, this_info_rank)
                )
        # Do not overwrite a user-supplied rank for this channel type.
        if ch_type not in rank:
            rank[ch_type] = int(this_rank)

    return rank
|