# Authors: The MNE-Python contributors.
# License: BSD-3-Clause
# Copyright the MNE-Python contributors.

import logging

import numpy as np
from scipy.sparse import issparse

from ..fixes import _get_check_scoring
from ..parallel import parallel_func
from ..utils import ProgressBar, _parse_verbose, array_split_idx, fill_doc, verbose
from .base import BaseEstimator, _check_estimator
from .mixin import TransformerMixin


@fill_doc
class SlidingEstimator(BaseEstimator, TransformerMixin):
    """Search Light.

    Fit, predict and score a series of models to each subset of the dataset
    along the last dimension. Each entry in the last dimension is referred
    to as a task.

    Parameters
    ----------
    %(base_estimator)s
    %(scoring)s
    %(n_jobs)s
    %(position)s
    %(allow_2d)s
    %(verbose)s

    Attributes
    ----------
    estimators_ : array-like, shape (n_tasks,)
        List of fitted scikit-learn estimators (one per task).
    """

    @verbose
    def __init__(
        self,
        base_estimator,
        scoring=None,
        n_jobs=None,
        *,
        position=0,
        allow_2d=False,
        verbose=None,
    ):
        _check_estimator(base_estimator)
        self.base_estimator = base_estimator
        self.n_jobs = n_jobs
        self.scoring = scoring
        self.position = position
        self.allow_2d = allow_2d
        self.verbose = verbose

    def _more_tags(self):
        return {"no_validation": True, "requires_fit": False}

    @property
    def _estimator_type(self):
        return getattr(self.base_estimator, "_estimator_type", None)

    def __repr__(self):  # noqa: D105
        repr_str = "<" + super().__repr__()
        if hasattr(self, "estimators_"):
            repr_str = repr_str[:-1]
            repr_str += f", fitted with {len(self.estimators_)} estimators"
        return repr_str + ">"

    def fit(self, X, y, **fit_params):
        """Fit a series of independent estimators to the dataset.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The training input samples. For each data slice, a clone estimator
            is fitted independently. The feature dimension can be
            multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.
        **fit_params : dict of string -> object
            Parameters to pass to the fit method of the estimator.

        Returns
        -------
        self : object
            Return self.
        """
        X = self._check_Xy(X, y)
        parallel, p_func, n_jobs = parallel_func(
            _sl_fit, self.n_jobs, max_jobs=X.shape[-1], verbose=False
        )
        self.estimators_ = list()
        self.fit_params_ = fit_params

        # For fitting, the parallelization is across estimators.
        context = _create_progressbar_context(self, X, "Fitting")
        with context as pb:
            estimators = parallel(
                p_func(self.base_estimator, split, y, pb.subset(pb_idx), **fit_params)
                for pb_idx, split in array_split_idx(X, n_jobs, axis=-1)
            )

        # Each parallel job can have a different number of training estimators
        # We can't directly concatenate them because of sklearn's Bagging API
        # (see scikit-learn #9720)
        self.estimators_ = np.empty(X.shape[-1], dtype=object)
        idx = 0
        for job_estimators in estimators:
            for est in job_estimators:
                self.estimators_[idx] = est
                idx += 1
        return self

    def fit_transform(self, X, y, **fit_params):
        """Fit and transform a series of independent estimators to the dataset.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The training input samples. For each task, a clone estimator
            is fitted independently. The feature dimension can be
            multidimensional, e.g.::

                X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.
        **fit_params : dict of string -> object
            Parameters to pass to the fit method of the estimator.

        Returns
        -------
        y_pred : array, shape (n_samples, n_tasks) | (n_samples, n_tasks, n_targets)
            The predicted values for each estimator.
        """  # noqa: E501
        return self.fit(X, y, **fit_params).transform(X)

    def _transform(self, X, method):
        """Aux. function to make parallel predictions/transformation."""
        X = self._check_Xy(X)
        method = _check_method(self.base_estimator, method)
        if X.shape[-1] != len(self.estimators_):
            raise ValueError("The number of estimators does not match X.shape[-1]")
        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        parallel, p_func, n_jobs = parallel_func(
            _sl_transform, self.n_jobs, max_jobs=X.shape[-1], verbose=False
        )

        X_splits = np.array_split(X, n_jobs, axis=-1)
        idx, est_splits = zip(*array_split_idx(self.estimators_, n_jobs))

        context = _create_progressbar_context(self, X, "Transforming")
        with context as pb:
            y_pred = parallel(
                p_func(est, x, method, pb.subset(pb_idx))
                for pb_idx, est, x in zip(idx, est_splits, X_splits)
            )

        y_pred = np.concatenate(y_pred, axis=1)
        return y_pred

    def transform(self, X):
        """Transform each data slice/task with a series of independent estimators.

        The number of tasks in X should match the number of tasks/estimators
        given at fit time.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The input samples. For each data slice/task, the corresponding
            estimator makes a transformation of the data, e.g.
            ``[estimators[ii].transform(X[..., ii]) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_tasks).

        Returns
        -------
        Xt : array, shape (n_samples, n_estimators)
            The transformed values generated by each estimator.
        """  # noqa: E501
        return self._transform(X, "transform").astype(X.dtype)

    def predict(self, X):
        """Predict each data slice/task with a series of independent estimators.

        The number of tasks in X should match the number of tasks/estimators
        given at fit time.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The input samples. For each data slice, the corresponding estimator
            makes the sample predictions, e.g.:
            ``[estimators[ii].predict(X[..., ii]) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_tasks).

        Returns
        -------
        y_pred : array, shape (n_samples, n_estimators) | (n_samples, n_tasks, n_targets)
            Predicted values for each estimator/data slice.
        """  # noqa: E501
        return self._transform(X, "predict")

    def predict_proba(self, X):
        """Predict each data slice with a series of independent estimators.

        The number of tasks in X should match the number of tasks/estimators
        given at fit time.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The input samples. For each data slice, the corresponding estimator
            makes the sample probabilistic predictions, e.g.:
            ``[estimators[ii].predict_proba(X[..., ii]) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_tasks).

        Returns
        -------
        y_pred : array, shape (n_samples, n_tasks, n_classes)
            Predicted probabilities for each estimator/data slice/task.
        """  # noqa: E501
        return self._transform(X, "predict_proba")

    def decision_function(self, X):
        """Estimate distances of each data slice to the hyperplanes.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The input samples. For each data slice, the corresponding estimator
            outputs the distance to the hyperplane, e.g.:
            ``[estimators[ii].decision_function(X[..., ii]) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_estimators).

        Returns
        -------
        y_pred : array, shape (n_samples, n_estimators, n_classes * (n_classes-1) // 2)
            Predicted distances for each estimator/data slice.

        Notes
        -----
        This requires base_estimator to have a ``decision_function`` method.
        """  # noqa: E501
        return self._transform(X, "decision_function")

    def _check_Xy(self, X, y=None):
        """Aux. function to check input data."""
        # Once we require sklearn 1.1+ we should do something like:
        # from sklearn.utils import check_array
        # X = check_array(X, ensure_2d=False, input_name="X")
        # y = check_array(y, dtype=None, ensure_2d=False, input_name="y")
        if issparse(X):
            raise TypeError("X should be a dense array, got sparse instead.")
        X = np.asarray(X)
        if y is not None:
            y = np.asarray(y)
            if len(X) != len(y) or len(y) < 1:
                raise ValueError("X and y must have the same length.")
        if X.ndim < 3:
            err = None
            if not self.allow_2d:
                err = 3
            elif X.ndim < 2:
                err = 2
            if err:
                raise ValueError(f"X must have at least {err} dimensions.")
            X = X[..., np.newaxis]
        return X

    def score(self, X, y):
        """Score each estimator on each task.

        The number of tasks in X should match the number of tasks/estimators
        given at fit time, i.e. we need
        ``X.shape[-1] == len(self.estimators_)``.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_tasks)
            The input samples. For each data slice, the corresponding estimator
            scores the prediction, e.g.:
            ``[estimators[ii].score(X[..., ii], y) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_tasks).
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.

        Returns
        -------
        score : array, shape (n_samples, n_estimators)
            Score for each estimator/task.
        """  # noqa: E501
        check_scoring = _get_check_scoring()

        X = self._check_Xy(X, y)
        if X.shape[-1] != len(self.estimators_):
            raise ValueError("The number of estimators does not match X.shape[-1]")

        scoring = check_scoring(self.base_estimator, self.scoring)
        y = _fix_auc(scoring, y)

        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        parallel, p_func, n_jobs = parallel_func(
            _sl_score, self.n_jobs, max_jobs=X.shape[-1], verbose=False
        )
        X_splits = np.array_split(X, n_jobs, axis=-1)
        est_splits = np.array_split(self.estimators_, n_jobs)
        score = parallel(
            p_func(est, scoring, x, y) for (est, x) in zip(est_splits, X_splits)
        )

        score = np.concatenate(score, axis=0)
        return score

    @property
    def classes_(self):
        if not hasattr(self.estimators_[0], "classes_"):
            raise AttributeError(
                "classes_ attribute available only if base_estimator has it, and "
                f"estimator {self.estimators_[0]} does not"
            )
        return self.estimators_[0].classes_


@fill_doc
def _sl_fit(estimator, X, y, pb, **fit_params):
    """Aux. function to fit SlidingEstimator in parallel.

    Fit a clone estimator to each slice of data.

    Parameters
    ----------
    %(base_estimator)s
    X : array, shape (n_samples, nd_features, n_estimators)
        The target data. The feature dimension can be multidimensional e.g.
        X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
    y : array, shape (n_sample, )
        The target values.
    pb : instance of ProgressBar
        The progress bar to update.
    fit_params : dict | None
        Parameters to pass to the fit method of the estimator.

    Returns
    -------
    estimators_ : list of estimators
        The fitted estimators.
    """
    from sklearn.base import clone

    estimators_ = list()
    for ii in range(X.shape[-1]):
        est = clone(estimator)
        est.fit(X[..., ii], y, **fit_params)
        estimators_.append(est)

        pb.update(ii + 1)
    return estimators_


def _sl_transform(estimators, X, method, pb):
    """Aux. function to transform SlidingEstimator in parallel.

    Applies transform/predict/decision_function etc for each slice of data.

    Parameters
    ----------
    estimators : list of estimators
        The fitted estimators.
    X : array, shape (n_samples, nd_features, n_estimators)
        The target data. The feature dimension can be multidimensional e.g.
        X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
    method : str
        The estimator method to use (e.g. 'predict', 'transform').
    pb : instance of ProgressBar
        The progress bar to update.

    Returns
    -------
    y_pred : array, shape (n_samples, n_estimators, n_classes * (n_classes-1) // 2)
        The transformations for each slice of data.
    """  # noqa: E501
    for ii, est in enumerate(estimators):
        transform = getattr(est, method)
        _y_pred = transform(X[..., ii])
        # Initialize array of predictions on the first transform iteration
        if ii == 0:
            y_pred = _sl_init_pred(_y_pred, X)
        y_pred[:, ii, ...] = _y_pred

        pb.update(ii + 1)
    return y_pred


def _sl_init_pred(y_pred, X):
    """Aux. function to SlidingEstimator to initialize y_pred."""
    n_sample, n_tasks = X.shape[0], X.shape[-1]
    y_pred = np.zeros((n_sample, n_tasks) + y_pred.shape[1:], y_pred.dtype)
    return y_pred


def _sl_score(estimators, scoring, X, y):
    """Aux. function to score SlidingEstimator in parallel.

    Predict and score each slice of data.

    Parameters
    ----------
    estimators : list, shape (n_tasks,)
        The fitted estimators.
    X : array, shape (n_samples, nd_features, n_tasks)
        The target data. The feature dimension can be multidimensional e.g.
        X.shape = (n_samples, n_features_1, n_features_2, n_tasks)
    scoring : callable, str or None
        If scoring is None (default), the predictions are internally
        generated by estimator.score(). Else, we must first get the
        predictions to pass them to ad-hoc scorer.
    y : array, shape (n_samples,) | (n_samples, n_targets)
        The target values.

    Returns
    -------
    score : array, shape (n_tasks,)
        The score for each task / slice of data.
    """
    n_tasks = X.shape[-1]
    score = np.zeros(n_tasks)
    for ii, est in enumerate(estimators):
        score[ii] = scoring(est, X[..., ii], y)
    return score


def _check_method(estimator, method):
    """Check that an estimator has the method attribute.

    If method == 'transform'  and estimator does not have 'transform', use
    'predict' instead.
    """
    if method == "transform" and not hasattr(estimator, "transform"):
        method = "predict"
    if not hasattr(estimator, method):
        ValueError(f"base_estimator does not have `{method}` method.")
    return method


@fill_doc
class GeneralizingEstimator(SlidingEstimator):
    """Generalization Light.

    Fit a search-light along the last dimension and use them to apply a
    systematic cross-tasks generalization.

    Parameters
    ----------
    %(base_estimator)s
    %(scoring)s
    %(n_jobs)s
    %(position)s
    %(allow_2d)s
    %(verbose)s
    """

    def __repr__(self):  # noqa: D105
        repr_str = super().__repr__()
        if hasattr(self, "estimators_"):
            repr_str = repr_str[:-1]
            repr_str += f", fitted with {len(self.estimators_)} estimators>"
        return repr_str

    def _transform(self, X, method):
        """Aux. function to make parallel predictions/transformation."""
        X = self._check_Xy(X)
        method = _check_method(self.base_estimator, method)

        parallel, p_func, n_jobs = parallel_func(
            _gl_transform, self.n_jobs, max_jobs=X.shape[-1], verbose=False
        )

        context = _create_progressbar_context(self, X, "Transforming")
        with context as pb:
            y_pred = parallel(
                p_func(self.estimators_, x_split, method, pb.subset(pb_idx))
                for pb_idx, x_split in array_split_idx(
                    X, n_jobs, axis=-1, n_per_split=len(self.estimators_)
                )
            )

        y_pred = np.concatenate(y_pred, axis=2)
        return y_pred

    def transform(self, X):
        """Transform each data slice with all possible estimators.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The input samples. For estimator the corresponding data slice is
            used to make a transformation. The feature dimension can be
            multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_estimators).

        Returns
        -------
        Xt : array, shape (n_samples, n_estimators, n_slices)
            The transformed values generated by each estimator.
        """
        return self._transform(X, "transform")

    def predict(self, X):
        """Predict each data slice with all possible estimators.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The training input samples. For each data slice, a fitted estimator
            predicts each slice of the data independently. The feature
            dimension can be multidimensional e.g.
            X.shape = (n_samples, n_features_1, n_features_2, n_estimators).

        Returns
        -------
        y_pred : array, shape (n_samples, n_estimators, n_slices) | (n_samples, n_estimators, n_slices, n_targets)
            The predicted values for each estimator.
        """  # noqa: E501
        return self._transform(X, "predict")

    def predict_proba(self, X):
        """Estimate probabilistic estimates of each data slice with all possible estimators.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The training input samples. For each data slice, a fitted estimator
            predicts a slice of the data. The feature dimension can be
            multidimensional e.g.
            ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.

        Returns
        -------
        y_pred : array, shape (n_samples, n_estimators, n_slices, n_classes)
            The predicted values for each estimator.

        Notes
        -----
        This requires ``base_estimator`` to have a ``predict_proba`` method.
        """  # noqa: E501
        return self._transform(X, "predict_proba")

    def decision_function(self, X):
        """Estimate distances of each data slice to all hyperplanes.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The training input samples. Each estimator outputs the distance to
            its hyperplane, e.g.:
            ``[estimators[ii].decision_function(X[..., ii]) for ii in range(n_estimators)]``.
            The feature dimension can be multidimensional e.g.
            ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.

        Returns
        -------
        y_pred : array, shape (n_samples, n_estimators, n_slices, n_classes * (n_classes-1) // 2)
            The predicted values for each estimator.

        Notes
        -----
        This requires ``base_estimator`` to have a ``decision_function``
        method.
        """  # noqa: E501
        return self._transform(X, "decision_function")

    def score(self, X, y):
        """Score each of the estimators on the tested dimensions.

        Parameters
        ----------
        X : array, shape (n_samples, nd_features, n_slices)
            The input samples. For each data slice, the corresponding estimator
            scores the prediction, e.g.:
            ``[estimators[ii].score(X[..., ii], y) for ii in range(n_slices)]``.
            The feature dimension can be multidimensional e.g.
            ``X.shape = (n_samples, n_features_1, n_features_2, n_estimators)``.
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.

        Returns
        -------
        score : array, shape (n_samples, n_estimators, n_slices)
            Score for each estimator / data slice couple.
        """  # noqa: E501
        check_scoring = _get_check_scoring()
        X = self._check_Xy(X, y)
        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        parallel, p_func, n_jobs = parallel_func(
            _gl_score, self.n_jobs, max_jobs=X.shape[-1], verbose=False
        )
        scoring = check_scoring(self.base_estimator, self.scoring)
        y = _fix_auc(scoring, y)

        context = _create_progressbar_context(self, X, "Scoring")
        with context as pb:
            score = parallel(
                p_func(self.estimators_, scoring, x, y, pb.subset(pb_idx))
                for pb_idx, x in array_split_idx(
                    X, n_jobs, axis=-1, n_per_split=len(self.estimators_)
                )
            )

        score = np.concatenate(score, axis=1)
        return score


def _gl_transform(estimators, X, method, pb):
    """Transform the dataset.

    This will apply each estimator to all slices of the data.

    Parameters
    ----------
    X : array, shape (n_samples, nd_features, n_slices)
        The training input samples. For each data slice, a clone estimator
        is fitted independently. The feature dimension can be multidimensional
        e.g. X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
    method : str
        The method to call for each estimator.
    pb : instance of ProgressBar
        The progress bar to update.

    Returns
    -------
    Xt : array, shape (n_samples, n_slices)
        The transformed values generated by each estimator.
    """
    n_sample, n_iter = X.shape[0], X.shape[-1]
    for ii, est in enumerate(estimators):
        # stack generalized data for faster prediction
        X_stack = X.transpose(np.r_[0, X.ndim - 1, range(1, X.ndim - 1)])
        X_stack = X_stack.reshape(np.r_[n_sample * n_iter, X_stack.shape[2:]])
        transform = getattr(est, method)
        _y_pred = transform(X_stack)
        # unstack generalizations
        if _y_pred.ndim == 2:
            _y_pred = np.reshape(_y_pred, [n_sample, n_iter, _y_pred.shape[1]])
        else:
            shape = np.r_[n_sample, n_iter, _y_pred.shape[1:]].astype(int)
            _y_pred = np.reshape(_y_pred, shape)
        # Initialize array of predictions on the first transform iteration
        if ii == 0:
            y_pred = _gl_init_pred(_y_pred, X, len(estimators))
        y_pred[:, ii, ...] = _y_pred

        pb.update((ii + 1) * n_iter)
    return y_pred


def _gl_init_pred(y_pred, X, n_train):
    """Aux. function to GeneralizingEstimator to initialize y_pred."""
    n_sample, n_iter = X.shape[0], X.shape[-1]
    if y_pred.ndim == 3:
        y_pred = np.zeros((n_sample, n_train, n_iter, y_pred.shape[-1]), y_pred.dtype)
    else:
        y_pred = np.zeros((n_sample, n_train, n_iter), y_pred.dtype)
    return y_pred


def _gl_score(estimators, scoring, X, y, pb):
    """Score GeneralizingEstimator in parallel.

    Predict and score each slice of data.

    Parameters
    ----------
    estimators : list of estimators
        The fitted estimators.
    scoring : callable, string or None
        If scoring is None (default), the predictions are internally
        generated by estimator.score(). Else, we must first get the
        predictions to pass them to ad-hoc scorer.
    X : array, shape (n_samples, nd_features, n_slices)
        The target data. The feature dimension can be multidimensional e.g.
        X.shape = (n_samples, n_features_1, n_features_2, n_estimators)
    y : array, shape (n_samples,) | (n_samples, n_targets)
        The target values.
    pb : instance of ProgressBar
        The progress bar to update.

    Returns
    -------
    score : array, shape (n_estimators, n_slices)
        The score for each slice of data.
    """
    # FIXME: The level parallelization may be a bit high, and might be memory
    # consuming. Perhaps need to lower it down to the loop across X slices.
    score_shape = [len(estimators), X.shape[-1]]
    for jj in range(X.shape[-1]):
        for ii, est in enumerate(estimators):
            _score = scoring(est, X[..., jj], y)
            # Initialize array of predictions on the first score iteration
            if (ii == 0) and (jj == 0):
                dtype = type(_score)
                score = np.zeros(score_shape, dtype)
            score[ii, jj, ...] = _score

            pb.update(jj * len(estimators) + ii + 1)
    return score


def _fix_auc(scoring, y):
    from sklearn.preprocessing import LabelEncoder

    # This fixes sklearn's inability to compute roc_auc when y not in [0, 1]
    # scikit-learn/scikit-learn#6874
    if scoring is not None:
        score_func = getattr(scoring, "_score_func", None)
        kwargs = getattr(scoring, "_kwargs", {})
        if (
            getattr(score_func, "__name__", "") == "roc_auc_score"
            and kwargs.get("multi_class", "raise") == "raise"
        ):
            if np.ndim(y) != 1 or len(set(y)) != 2:
                raise ValueError(
                    "roc_auc scoring can only be computed for two-class problems."
                )
            y = LabelEncoder().fit_transform(y)
    return y


def _create_progressbar_context(inst, X, message):
    """Create a progress bar taking into account ``inst.verbose``."""
    multiply = len(inst.estimators_) if isinstance(inst, GeneralizingEstimator) else 1
    n_steps = X.shape[-1] * max(1, multiply)
    mesg = f"{message} {inst.__class__.__name__}"

    which_tqdm = "off" if not _check_verbose(inst.verbose) else None
    context = ProgressBar(
        n_steps, mesg=mesg, position=inst.position, which_tqdm=which_tqdm
    )

    return context


def _check_verbose(verbose):
    """Check if verbose is above or equal 'INFO' level."""
    logging_level = _parse_verbose(verbose)
    bool_verbose = logging_level <= logging.INFO
    return bool_verbose