# Source code for cca_zoo.model_selection._validation

import numpy as np
from sklearn import clone
from sklearn.base import is_classifier
from sklearn.metrics import check_scoring
from sklearn.model_selection import cross_validate as cross_validate_, check_cv
from sklearn.model_selection import learning_curve as learning_curve_
from sklearn.model_selection._validation import _permutation_test_score, _shuffle
from sklearn.pipeline import Pipeline
from sklearn.utils import indexable, check_random_state
from sklearn.utils.parallel import Parallel, delayed

from cca_zoo._utils._splitter import SimpleSplitter



# [docs]
def cross_validate(
    estimator,
    views,
    y=None,
    *,
    groups=None,
    scoring=None,
    cv=None,
    n_jobs=None,
    verbose=0,
    fit_params=None,
    pre_dispatch="2*n_jobs",
    return_train_score=False,
    return_estimator=False,
    error_score=np.nan,
):
    """
    Evaluate metric(s) by cross-validation and also record fit/score times.

    Multiview wrapper around :func:`sklearn.model_selection.cross_validate`:
    the views are stacked column-wise into a single array and the estimator is
    cloned and wrapped in a two-step pipeline (``"splitter"`` followed by
    ``"estimator"``) whose first step is a ``SimpleSplitter`` configured with
    the column count of each view.

    Read more in the :ref:`User Guide <multimetric_cross_validation>`.

    Parameters
    ----------
    estimator : object
        Estimator object implementing 'fit'. The object to use to fit the data.
        It is cloned, so the instance passed in is not fitted.
    views : list or tuple of array-like
        List or tuple of 2-D arrays with the same number of rows (samples).
        Each view must expose a ``shape`` attribute.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs), optional, default=None
        The target variable to try to predict in the case of supervised learning.
    groups : array-like of shape (n_samples,), optional, default=None
        Group labels for the samples used while splitting the dataset into train/test set.
        Only used in conjunction with a "Group" :term:`cv` instance (e.g., :class:`GroupKFold`).
    scoring : str, callable, list, tuple, or dict, optional, default=None
        Strategy to evaluate the performance of the cross-validated model on the test set.
        See notes below for more detail.
    cv : int, cross-validation generator or an iterable, optional, default=None
        Determines the cross-validation splitting strategy. See notes below for more detail.
    n_jobs : int, optional, default=None
        Number of jobs to run in parallel.
    verbose : int, default=0
        The verbosity level.
    fit_params : dict, optional, default=None
        Parameters to pass to the fit method of the estimator. Keys without a
        step prefix are routed to the wrapped estimator (they are prefixed
        with ``estimator__``); keys that already start with ``estimator__`` or
        ``splitter__`` are passed through unchanged.
    pre_dispatch : int or str, default='2*n_jobs'
        Controls the number of jobs that get dispatched during parallel execution. See notes below for more detail.
    return_train_score : bool, default=False
        Forwarded to :func:`sklearn.model_selection.cross_validate`; whether
        to include training scores in the result.
    return_estimator : bool, default=False
        Whether to return the estimators fitted on each split. When True, the
        returned estimators are the inner ``"estimator"`` step of each fitted
        pipeline, not the wrapping pipelines.
    error_score : 'raise' or numeric, default=np.nan
        Forwarded to :func:`sklearn.model_selection.cross_validate`; value to
        assign to the score if an error occurs in estimator fitting.

    Returns
    -------
    ret : dict
        The dictionary returned by
        :func:`sklearn.model_selection.cross_validate`. If ``return_estimator``
        is True, its ``"estimator"`` entry is replaced by the list of inner
        estimators extracted from the fitted pipelines.

    Notes
    -----
    For `scoring`:
    If `scoring` represents a single score, one can use:
        - a single string (see :ref:`scoring_parameter`);
        - a callable (see :ref:`scoring`) that returns a single value.
    If `scoring` represents multiple scores, one can use:
        - a list or tuple of unique strings;
        - a callable returning a dictionary where the keys are the metric
          names and the values are the metric scores;
        - a dictionary with metric names as keys and callables as values.
    See :ref:`multimetric_grid_search` for an example.

    For `cv`:
    Possible inputs for cv are:
        - None, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.
    For int/None inputs, if the estimator is a classifier and ``y`` is
    either binary or multiclass, :class:`StratifiedKFold` is used. In all
    other cases, :class:`KFold` is used. These splitters are instantiated
    with `shuffle=False` so the splits will be the same across calls.
    Refer :ref:`User Guide <cross_validation>` for the various
    cross-validation strategies that can be used here.

    For `pre_dispatch`:
    This parameter can be:
        - None, in which case all the jobs are immediately
          created and spawned. Use this for lightweight and
          fast-running jobs, to avoid delays due to on-demand
          spawning of the jobs
        - An int, giving the exact number of total jobs that are
          spawned
        - A str, giving an expression as a function of n_jobs,
          as in '2*n_jobs'
    """
    if fit_params is not None:
        # The estimator is wrapped in a Pipeline below, and Pipeline.fit only
        # accepts parameters named ``<step>__<param>``. Route bare keys to the
        # wrapped estimator; keys already addressed to one of the two pipeline
        # steps are left untouched so existing callers keep working.
        fit_params = {
            (
                key
                if key.startswith(("estimator__", "splitter__"))
                else "estimator__" + key
            ): value
            for key, value in fit_params.items()
        }

    # The splitter step is given the width of each view so the stacked array
    # built below can be divided back into views inside the pipeline.
    pipeline = Pipeline(
        [
            ("splitter", SimpleSplitter([view.shape[1] for view in views])),
            ("estimator", clone(estimator)),
        ]
    )
    ret = cross_validate_(
        pipeline,
        np.hstack(views),
        y=y,
        groups=groups,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        fit_params=fit_params,
        pre_dispatch=pre_dispatch,
        return_train_score=return_train_score,
        return_estimator=return_estimator,
        error_score=error_score,
    )
    if return_estimator:
        # Hand back the user's estimator type rather than the internal wrapper.
        ret["estimator"] = [fitted["estimator"] for fitted in ret["estimator"]]
    return ret




# [docs]
def permutation_test_score(
    estimator,
    views,
    y=None,
    groups=None,
    cv=None,
    n_permutations=100,
    n_jobs=None,
    random_state=0,
    verbose=0,
    scoring=None,
    fit_params=None,
):
    """
    Evaluate the significance of a cross-validated score with permutations.

    The rows of the first view are shuffled (within groups when ``groups`` is
    given) while the remaining views are left in place, producing 'randomized
    data' in which the sample correspondence between the first view and the
    others is broken. The cross-validated score on the original views is then
    compared with the scores obtained on the shuffled data to compute an
    empirical p-value. A small p-value suggests that there is a real
    dependency between the views which has been used by the estimator to
    obtain a good score. A large p-value may be due to lack of real dependency
    between the views or the estimator was not able to use the dependency.

    The estimator is cloned and wrapped in a two-step pipeline
    (``"splitter"`` followed by ``"estimator"``) operating on the column-wise
    stacked views.

    Read more in the :ref:`User Guide <permutation_test_score>`.

    Parameters
    ----------
    estimator : object
        Estimator object implementing 'fit'. The object to use to fit the data.
    views : list or tuple of array-like
        List or tuple of 2-D arrays with the same number of rows (samples).
        Each view must expose a ``shape`` attribute.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None, optional
        The target variable to try to predict in the case of supervised learning.
        ``y`` itself is never permuted. If None, a vector of zeros with one
        entry per sample is used as a placeholder.
    groups : array-like of shape (n_samples,), optional, default=None
        Labels to constrain permutation within groups. When not specified, rows
        of the first view are permuted among all samples. When a grouped
        cross-validator is used, the group labels are also passed on to the
        ``split`` method of the cross-validator.
    scoring : str or callable, optional, default=None
        A single string (see :ref:`scoring_parameter`) or a callable (see :ref:`scoring`)
        to evaluate the predictions on the test set. If `None` the estimator's score method is used.
    cv : int, cross-validation generator or an iterable, optional, default=None
        Determines the cross-validation splitting strategy. See notes below for more detail.
    n_permutations : int, default=100
        Number of times to permute the rows of the first view.
    n_jobs : int, optional, default=None
        Number of jobs to run in parallel.
    random_state : int, RandomState instance or None, default=0
        Pass an int for reproducible output for the permutation of the first
        view's rows among samples.
    verbose : int, default=0
        The verbosity level.
    fit_params : dict, optional, default=None
        Parameters to pass to the fit method of the estimator. Keys without a
        step prefix are routed to the wrapped estimator (they are prefixed
        with ``estimator__``); keys that already start with ``estimator__`` or
        ``splitter__`` are passed through unchanged.

    Returns
    -------
    score : float
        The score obtained on the original (unpermuted) views.
    permutation_scores : ndarray of shape (n_permutations,)
        The scores obtained for each permutation.
    pvalue : float
        ``(C + 1) / (n_permutations + 1)`` where ``C`` is the number of
        permutations whose score is greater than or equal to ``score``.

    Notes
    -----
    For `cv`:
        Possible inputs for cv are:
        - `None`, to use the default 5-fold cross validation,
        - int, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.
        For `int`/`None` inputs, if the estimator is a classifier and `y` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used. These splitters are instantiated
        with `shuffle=False` so the splits will be the same across calls.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.
    """
    if fit_params is not None:
        # The estimator is wrapped in a Pipeline below, and Pipeline.fit only
        # accepts parameters named ``<step>__<param>``. Route bare keys to the
        # wrapped estimator; keys already addressed to one of the two pipeline
        # steps are left untouched so existing callers keep working.
        fit_params = {
            (
                key
                if key.startswith(("estimator__", "splitter__"))
                else "estimator__" + key
            ): value
            for key, value in fit_params.items()
        }

    # The splitter step is given the width of each view so the stacked array
    # can be divided back into views inside the pipeline.
    pipeline = Pipeline(
        [
            ("splitter", SimpleSplitter([view.shape[1] for view in views])),
            ("estimator", clone(estimator)),
        ]
    )

    if y is None:
        # Placeholder target so the sklearn helpers always receive a ``y``.
        y = np.zeros(views[0].shape[0])
    X = np.hstack(views)
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(pipeline))
    scorer = check_scoring(pipeline, scoring=scoring)
    random_state = check_random_state(random_state)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    score = _permutation_test_score(
        clone(pipeline), X, y, groups, cv, scorer, fit_params=fit_params
    )
    # Each permutation shuffles only the first view; the other views keep
    # their original row order, which breaks the pairing between them.
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_permutation_test_score)(
            clone(pipeline),
            np.hstack((_shuffle(views[0], groups, random_state), *views[1:])),
            y,
            groups,
            cv,
            scorer,
            fit_params=fit_params,
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    # The +1 in numerator and denominator counts the unpermuted score itself,
    # so the p-value can never be exactly zero.
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue




# [docs]
def learning_curve(
    estimator,
    X,
    y=None,
    groups=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=None,
    scoring=None,
    exploit_incremental_learning=False,
    n_jobs=None,
    pre_dispatch="all",
    verbose=0,
    shuffle=False,
    random_state=None,
    error_score=np.nan,
    return_times=False,
    fit_params=None,
):
    """
    Learning Curve.

    Determines cross-validated training and test scores for different training
    set sizes. A cross-validation generator splits the whole dataset k times in
    training and test data. Subsets of the training set with varying sizes will
    be used to train the estimator and a score for each training subset size and
    the test set will be computed. Afterwards, the scores will be averaged over
    all k runs for each training subset size.

    Multiview wrapper around :func:`sklearn.model_selection.learning_curve`:
    the views in ``X`` are stacked column-wise into a single array and the
    estimator is cloned and wrapped in a two-step pipeline (``"splitter"``
    followed by ``"estimator"``) whose first step is a ``SimpleSplitter``
    configured with the column count of each view.

    Read more in the :ref:`User Guide <learning_curve>`.

    Parameters
    ----------
    estimator : object
        An object type that implements the "fit" and "predict" methods. An object
        of this type is cloned for each validation.

    X : list or tuple of numpy arrays or array-likes
        Input views as a list or tuple of 2-D arrays with the same number of
        rows (samples). Each view must expose a ``shape`` attribute.

    y : array-like of shape (n_samples,) or (n_samples, n_outputs), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" cv instance
        (e.g., GroupKFold).

    train_sizes : array-like of shape (n_ticks,), default=np.linspace(0.1, 1.0, 5)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e., it has to be within (0, 1].
        Otherwise, it is interpreted as absolute sizes of the training sets.
        Note that for classification, the number of samples usually has to
        be big enough to contain at least one sample from each class.

    cv : int, cross-validation generator, or an iterable, default=None
        Determines the cross-validation splitting strategy. Possible inputs for cv are:
        - None, to use the default 5-fold cross-validation,
        - int, to specify the number of folds in a (Stratified)KFold,
        - CV splitter,
        - An iterable yielding (train, test) splits as arrays of indices.
        For int/None inputs, if the estimator is a classifier and "y" is
        either binary or multiclass, StratifiedKFold is used. In all other cases,
        KFold is used. These splitters are instantiated with shuffle=False, so
        the splits will be the same across calls. Refer to the
        :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    scoring : str or callable, default=None
        A str (see model evaluation documentation) or a scorer callable object /
        function with signature "scorer(estimator, X, y)".

    exploit_incremental_learning : bool, default=False
        If the estimator supports incremental learning, this will be used to
        speed up fitting for different training set sizes.

    n_jobs : int, default=None
        Number of jobs to run in parallel. Training the estimator and computing
        the score are parallelized over the different training and test sets.
        None means 1 unless in a joblib.parallel_backend context. -1 means
        using all processors. See the Glossary for more details.

    pre_dispatch : int or str, default='all'
        Number of predispatched jobs for parallel execution (default is all).
        The option can reduce the allocated memory. The str can be an
        expression like '2*n_jobs'.

    verbose : int, default=0
        Controls the verbosity: the higher, the more messages.

    shuffle : bool, default=False
        Whether to shuffle training data before taking prefixes of it
        based on "train_sizes".

    random_state : int, RandomState instance, or None, default=None
        Used when "shuffle" is True. Pass an int for reproducible
        output across multiple function calls. See the Glossary for more
        details.

    error_score : 'raise' or numeric, default=np.nan
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised.

    return_times : bool, default=False
        Whether to return the fit and score times.

    fit_params : dict, default=None
        Parameters to pass to the fit method of the estimator. Keys without a
        step prefix are routed to the wrapped estimator (they are prefixed
        with ``estimator__``); keys that already start with ``estimator__`` or
        ``splitter__`` are passed through unchanged.

    Returns
    -------
    train_sizes_abs : array, shape (n_unique_ticks,)
        Numbers of training examples that have been used to generate the
        learning curve.

    train_scores : array, shape (n_ticks, n_cv_folds)
        Scores on training sets.

    test_scores : array, shape (n_ticks, n_cv_folds)
        Scores on test set.

    fit_times : array, shape (n_ticks, n_cv_folds)
        Times spent for fitting in seconds. Only present if `return_times`
        is True.

    score_times : array, shape (n_ticks, n_cv_folds)
        Times spent for scoring in seconds. Only present if `return_times`
        is True.

    See Also
    --------
    sklearn.model_selection.learning_curve : The function to create the learning
        curve.
    """
    if fit_params is not None:
        # The estimator is wrapped in a Pipeline below, and Pipeline.fit only
        # accepts parameters named ``<step>__<param>``. Route bare keys to the
        # wrapped estimator; keys already addressed to one of the two pipeline
        # steps are left untouched so existing callers keep working.
        fit_params = {
            (
                key
                if key.startswith(("estimator__", "splitter__"))
                else "estimator__" + key
            ): value
            for key, value in fit_params.items()
        }

    # The splitter step is given the width of each view so the stacked array
    # built below can be divided back into views inside the pipeline.
    pipeline = Pipeline(
        [
            ("splitter", SimpleSplitter([view.shape[1] for view in X])),
            ("estimator", clone(estimator)),
        ]
    )
    return learning_curve_(
        pipeline,
        np.hstack(X),
        y,
        groups=groups,
        train_sizes=train_sizes,
        cv=cv,
        scoring=scoring,
        exploit_incremental_learning=exploit_incremental_learning,
        n_jobs=n_jobs,
        pre_dispatch=pre_dispatch,
        verbose=verbose,
        shuffle=shuffle,
        random_state=random_state,
        error_score=error_score,
        return_times=return_times,
        fit_params=fit_params,
    )