Source code for cca_zoo.models._cca_base

import itertools
from abc import abstractmethod
from typing import Union, Iterable

import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils.validation import check_random_state, check_is_fitted

from cca_zoo.utils.check_values import _check_views
from cca_zoo.utils.plotting import plot_latent_train_test


class _CCA_Base(BaseEstimator, MultiOutputMixin, RegressorMixin):
    """
    A class used as the base for methods in the package. Allows methods to
    inherit fit_transform, predict_corr, and gridsearch_fit when only fit
    (and transform, where it differs from the default) is provided.

    Attributes
    ----------
    weights : list of weights for each view
    """

    def __init__(
        self,
        latent_dims: int = 1,
        scale=True,
        centre=True,
        copy_data=True,
        accept_sparse=False,
        random_state: Union[int, np.random.RandomState] = None,
    ):
        """
        Constructor for _CCA_Base

        :param latent_dims: number of latent dimensions to fit
        :param scale: normalize variance in each column before fitting
        :param centre: demean data by column before fitting (and before transforming out of sample)
        :param copy_data: if True, X will be copied; else, it may be overwritten
        :param accept_sparse: whether the model can take sparse data as input
        :param random_state: pass for reproducible output across multiple function calls
        """
        self.latent_dims = latent_dims
        self.scale = scale
        self.centre = centre
        self.copy_data = copy_data
        self.accept_sparse = accept_sparse
        self.random_state = check_random_state(random_state)
        self.n_views = None
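    # A minimal subclass sketch (hypothetical, for illustration only): any
    # estimator that sets ``self.weights`` in ``fit`` -- one array of shape
    # (n_features_i, latent_dims) per view -- inherits working transform,
    # fit_transform, correlations and score from this base class:
    #
    #   class RandomProjectionCCA(_CCA_Base):
    #       def fit(self, views, y=None, **kwargs):
    #           views = self._centre_scale(views)
    #           self.weights = [
    #               self.random_state.rand(view.shape[1], self.latent_dims)
    #               for view in views
    #           ]
    #           return self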
    @abstractmethod
    def fit(self, views: Iterable[np.ndarray], y=None, **kwargs):
        """
        Fits a given model

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        """
        raise NotImplementedError
    def transform(self, views: Iterable[np.ndarray], y=None, **kwargs):
        """
        Transforms data given a fit model

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :param kwargs: any additional keyword arguments required by the given model
        """
        check_is_fitted(self, attributes=["weights"])
        views = _check_views(
            *views, copy=self.copy_data, accept_sparse=self.accept_sparse
        )
        views = self._centre_scale_transform(views)
        transformed_views = []
        for i, view in enumerate(views):
            # project each view onto its latent space
            transformed_view = view @ self.weights[i]
            transformed_views.append(transformed_view)
        return transformed_views
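    # Usage sketch (assumed names and shapes): with two views of shape (n, p1)
    # and (n, p2) and latent_dims=2, transform returns two (n, 2) score arrays:
    #
    #   >>> zx, zy = model.transform((X, Y))  # ``model`` is any fitted subclass
    #   >>> zx.shape, zy.shape                # ((n, 2), (n, 2))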
    def fit_transform(self, views: Iterable[np.ndarray], y=None, **kwargs):
        """
        Fits and then transforms the training data

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :param kwargs: any additional keyword arguments required by the given model
        """
        return self.fit(views, **kwargs).transform(views)
    def get_loadings(self, views: Iterable[np.ndarray], y=None, **kwargs):
        """
        Returns the model loadings for each view for the given data

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :param kwargs: any additional keyword arguments required by the given model
        """
        transformed_views = self.transform(views, **kwargs)
        views = self._centre_scale_transform(views)
        loadings = [
            view.T @ transformed_view
            for view, transformed_view in zip(views, transformed_views)
        ]
        return loadings
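    # Note on interpretation: each loading matrix has shape
    # (n_features_i, latent_dims); since the views are centred before the
    # product, entry [p, d] is proportional to the sample covariance between
    # feature p of view i and its d-th latent score.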
    def correlations(self, views: Iterable[np.ndarray], y=None, **kwargs):
        """
        Predicts the correlation for the given data using the fit model

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :param kwargs: any additional keyword arguments required by the given model
        :return: all_corrs: an array of the pairwise correlations (k,k,self.latent_dims) where k is the number of views
        :rtype: np.ndarray
        """
        transformed_views = self.transform(views, **kwargs)
        all_corrs = []
        for x, y in itertools.product(transformed_views, repeat=2):
            all_corrs.append(
                np.diag(np.corrcoef(x.T, y.T)[: self.latent_dims, self.latent_dims :])
            )
        all_corrs = np.array(all_corrs).reshape(
            (len(views), len(views), self.latent_dims)
        )
        return all_corrs
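    # Reading the output (sketch, assumed names): all_corrs[i, j, d] is the
    # correlation between the d-th latent dimension of views i and j, so the
    # diagonal entries all_corrs[i, i, d] equal 1 by construction:
    #
    #   >>> corrs = model.correlations((X, Y))
    #   >>> corrs.shape     # (2, 2, latent_dims)
    #   >>> corrs[0, 1]     # per-dimension correlation between the two views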
    def plot_latent(
        self,
        views: Iterable[np.ndarray],
        test_views: Iterable[np.ndarray] = None,
        title="",
    ):
        """
        Plots the latent space of the training data (and optional test data)

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :param test_views: optional out-of-sample views to overlay on the plot
        :param title: title for the plot
        """
        scores = self.transform(views)
        if test_views is not None:
            test_scores = self.transform(test_views)
        else:
            test_scores = None
        plot_latent_train_test(scores, test_scores, title=title)
    def score(self, views: Iterable[np.ndarray], y=None, **kwargs):
        # by default, return the average pairwise correlation in each dimension
        # (for 2 views this is just the correlation)
        pair_corrs = self.correlations(views, **kwargs)
        # number of views
        n_views = pair_corrs.shape[0]
        # sum the pairwise correlations for each dimension, subtract the
        # n_views self-correlations, and divide by the number of off-diagonal
        # pairs to give the average correlation
        dim_corrs = (
            pair_corrs.sum(axis=tuple(range(pair_corrs.ndim - 1))) - n_views
        ) / (n_views ** 2 - n_views)
        return dim_corrs
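    # Worked example of the averaging above: for k=2 views, the per-dimension
    # pairwise correlation matrix is [[1, r], [r, 1]], so
    # (sum - k) / (k**2 - k) = (2 + 2r - 2) / (4 - 2) = r, i.e. score reduces
    # to the canonical correlation itself.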
    def _centre_scale(self, views: Iterable[np.ndarray]):
        """
        Removes the mean of the training data for each view and standardizes,
        storing the mean and standard deviation for use at transform time

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        :return: train_views: the demeaned numpy arrays to be used to fit the model
        """
        self.view_means = []
        self.view_stds = []
        transformed_views = []
        for view in views:
            if issparse(view):
                # mean_variance_axis returns variances, so take the square
                # root to get standard deviations
                view_mean, view_var = mean_variance_axis(view, axis=0)
                view_std = np.sqrt(view_var)
                self.view_means.append(view_mean)
                self.view_stds.append(view_std)
                view = view - self.view_means[-1]
                view = view / self.view_stds[-1]
            else:
                if self.centre:
                    view_mean = view.mean(axis=0)
                    self.view_means.append(view_mean)
                    view = view - self.view_means[-1]
                if self.scale:
                    view_std = view.std(axis=0, ddof=1)
                    # guard against division by zero for constant columns
                    view_std[view_std == 0.0] = 1.0
                    self.view_stds.append(view_std)
                    view = view / self.view_stds[-1]
            transformed_views.append(view)
        return transformed_views
    def _centre_scale_transform(self, views: Iterable[np.ndarray]):
        """
        Removes the mean and standardizes each view based on the mean and
        standard deviation of the training data

        :param views: list/tuple of numpy arrays or array likes with the same number of rows (samples)
        """
        if self.centre:
            views = [view - mean for view, mean in zip(views, self.view_means)]
        if self.scale:
            views = [view / std for view, std in zip(views, self.view_stds)]
        return views
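# End-to-end sketch (assumptions: the hypothetical RandomProjectionCCA from
# the comment above and random toy data; illustrative only, not part of the
# library):
#
#   if __name__ == "__main__":
#       rng = np.random.default_rng(0)
#       X = rng.standard_normal((100, 10))
#       Y = rng.standard_normal((100, 8))
#       model = RandomProjectionCCA(latent_dims=2).fit((X, Y))
#       print(model.score((X, Y)))  # average pairwise correlation per dimension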