# vecstack
# Advanced tools
| """Scikit-learn compatible API for stacking. | ||
| Find out how to use: | ||
| >>> from vecstack import StackingTransformer | ||
| >>> help(StackingTransformer) | ||
| MIT License | ||
| Copyright (c) 2016-2018 Igor Ivanov | ||
| Email: vecxoz@gmail.com | ||
| Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| of this software and associated documentation files (the "Software"), to deal | ||
| in the Software without restriction, including without limitation the rights | ||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| copies of the Software, and to permit persons to whom the Software is | ||
| furnished to do so, subject to the following conditions: | ||
| The above copyright notice and this permission notice shall be included in all | ||
| copies or substantial portions of the Software. | ||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| SOFTWARE. | ||
| """ | ||
| # ----------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- | ||
| from __future__ import print_function | ||
| from __future__ import division | ||
| # ----------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- | ||
| import warnings | ||
| import numpy as np | ||
| import scipy.stats as st | ||
| from sklearn.base import BaseEstimator | ||
| from sklearn.base import TransformerMixin | ||
| from sklearn.base import clone | ||
| from sklearn.dummy import DummyClassifier | ||
| from sklearn.dummy import DummyRegressor | ||
| from sklearn.utils.validation import check_X_y | ||
| from sklearn.utils.validation import check_array | ||
| from sklearn.utils.validation import check_is_fitted | ||
| from sklearn.utils.validation import has_fit_parameter | ||
| from sklearn.model_selection import KFold | ||
| from sklearn.model_selection import StratifiedKFold | ||
| from sklearn.metrics import mean_absolute_error | ||
| from sklearn.metrics import accuracy_score | ||
| from sklearn.metrics import log_loss | ||
| from sklearn.externals import six | ||
| # ----------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- | ||
| class StackingTransformer(BaseEstimator, TransformerMixin): | ||
| """StackingTransformer. Scikit-learn compatible API for stacking. | ||
| Parameters | ||
| ---------- | ||
| estimators : list of tuples, default None | ||
| Base level estimators. | ||
| If None then by default: | ||
| DummyRegressor (predicts constant 5.5) - for regression task | ||
| DummyClassifier (predicts constant 1) - for classification task | ||
| You can use any sklearn-like estimators. | ||
| Each tuple in the list contains arbitrary | ||
| unique name and estimator object, e.g.: | ||
| estimators = [('lr', LinearRegression()), | ||
| ('ridge', Ridge(random_state=0))] | ||
| Note. According to sklearn convention for binary classification | ||
| task with probabilities estimator must return probabilities | ||
| for each class (i.e. two columns). | ||
| regression : boolean, default True | ||
| If True - perform stacking for regression task, | ||
| if False - perform stacking for classification task | ||
| transform_target : callable, default None | ||
| Function to transform target variable. | ||
| If None - transformation is not used. | ||
| For example, for regression task (if target variable is skewed) | ||
| you can use transformation like ``numpy.log1p``. | ||
| Set ``transform_target=numpy.log1p`` | ||
| Usually you want to use respective backward transformation | ||
| for prediction like ``numpy.expm1``. | ||
| Set ``transform_pred=numpy.expm1`` | ||
| Caution! Some transformations may give inapplicable results. | ||
| For example, if target variable contains zeros, numpy.log | ||
| gives you -inf. In such case you can use appropriate | ||
| transformation like ``numpy.log1p`` and respective | ||
| backward transformation like ``numpy.expm1`` | ||
| transform_pred : callable, default None | ||
| Function to transform prediction. | ||
| If None - transformation is not used. | ||
| If you use transformation for target variable (``transform_target``) | ||
| like ``numpy.log1p``, then using ``transform_pred`` you can specify | ||
| respective backward transformation like ``numpy.expm1``. | ||
| Look at description of parameter ``transform_target`` | ||
| variant: str, default 'A' | ||
| Possible values: 'A', 'B'. | ||
| This parameter influences test set predictions only. | ||
| Variant 'A' - predict test set in each fold and find mean (mode) | ||
| Variant 'B' - fit on full train set and predict test set once | ||
| See tutorial for details: | ||
| https://github.com/vecxoz/vecstack/blob/master/ \ | ||
| examples/00_stacking_concept_pictures_code.ipynb | ||
| needs_proba: boolean, default False, meaningful only for classification | ||
| Whether to predict probabilities (instead of class labels) | ||
| in classification task. | ||
| Ignored if ``regression=True`` | ||
| metric : callable, default None | ||
| Evaluation metric (score function) which is used to calculate | ||
| cross-validation scores. | ||
| If None, then by default: | ||
| sklearn.metrics.mean_absolute_error - for regression | ||
| sklearn.metrics.accuracy_score - for classification | ||
| with class labels | ||
| sklearn.metrics.log_loss - for classification with probabilities | ||
| You can use any appropriate sklearn metric or | ||
| define your own metric like shown below: | ||
| def your_metric(y_true, y_pred): | ||
| # calculate | ||
| return result | ||
| n_folds : int, default 4 | ||
| Number of folds in cross-validation | ||
| stratified : boolean, default False, meaningful only for classification | ||
| If True - use stratified folds in cross-validation | ||
| Ignored if ``regression=True`` | ||
| shuffle : boolean, default False | ||
| Whether to perform a shuffle before cross-validation split | ||
| random_state : int, default 0 | ||
| Random seed used to initiate fold split. | ||
| Same seed and correspondingly same split is used for all estimators. | ||
| verbose : int, default 0 | ||
| Level of verbosity. | ||
| 0 - show no messages | ||
| 1 - for each estimator show mean score | ||
| 2 - for each estimator show score for each fold and mean score | ||
| Attributes | ||
| ---------- | ||
| estimators_ : list | ||
| List of base estimators (not fitted) passed by user (or default) | ||
| n_estimators_ : int | ||
| Number of base estimators passed by user (or default) | ||
| n_classes_ : int | ||
| Number of classes in classification task. | ||
| ``None`` in regression task. | ||
| models_A_ : list of lists | ||
| List containing n_estimators lists. Each of which contains | ||
| n_folds models (fitted estimators). | ||
| In variant A this models are used to transform (predict) | ||
| both train set and test set | ||
| In variant B this models are used to transform (predict) | ||
| train set only | ||
| models_B_ : list or None | ||
| List containing n_estimators models (fitted estimators). | ||
| In variant A this attribute is None | ||
| In variant B this models are used to transform (predict) test set | ||
| metric_ : callable | ||
| Metric passed by user (or default) which was used | ||
| to compute cross-validation scores | ||
| kf_ : KFold or StratifiedKFold object | ||
| Initialized cross-validation object which was used to split train set | ||
| during fitting | ||
| scores_ : 2d numpy array of shape [n_estimators, n_folds] | ||
| Scores for each fold for each estimator. | ||
| mean_std_ : list of tuples | ||
| Each tuple contains name, mean and std for each estimator. | ||
| train_shape_ : tuple of ints | ||
| Shape of training data | ||
| n_train_examples_ : int | ||
| Number of training examples | ||
| n_features_: int | ||
| Number of features | ||
| train_footprint_ : list of tuples | ||
| Train set footprint which is used to identify train set | ||
| during transform (predict) phase. | ||
| Each tuple containes 3 values: row index, column index, data value | ||
| Examples | ||
| -------- | ||
| >>> from sklearn.datasets import load_boston | ||
| >>> from sklearn.model_selection import train_test_split | ||
| >>> from sklearn.metrics import mean_absolute_error | ||
| >>> from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor | ||
| >>> from xgboost import XGBRegressor | ||
| >>> from vecstack import StackingTransformer | ||
| >>> | ||
| >>> # Load demo data | ||
| >>> boston = load_boston() | ||
| >>> X, y = boston.data, boston.target | ||
| >>> | ||
| >>> # Make train/test split | ||
| >>> X_train, X_test, y_train, y_test = train_test_split(X, y, | ||
| test_size=0.2, | ||
| random_state=0) | ||
| >>> | ||
| >>> # Init 1st level estimators | ||
| >>> estimators_L1 = [('et', ExtraTreesRegressor(random_state=0, | ||
| n_jobs=-1, | ||
| n_estimators=100, | ||
| max_depth=3)), | ||
| ('rf', RandomForestRegressor(random_state=0, | ||
| n_jobs=-1, | ||
| n_estimators=100, | ||
| max_depth=3)), | ||
| ('xgb', XGBRegressor(random_state=0, | ||
| n_jobs=-1, | ||
| learning_rate=0.1, | ||
| n_estimators=100, | ||
| max_depth=3))] | ||
| >>> # Stacking | ||
| >>> stack = StackingTransformer(estimators=estimators_L1, | ||
| regression=True, | ||
| shuffle=True, | ||
| random_state=0, | ||
| verbose=2) | ||
| >>> stack = stack.fit(X_train, y_train) | ||
| >>> S_train = stack.transform(X_train) | ||
| >>> S_test = stack.transform(X_test) | ||
| >>> | ||
| >>> # Use 2nd level estimator to get final prediction | ||
| >>> estimator_L2 = XGBRegressor(random_state=0, | ||
| n_jobs=-1, | ||
| learning_rate=0.1, | ||
| n_estimators=100, | ||
| max_depth=3) | ||
| >>> estimator_L2 = estimator_L2.fit(S_train, y_train) | ||
| >>> y_pred = estimator_L2.predict(S_test) | ||
| >>> | ||
| >>> # Final prediction score | ||
| >>> print('Final score: [%.8f]' % mean_absolute_error(y_test, y_pred)) | ||
| """ | ||
| def __init__(self, | ||
| estimators=None, | ||
| regression=True, | ||
| transform_target=None, | ||
| transform_pred=None, | ||
| variant='A', | ||
| needs_proba=False, | ||
| metric=None, | ||
| n_folds=4, | ||
| stratified=False, | ||
| shuffle=False, | ||
| random_state=0, | ||
| verbose=0): | ||
| self.estimators = estimators | ||
| self.regression = regression | ||
| self.transform_target = transform_target | ||
| self.transform_pred = transform_pred | ||
| self.variant = variant | ||
| self.needs_proba = needs_proba | ||
| self.metric = metric | ||
| self.n_folds = n_folds | ||
| self.stratified = stratified | ||
| self.shuffle = shuffle | ||
| self.random_state = random_state | ||
| self.verbose = verbose | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def fit(self, X, y, sample_weight=None): | ||
| """Fit all base estimators. | ||
| Parameters | ||
| ---------- | ||
| X : 2d numpy array or sparse matrix of shape [n_samples, n_features] | ||
| Training data | ||
| y : 1d numpy array of shape [n_samples] | ||
| Target values. | ||
| sample_weight : 1d numpy array of shape [n_samples] | ||
| Individual weights for each sample. | ||
| Passed to fit method of each estimator. | ||
| Note: will be split automatically for each fold. | ||
| Returns | ||
| ------- | ||
| self : object | ||
| Fitted StackingTransformer instance. | ||
| """ | ||
| # --------------------------------------------------------------------- | ||
| # Validation | ||
| # --------------------------------------------------------------------- | ||
| # --------------------------------------------------------------------- | ||
| # Check input data | ||
| # --------------------------------------------------------------------- | ||
| # Check X and y | ||
| # ``check_estimator`` does not allow ``force_all_finite=False`` | ||
| X, y = check_X_y(X, y, | ||
| accept_sparse=['csr'], # allow csr, cast all others to csr | ||
| force_all_finite=True, # do not allow nan and inf | ||
| multi_output=False) # allow only one column in y_train | ||
| # Check X and sample_weight | ||
| # X is alredy checked, but we need it to compare length of sample_weight | ||
| if sample_weight is not None: | ||
| X, sample_weight = check_X_y(X, sample_weight, | ||
| accept_sparse=['csr'], | ||
| force_all_finite=True, | ||
| multi_output=False) | ||
| # --------------------------------------------------------------------- | ||
| # Check ``estimators`` | ||
| # --------------------------------------------------------------------- | ||
| if self.estimators is None: | ||
| if self.regression: | ||
| self.estimators_ = [('dumregr', DummyRegressor(strategy='constant', constant=5.5))] | ||
| else: | ||
| self.estimators_ = [('dumclf', DummyClassifier(strategy='constant', constant=1))] | ||
| # warnings.warn('No estimators were specified. ' | ||
| # 'Using single dummy estimator as demo.', UserWarning) | ||
| else: | ||
| if 0 == len(self.estimators): | ||
| raise ValueError('List of estimators is empty') | ||
| else: | ||
| # Clone | ||
| self.estimators_ = [(name, clone(estim)) for name, estim in self.estimators] | ||
| # Check names of estimators | ||
| names, estims = zip(*self.estimators_) | ||
| self._validate_names(names) | ||
| # Check if all estimators support ``sample_weight`` | ||
| if sample_weight is not None: | ||
| for name, estim in self.estimators_: | ||
| if not has_fit_parameter(estim, 'sample_weight'): | ||
| raise ValueError('Underlying estimator [%s] does not ' | ||
| 'support sample weights.' % name) | ||
| # --------------------------------------------------------------------- | ||
| # Check other StackingTransformer parameters | ||
| # --------------------------------------------------------------------- | ||
| # ``variant`` | ||
| if self.variant not in ['A', 'B']: | ||
| raise ValueError('Parameter ``variant`` must be set properly') | ||
| # ``n_folds`` | ||
| if not isinstance(self.n_folds, int): | ||
| raise ValueError('Parameter ``n_folds`` must be integer') | ||
| if not self.n_folds > 1: | ||
| raise ValueError('Parameter ``n_folds`` must be not less than 2') | ||
| # ``verbose`` | ||
| if self.verbose not in [0, 1, 2]: | ||
| raise ValueError('Parameter ``verbose`` must be 0, 1, or 2') | ||
| # Additional check for inapplicable parameter combinations | ||
| # If ``regression=True`` we ignore classification-specific | ||
| # parameters and issue user warning | ||
| if self.regression and (self.needs_proba or self.stratified): | ||
| warn_str = ('This is regression task hence classification-specific' | ||
| 'parameters set to ``True`` were ignored:') | ||
| if self.needs_proba: | ||
| self.needs_proba = False | ||
| warn_str += ' ``needs_proba``' | ||
| if self.stratified: | ||
| self.stratified = False | ||
| warn_str += ' ``stratified``' | ||
| warnings.warn(warn_str, UserWarning) | ||
| # --------------------------------------------------------------------- | ||
| # Compute attributes (basic properties of data, number of estimators, etc.) | ||
| # --------------------------------------------------------------------- | ||
| self.train_shape_ = X.shape | ||
| self.n_train_examples_ = X.shape[0] | ||
| self.n_features_ = X.shape[1] | ||
| if not self.regression: | ||
| self.n_classes_ = len(np.unique(y)) | ||
| else: | ||
| self.n_classes_ = None | ||
| self.n_estimators_ = len(self.estimators_) | ||
| self.train_footprint_ = self._get_footprint(X) | ||
| # --------------------------------------------------------------------- | ||
| # Specify default metric | ||
| # --------------------------------------------------------------------- | ||
| if self.metric is None and self.regression: | ||
| self.metric_ = mean_absolute_error | ||
| elif self.metric is None and not self.regression: | ||
| if self.needs_proba: | ||
| self.metric_ = log_loss | ||
| else: | ||
| self.metric_ = accuracy_score | ||
| else: | ||
| self.metric_ = self.metric | ||
| # --------------------------------------------------------------------- | ||
| # Create report header strings and print report header | ||
| # --------------------------------------------------------------------- | ||
| if self.verbose > 0: | ||
| if self.regression: | ||
| task_str = 'task: [regression]' | ||
| else: | ||
| task_str = 'task: [classification]' | ||
| n_classes_str = 'n_classes: [%d]' % self.n_classes_ | ||
| metric_str = 'metric: [%s]' % self.metric_.__name__ | ||
| variant_str = 'variant: [%s]' % self.variant | ||
| n_estimators_str = 'n_estimators: [%d]' % self.n_estimators_ | ||
| print(task_str) | ||
| if not self.regression: | ||
| print(n_classes_str) | ||
| print(metric_str) | ||
| print(variant_str) | ||
| print(n_estimators_str + '\n') | ||
| # --------------------------------------------------------------------- | ||
| # Initialize cross-validation split | ||
| # Stratified can be used only for classification | ||
| # --------------------------------------------------------------------- | ||
| if not self.regression and self.stratified: | ||
| self.kf_ = StratifiedKFold(n_splits=self.n_folds, | ||
| shuffle=self.shuffle, | ||
| random_state=self.random_state) | ||
| # Save target to be able to create stratified split in ``transform`` method | ||
| # This is more efficient than to save split indices | ||
| self._y_ = y.copy() | ||
| else: | ||
| self.kf_ = KFold(n_splits=self.n_folds, | ||
| shuffle=self.shuffle, | ||
| random_state=self.random_state) | ||
| self._y_ = None | ||
| # --------------------------------------------------------------------- | ||
| # Compute implicit number of classes to create appropriate empty arrays. | ||
| # !!! Important. In order to unify array creation | ||
| # variable ``n_classes_implicit_`` is always equal to 1, except the case | ||
| # when we performing classification task with ``needs_proba=True`` | ||
| # --------------------------------------------------------------------- | ||
| if not self.regression and self.needs_proba: | ||
| self.n_classes_implicit_ = len(np.unique(y)) | ||
| self.action_ = 'predict_proba' | ||
| else: | ||
| self.n_classes_implicit_ = 1 | ||
| self.action_ = 'predict' | ||
| # --------------------------------------------------------------------- | ||
| # Create empty numpy array for train predictions (OOF) | ||
| # !!! Important. We have to implicitly predict during fit | ||
| # in order to compute CV scores, because | ||
| # the most reasonable place to print out CV scores is fit method | ||
| # --------------------------------------------------------------------- | ||
| S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_)) | ||
| # --------------------------------------------------------------------- | ||
| # Prepare (clone) estmators for fitting and storing | ||
| # We need models_A_ for both variant A and varian B | ||
| # We need models_B_ for varian B only (in variant A attribute models_B_ is None) | ||
| # --------------------------------------------------------------------- | ||
| self.models_A_ = [] | ||
| self.models_B_ = None | ||
| for n, est in self.estimators_: | ||
| self.models_A_.append([clone(est) for _ in range(self.n_folds)]) | ||
| if self.variant in ['B']: | ||
| self.models_B_ = [clone(est) for n, est in self.estimators_] | ||
| # --------------------------------------------------------------------- | ||
| # Create empty numpy array to store scores for each estimator and each fold | ||
| # --------------------------------------------------------------------- | ||
| self.scores_ = np.zeros((self.n_estimators_, self.n_folds)) | ||
| # --------------------------------------------------------------------- | ||
| # Create empty list to store name, mean and std for each estimator | ||
| # --------------------------------------------------------------------- | ||
| self.mean_std_ = [] | ||
| # --------------------------------------------------------------------- | ||
| # MAIN FIT PROCEDURE | ||
| # --------------------------------------------------------------------- | ||
| # Loop across estimators | ||
| # --------------------------------------------------------------------- | ||
| for estimator_counter, (name, estimator) in enumerate(self.estimators_): | ||
| if self.verbose > 0: | ||
| estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__) | ||
| print(estimator_str) | ||
| # ----------------------------------------------------------------- | ||
| # Loop across folds | ||
| # ----------------------------------------------------------------- | ||
| for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, y)): | ||
| # Split data and target | ||
| X_tr = X[tr_index] | ||
| y_tr = y[tr_index] | ||
| X_te = X[te_index] | ||
| y_te = y[te_index] | ||
| # Split sample weights accordingly (if passed) | ||
| if sample_weight is not None: | ||
| sample_weight_tr = sample_weight[tr_index] | ||
| # sample_weight_te = sample_weight[te_index] | ||
| else: | ||
| sample_weight_tr = None | ||
| # sample_weight_te = None | ||
| # Fit estimator | ||
| _ = self._estimator_action(self.models_A_[estimator_counter][fold_counter], | ||
| X_tr, y_tr, None, | ||
| sample_weight=sample_weight_tr, | ||
| action='fit', | ||
| transform=self.transform_target) | ||
| # Predict out-of-fold part of train set | ||
| if 'predict_proba' == self.action_: | ||
| col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_, | ||
| estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_) | ||
| else: | ||
| col_slice_estimator = estimator_counter | ||
| S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter], | ||
| None, None, | ||
| X_te, action=self.action_, | ||
| transform=self.transform_pred) | ||
| # Compute score | ||
| score = self.metric_(y_te, S_train[te_index, col_slice_estimator]) | ||
| self.scores_[estimator_counter, fold_counter] = score | ||
| # Print fold score | ||
| if self.verbose > 1: | ||
| fold_str = ' fold %2d: [%.8f]' % (fold_counter, score) | ||
| print(fold_str) | ||
| # Compute mean and std and save in dict | ||
| estim_name = self.estimators_[estimator_counter][0] | ||
| estim_mean = np.mean(self.scores_[estimator_counter]) | ||
| estim_std = np.std(self.scores_[estimator_counter]) | ||
| self.mean_std_.append((estim_name, estim_mean, estim_std)) | ||
| if self.verbose > 1: | ||
| sep_str = ' ----' | ||
| print(sep_str) | ||
| # Compute mean + std (and full) | ||
| if self.verbose > 0: | ||
| mean_str = ' MEAN: [%.8f] + [%.8f]\n' % (estim_mean, estim_std) | ||
| print(mean_str) | ||
| # Fit estimator on full train set | ||
| if self.variant in ['B']: | ||
| if self.verbose > 0: | ||
| print(' Fitting on full train set...\n') | ||
| _ = self._estimator_action(self.models_B_[estimator_counter], | ||
| X, y, None, | ||
| sample_weight=sample_weight, | ||
| action='fit', | ||
| transform=self.transform_target) | ||
| # --------------------------------------------------------------------- | ||
| # --------------------------------------------------------------------- | ||
| # Return fitted StackingTransformer instance | ||
| return self | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def fit_transform(self, X, y, sample_weight=None): | ||
| """Fit all base estimators and transform (predict) train set. | ||
| Parameters | ||
| ---------- | ||
| See docs for ``fit`` and ``transform`` methods. | ||
| Returns | ||
| ------- | ||
| X_transformed : 2d numpy array of shape [n_samples, n_estimators] or | ||
| [n_samples, n_estimators * n_classes] | ||
| Out-of-fold predictions (OOF) for train set. | ||
| This is stacked features for next level. | ||
| """ | ||
| # --------------------------------------------------------------------- | ||
| # All validation and procedures are done inside corresponding methods | ||
| # fit and transform | ||
| # --------------------------------------------------------------------- | ||
| return self.fit(X, y, sample_weight).transform(X) | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def transform(self, X, is_train_set=None): | ||
| """Transform (predict) given data set. | ||
| If ``X`` is train set: | ||
| for each estimator return out-of-fold predictions (OOF). | ||
| If ``X`` is any other set: | ||
| variant A: for each estimator return mean (mode) of predictions | ||
| made in each fold | ||
| variant B: for each estimator return single prediction | ||
| Parameters | ||
| ---------- | ||
| X : 2d numpy array or sparse matrix of shape [n_samples, n_features] | ||
| Input data | ||
| is_train_set : boolean, default None | ||
| Fallback parameter. In general case | ||
| should not be used (should be None). | ||
| Gives ability to explicitly specify that given dataset | ||
| is train set or other set. | ||
| Returns | ||
| ------- | ||
| X_transformed : 2d numpy array of shape [n_samples, n_estimators] or | ||
| [n_samples, n_estimators * n_classes] | ||
| Out-of-fold predictions (OOF) for train set. | ||
| Regular or bagged predictions for any other set. | ||
| This is stacked features for next level. | ||
| """ | ||
| # Check if fitted | ||
| check_is_fitted(self, ['models_A_']) | ||
| # Input validation | ||
| # ``check_estimator`` does not allow ``force_all_finite=False`` | ||
| X = check_array(X, accept_sparse=['csr'], force_all_finite=True) | ||
| # ********************************************************************* | ||
| # Fitted StackingTransformer instance is bound to train set used for fitting. | ||
| # So during transformation we have different actions for train set | ||
| # and all other sets | ||
| # ********************************************************************* | ||
| if is_train_set is None: | ||
| is_train_set = self._check_identity(X) | ||
| if self.verbose > 0: | ||
| if is_train_set: | ||
| print('Train set was detected.') | ||
| print('Transforming...\n') | ||
| # ********************************************************************* | ||
| # Transform train set | ||
| # ********************************************************************* | ||
| if is_train_set: | ||
| # In case if user directly tells that it is train set but shape is different | ||
| if self.train_shape_ != X.shape: | ||
| raise ValueError('Train set must have the same shape ' | ||
| 'in order to be transformed.') | ||
| # Create empty numpy array for train predictions (OOF) | ||
| S_train = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_)) | ||
| # ----------------------------------------------------------------- | ||
| # MAIN TRANSFORM (PREDICT) PROCEDURE for train set | ||
| # ----------------------------------------------------------------- | ||
| # Loop across estimators | ||
| # ----------------------------------------------------------------- | ||
| for estimator_counter, (name, estimator) in enumerate(self.estimators_): | ||
| if self.verbose > 0: | ||
| estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__) | ||
| print(estimator_str) | ||
| # ------------------------------------------------------------- | ||
| # Loop across folds | ||
| # ------------------------------------------------------------- | ||
| for fold_counter, (tr_index, te_index) in enumerate(self.kf_.split(X, self._y_)): | ||
| # Split data | ||
| # X_tr = X[tr_index] | ||
| X_te = X[te_index] | ||
| # Predict out-of-fold part of train set | ||
| if 'predict_proba' == self.action_: | ||
| col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_, | ||
| estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_) | ||
| else: | ||
| col_slice_estimator = estimator_counter | ||
| S_train[te_index, col_slice_estimator] = self._estimator_action(self.models_A_[estimator_counter][fold_counter], | ||
| None, None, | ||
| X_te, action=self.action_, | ||
| transform=self.transform_pred) | ||
| if self.verbose > 1: | ||
| fold_str = ' model from fold %2d: done' % fold_counter | ||
| print(fold_str) | ||
| if self.verbose > 1: | ||
| sep_str = ' ----' | ||
| print(sep_str) | ||
| if self.verbose > 0: | ||
| done_str = ' DONE\n' | ||
| print(done_str) | ||
| # ----------------------------------------------------------------- | ||
| # Cast class labels to int | ||
| # ----------------------------------------------------------------- | ||
| if not self.regression and not self.needs_proba: | ||
| S_train = S_train.astype(int) | ||
| # Return transformed data (OOF) | ||
| return S_train # X_transformed | ||
| # ********************************************************************* | ||
| # Transform any other set | ||
| # ********************************************************************* | ||
| else: | ||
| # Check n_features | ||
| if X.shape[1] != self.n_features_: | ||
| raise ValueError('Inconsistent number of features.') | ||
| # Create empty numpy array for test predictions | ||
| S_test = np.zeros((X.shape[0], self.n_estimators_ * self.n_classes_implicit_)) | ||
| # --------------------------------------------------------------------- | ||
| # MAIN TRANSFORM (PREDICT) PROCEDURE for any other set | ||
| # ----------------------------------------------------------------- | ||
| # Loop across estimators | ||
| # ----------------------------------------------------------------- | ||
| for estimator_counter, (name, estimator) in enumerate(self.estimators_): | ||
| if self.verbose > 0: | ||
| estimator_str = 'estimator %2d: [%s: %s]' % (estimator_counter, name, estimator.__class__.__name__) | ||
| print(estimator_str) | ||
| # ------------------------------------------------------------- | ||
| # Variant A | ||
| # ------------------------------------------------------------- | ||
| if self.variant in ['A']: | ||
| # Create empty numpy array, which will contain temporary predictions | ||
| # for test set made in each fold | ||
| S_test_temp = np.zeros((X.shape[0], self.n_folds * self.n_classes_implicit_)) | ||
| # --------------------------------------------------------- | ||
| # Loop across fitted models (it is the same as loop across folds) | ||
| # --------------------------------------------------------- | ||
| for fold_counter, model in enumerate(self.models_A_[estimator_counter]): | ||
| # Predict test set in each fold | ||
| if 'predict_proba' == self.action_: | ||
| col_slice_fold = slice(fold_counter * self.n_classes_implicit_, | ||
| fold_counter * self.n_classes_implicit_ + self.n_classes_implicit_) | ||
| else: | ||
| col_slice_fold = fold_counter | ||
| S_test_temp[:, col_slice_fold] = self._estimator_action(model, None, None, X, | ||
| action=self.action_, | ||
| transform=self.transform_pred) | ||
| if self.verbose > 1: | ||
| fold_str = ' model from fold %2d: done' % fold_counter | ||
| print(fold_str) | ||
| if self.verbose > 1: | ||
| sep_str = ' ----' | ||
| print(sep_str) | ||
| # --------------------------------------------------------- | ||
| # Compute mean or mode (majority voting) of predictions for test set | ||
| # --------------------------------------------------------- | ||
| if 'predict_proba' == self.action_: | ||
| # Here we copute means of probabilirties for each class | ||
| for class_id in range(self.n_classes_implicit_): | ||
| S_test[:, estimator_counter * self.n_classes_implicit_ + class_id] = np.mean(S_test_temp[:, class_id::self.n_classes_implicit_], axis=1) | ||
| else: | ||
| if self.regression: | ||
| S_test[:, estimator_counter] = np.mean(S_test_temp, axis=1) | ||
| else: | ||
| S_test[:, estimator_counter] = st.mode(S_test_temp, axis=1)[0].ravel() | ||
| if self.verbose > 0: | ||
| done_str = ' DONE\n' | ||
| print(done_str) | ||
| # ------------------------------------------------------------- | ||
| # Variant B | ||
| # ------------------------------------------------------------- | ||
| else: | ||
| if 'predict_proba' == self.action_: | ||
| col_slice_estimator = slice(estimator_counter * self.n_classes_implicit_, | ||
| estimator_counter * self.n_classes_implicit_ + self.n_classes_implicit_) | ||
| else: | ||
| col_slice_estimator = estimator_counter | ||
| S_test[:, col_slice_estimator] = self._estimator_action(self.models_B_[estimator_counter], | ||
| None, None, X, | ||
| action=self.action_, | ||
| transform=self.transform_pred) | ||
| if self.verbose > 0: | ||
| done_str = ' DONE\n' | ||
| print(done_str) | ||
| # --------------------------------------------------------------------- | ||
| # Cast class labels to int | ||
| # --------------------------------------------------------------------- | ||
| if not self.regression and not self.needs_proba: | ||
| S_test = S_test.astype(int) | ||
| return S_test # X_transformed | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def _transformer(self, y, func=None): | ||
| """Transforms target variable and prediction | ||
| """ | ||
| if func is None: | ||
| return y | ||
| else: | ||
| return func(y) | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def _estimator_action(self, estimator, X_train, y_train, X_test, | ||
| sample_weight=None, action=None, | ||
| transform=None): | ||
| """Performs estimator action. | ||
| This wrapper gives us ability to choose action dynamically | ||
| (e.g. ``predict`` or ``predict_proba``). | ||
| Note. Through ``_estimator_action`` and then through ``_transformer`` | ||
| we apply ``transform_target`` and ``transform_pred`` functions if | ||
| given by user on the target and prediction in each fold separately | ||
| to be able to calculate proper scores. | ||
| """ | ||
| if 'fit' == action: | ||
| # We use following condition, because some estimators (e.g. Lars) | ||
| # may not have ``sample_weight`` parameter of ``fit`` method | ||
| if sample_weight is not None: | ||
| return estimator.fit(X_train, self._transformer(y_train, func=transform), | ||
| sample_weight=sample_weight) | ||
| else: | ||
| return estimator.fit(X_train, self._transformer(y_train, func=transform)) | ||
| elif 'predict' == action: | ||
| return self._transformer(estimator.predict(X_test), func=transform) | ||
| elif 'predict_proba' == action: | ||
| return self._transformer(estimator.predict_proba(X_test), func=transform) | ||
| else: | ||
| raise ValueError('Parameter action must be set properly') | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def _get_footprint(self, X, n_items=1000): | ||
| """Selects ``n_items`` random elements from 2d numpy array or | ||
| sparse matrix (or all elements if their number is less or equal | ||
| to ``n_items``). | ||
| """ | ||
| try: | ||
| footprint = [] | ||
| r, c = X.shape | ||
| n = r * c | ||
| # np.random.seed(0) # for development | ||
| ids = np.random.choice(n, min(n_items, n), replace=False) | ||
| for i in ids: | ||
| row = i // c | ||
| col = i - row * c | ||
| footprint.append((row, col, X[row, col])) | ||
| return footprint | ||
| except Exception: | ||
| raise ValueError('Internal error. ' | ||
| 'Please save traceback and inform developers.') | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def _check_identity(self, X, | ||
| rtol=1e-05, atol=1e-08, | ||
| equal_nan=False): | ||
| """Checks 2d numpy array or sparse matrix identity | ||
| by its shape and footprint. | ||
| """ | ||
| try: | ||
| # Check shape | ||
| if X.shape != self.train_shape_: | ||
| return False | ||
| # Check footprint | ||
| try: | ||
| for coo in self.train_footprint_: | ||
| assert np.isclose(X[coo[0], coo[1]], coo[2], rtol=rtol, atol=atol, equal_nan=equal_nan) | ||
| return True | ||
| except AssertionError: | ||
| return False | ||
| except Exception: | ||
| raise ValueError('Internal error. ' | ||
| 'Please save traceback and inform developers.') | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def _get_params(self, attr, deep=True): | ||
| """Gives ability to get parameters of nested estimators | ||
| """ | ||
| out = super(StackingTransformer, self).get_params(deep=False) | ||
| if not deep: | ||
| return out | ||
| estimators = getattr(self, attr) | ||
| if estimators is None: | ||
| return out | ||
| out.update(estimators) | ||
| for name, estimator in estimators: | ||
| for key, value in six.iteritems(estimator.get_params(deep=True)): | ||
| out['%s__%s' % (name, key)] = value | ||
| return out | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def get_params(self, deep=True): | ||
| """Get parameters of StackingTransformer and base estimators. | ||
| Parameters | ||
| ---------- | ||
| deep : boolean | ||
| If False - get parameters of StackingTransformer | ||
| If True - get parameters of StackingTransformer and base estimators | ||
| """ | ||
| return self._get_params('estimators', deep=deep) | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
| def _validate_names(self, names): | ||
| """Validates estimator names | ||
| """ | ||
| if len(set(names)) != len(names): | ||
| raise ValueError('Names provided are not unique: ' | ||
| '%s' % list(names)) | ||
| invalid_names = set(names).intersection(self.get_params(deep=False)) | ||
| if invalid_names: | ||
| raise ValueError('Estimator names conflict with constructor ' | ||
| 'arguments: %s' % sorted(invalid_names)) | ||
| invalid_names = [name for name in names if '__' in name] | ||
| if invalid_names: | ||
| raise ValueError('Estimator names must not contain __: got ' | ||
| '%s' % invalid_names) | ||
| # ------------------------------------------------------------------------- | ||
| # ------------------------------------------------------------------------- | ||
    def is_train_set(self, X):
        """Checks if given data set was used to train
        StackingTransformer instance.

        Parameters
        ----------
        X : 2d numpy array or sparse matrix of shape [n_samples, n_features]
            Input data

        Returns
        -------
        check_result : boolean
            True - if X was used to train StackingTransformer instance
            False - otherwise
        """
        # Check if fitted: raises if ``fit`` was never called
        # (``models_A_`` is set during fitting)
        check_is_fitted(self, ['models_A_'])
        # Input validation: coerce to dense array or CSR sparse matrix,
        # rejecting NaN/inf values
        X = check_array(X, accept_sparse=['csr'], force_all_finite=True)
        # Compare shape and a random footprint of element values
        # remembered at fit time
        return self._check_identity(X)
| # ----------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- |
+1
-1
| Metadata-Version: 1.1 | ||
| Name: vecstack | ||
| Version: 0.2.2 | ||
| Version: 0.3.0 | ||
| Summary: Python package for stacking (machine learning technique) | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/vecxoz/vecstack |
+1
-1
@@ -6,3 +6,3 @@ #! /usr/bin/env python | ||
| setup(name='vecstack', | ||
| version='0.2.2', | ||
| version='0.3.0', | ||
| description='Python package for stacking (machine learning technique)', | ||
@@ -9,0 +9,0 @@ long_description='Convenient way to automate OOF computation, prediction and bagging using any number of models', |
| Metadata-Version: 1.1 | ||
| Name: vecstack | ||
| Version: 0.2.2 | ||
| Version: 0.3.0 | ||
| Summary: Python package for stacking (machine learning technique) | ||
@@ -5,0 +5,0 @@ Home-page: https://github.com/vecxoz/vecstack |
| setup.py | ||
| vecstack/__init__.py | ||
| vecstack/core.py | ||
| vecstack/coresk.py | ||
| vecstack.egg-info/PKG-INFO | ||
@@ -5,0 +6,0 @@ vecstack.egg-info/SOURCES.txt |
+13
-5
| """Python package for stacking (machine learning technique) | ||
| Find out how to use: | ||
| >>>from vecstack import stacking | ||
| >>>help(stacking) | ||
| >>> from vecstack import stacking | ||
| >>> help(stacking) | ||
| >>> from vecstack import StackingTransformer | ||
| >>> help(StackingTransformer) | ||
@@ -31,9 +33,15 @@ MIT License | ||
| #------------------------------------------------------------------------------- | ||
| #------------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- | ||
| from .core import stacking | ||
| from .coresk import StackingTransformer | ||
| __author__ = 'Igor Ivanov > kaggle.com/vecxoz' | ||
| __license__ = 'MIT' | ||
| __version__ = '0.2.2' | ||
| __version__ = '0.3.0' | ||
| __all__ = ['stacking', 'StackingTransformer'] | ||
| # ----------------------------------------------------------------------------- | ||
| # ----------------------------------------------------------------------------- |
+23
-17
@@ -1,6 +0,6 @@ | ||
| """Python package for stacking (machine learning technique) | ||
| """Functional API for stacking. | ||
| Find out how to use: | ||
| >>>from vecstack import stacking | ||
| >>>help(stacking) | ||
| >>> from vecstack import stacking | ||
| >>> help(stacking) | ||
@@ -51,3 +51,5 @@ MIT License | ||
| from sklearn.metrics import log_loss | ||
| from sklearn.utils.validation import check_X_y, check_array | ||
| from sklearn.utils.validation import check_X_y | ||
| from sklearn.utils.validation import check_array | ||
| from sklearn.base import clone | ||
@@ -451,3 +453,3 @@ #------------------------------------------------------------------------------- | ||
| if regression and (needs_proba or stratified): | ||
| warn_str = 'Task is regression <regression=True> hence function ignored classification-specific parameters which were set as <True>:' | ||
| warn_str = 'This is regression task hence classification-specific parameters set to <True> were ignored:' | ||
| if needs_proba: | ||
@@ -475,9 +477,9 @@ needs_proba = False | ||
| if regression: | ||
| task_str = 'task: [regression]' | ||
| task_str = 'task: [regression]' | ||
| else: | ||
| task_str = 'task: [classification]' | ||
| n_classes_str = 'n_classes: [%d]' % len(np.unique(y_train)) | ||
| metric_str = 'metric: [%s]' % metric.__name__ | ||
| mode_str = 'mode: [%s]' % mode | ||
| n_models_str = 'n_models: [%d]' % len(models) | ||
| task_str = 'task: [classification]' | ||
| n_classes_str = 'n_classes: [%d]' % len(np.unique(y_train)) | ||
| metric_str = 'metric: [%s]' % metric.__name__ | ||
| mode_str = 'mode: [%s]' % mode | ||
| n_models_str = 'n_models: [%d]' % len(models) | ||
@@ -534,3 +536,3 @@ # Print report header | ||
| if save_dir is not None or verbose > 0: | ||
| model_str = 'model %d: [%s]' % (model_counter, model.__class__.__name__) | ||
| model_str = 'model %2d: [%s]' % (model_counter, model.__class__.__name__) | ||
| if save_dir is not None: | ||
@@ -565,6 +567,10 @@ models_folds_str += '-' * 40 + '\n' | ||
| sample_weight_tr = sample_weight[tr_index] | ||
| sample_weight_te = sample_weight[te_index] | ||
| # sample_weight_te = sample_weight[te_index] | ||
| else: | ||
| sample_weight_tr = None | ||
| sample_weight_te = None | ||
| # sample_weight_te = None | ||
| # Save RAM: clone to avoid fitting model directly inside users list | ||
| # Set safe=False to be able to clone non-sklearn models | ||
| model = clone(model, safe=False) | ||
@@ -596,3 +602,3 @@ # Fit 1-st level model | ||
| scores = np.append(scores, score) | ||
| fold_str = ' fold %d: [%.8f]' % (fold_counter, score) | ||
| fold_str = ' fold %2d: [%.8f]' % (fold_counter, score) | ||
| if save_dir is not None: | ||
@@ -619,4 +625,4 @@ models_folds_str += fold_str + '\n' | ||
| sep_str = ' ----' | ||
| mean_str = ' MEAN: [%.8f] + [%.8f]' % (np.mean(scores), np.std(scores)) | ||
| full_str = ' FULL: [%.8f]\n' % (metric(y_train, S_train[:, col_slice_model])) | ||
| mean_str = ' MEAN: [%.8f] + [%.8f]' % (np.mean(scores), np.std(scores)) | ||
| full_str = ' FULL: [%.8f]\n' % (metric(y_train, S_train[:, col_slice_model])) | ||
| if save_dir is not None: | ||
@@ -623,0 +629,0 @@ models_folds_str += sep_str + '\n' |
Alert delta unavailable
Currently unable to show alert delta for PyPI packages.
85314
124.17%12
9.09%1527
130.32%