Source code for skranger.ensemble.regressor

"""Scikit-learn wrapper for ranger regression."""
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_array
from sklearn.utils.validation import check_is_fitted

from skranger import ranger
from skranger.ensemble.base import BaseRangerForest
from skranger.tree import RangerTreeRegressor


[docs]class RangerForestRegressor(BaseRangerForest, RegressorMixin, BaseEstimator):
    r"""Ranger Random Forest Regression implementation for sci-kit learn.

    Provides a sklearn regressor interface to the Ranger C++ library using Cython.

    :param int n_estimators: The number of tree regressors to train
    :param bool verbose: Enable ranger's verbose logging
    :param int/callable mtry: The number of features to split on each node. When a
        callable is passed, the function must accept a single parameter which is the
        number of features passed, and return some value between 1 and the number of
        features.
    :param str importance: One of one of ``none``, ``impurity``, ``impurity_corrected``,
        ``permutation``.
    :param int min_node_size: The minimal node size.
    :param int max_depth: The maximal tree depth; 0 means unlimited.
    :param bool replace: Sample with replacement.
    :param float/list sample_fraction: The fraction of observations to sample. The
        default is 1 when sampling with replacement, and 0.632 otherwise. This can be a
        list of class specific values.
    :param bool keep_inbag: If true, save how often observations are in-bag in each
        tree. These will be stored in the ``ranger_forest_`` attribute under the key
        ``"inbag_counts"``.
    :param list inbag: A list of size ``n_estimators``, containing inbag counts for each
        observation. Can be used for stratified sampling.
    :param str split_rule: One of ``variance``, ``extratrees``, ``maxstat``, ``beta``;
        default ``variance``.
    :param int num_random_splits: The number of random splits to consider for the
        ``extratrees`` splitrule.
    :param float alpha: Significance threshold to allow splitting for the ``maxstat``
        split rule.
    :param float minprop: Lower quantile of covariate distribution to be considered for
        splitting for ``maxstat`` split rule.
    :param str respect_categorical_features: One of ``ignore``, ``order``, ``partition``.
        The default is ``partition`` for the ``extratrees`` splitrule, otherwise the
        default is ``ignore``.
    :param bool scale_permutation_importance: For ``permutation`` importance,
        scale permutation importance by standard error as in (Breiman 2001).
    :param bool local_importance: For ``permutation`` importance, calculate and
        return local importance values as (Breiman 2001).
    :param list regularization_factor: A vector of regularization factors for the
        features.
    :param bool regularization_usedepth: Whether to consider depth in regularization.
    :param bool holdout: Hold-out all samples with case weight 0 and use these for
        feature importance and prediction error.
    :param bool quantiles: Enable quantile regression after fitting. This must be
        set to ``True`` in order to call ``predict_quantiles`` after fitting.
    :param bool oob_error: Whether to calculate out-of-bag prediction error.
    :param int n_jobs: The number of threads. Default is number of CPU cores.
    :param bool save_memory: Save memory at the cost of speed growing trees.
    :param int seed: Random seed value.
    :param bool enable_tree_details: When ``True``, perform additional calculations
        for detailing the underlying decision trees. Must be enabled for ``estimators_``
        and ``get_estimator`` to work. Very slow.

    :ivar int n_features_in\_: The number of features (columns) from the fit input
        ``X``.
    :ivar list feature_names\_: Names for the features of the fit input ``X``.
    :ivar dict ranger_forest\_: The returned result object from calling C++ ranger.
    :ivar int mtry\_: The mtry value as determined if ``mtry`` is callable, otherwise
        it is the same as ``mtry``.
    :ivar float sample_fraction\_: The sample fraction determined by input validation
    :ivar list regularization_factor\_: The regularization factors determined by input
        validation.
    :ivar list unordered_features\_: The unordered feature names determined by
        input validation.
    :ivar int split_rule\_: The split rule integer corresponding to ranger enum
        ``SplitRule``.
    :ivar bool use_regularization_factor\_: Input validation determined bool for using
        regularization factor input parameter.
    :ivar str respect_categorical_features\_: Input validation determined string
        respecting categorical features.
    :ivar int importance_mode\_: The importance mode integer corresponding to ranger
        enum ``ImportanceMode``.
    :ivar 2darray random_node_values\_: Random training target values based on
        trained forest terminal nodes for the purpose of quantile regression.
    :ivar ndarray feature_importances\_: The variable importances from ranger.
    """

    def __init__(
        self,
        n_estimators=100,
        *,
        verbose=False,
        mtry=0,
        importance="none",
        min_node_size=0,
        max_depth=0,
        replace=True,
        sample_fraction=None,
        keep_inbag=False,
        inbag=None,
        split_rule="variance",
        num_random_splits=1,
        alpha=0.5,
        minprop=0.1,
        split_select_weights=None,
        always_split_features=None,
        categorical_features=None,
        respect_categorical_features=None,
        scale_permutation_importance=False,
        local_importance=False,
        regularization_factor=None,
        regularization_usedepth=False,
        holdout=False,
        quantiles=False,
        oob_error=False,
        n_jobs=-1,
        save_memory=False,
        seed=42,
        enable_tree_details=False,
    ):
        self.n_estimators = n_estimators
        self.verbose = verbose
        self.mtry = mtry
        self.importance = importance
        self.min_node_size = min_node_size
        self.max_depth = max_depth
        self.replace = replace
        self.sample_fraction = sample_fraction
        self.keep_inbag = keep_inbag
        self.inbag = inbag
        self.split_rule = split_rule
        self.num_random_splits = num_random_splits
        self.alpha = alpha
        self.minprop = minprop
        self.split_select_weights = split_select_weights
        self.always_split_features = always_split_features
        self.categorical_features = categorical_features
        self.respect_categorical_features = respect_categorical_features
        self.scale_permutation_importance = scale_permutation_importance
        self.local_importance = local_importance
        self.regularization_factor = regularization_factor
        self.regularization_usedepth = regularization_usedepth
        self.holdout = holdout
        self.quantiles = quantiles
        self.oob_error = oob_error
        self.n_jobs = n_jobs
        self.save_memory = save_memory
        self.seed = seed
        self.enable_tree_details = enable_tree_details

    @property
    def estimators_(self):
        try:
            check_is_fitted(self)
        except NotFittedError:
            raise AttributeError(
                f"{self.__class__.__name__} object has no attribute 'estimators_'"
            ) from None
        if not self.enable_tree_details:
            raise ValueError("enable_tree_details must be True prior to training")
        return [
            RangerTreeRegressor.from_forest(self, idx=idx)
            for idx in range(self.n_estimators)
        ]

[docs]    def get_estimator(self, idx):
        """Extract a single estimator tree from the forest.
        :param int idx: The index of the tree to extract.
        """
        check_is_fitted(self)
        if not self.enable_tree_details:
            raise ValueError("enable_tree_details must be True prior to training")
        return RangerTreeRegressor.from_forest(self, idx=idx)

[docs]    def fit(
        self,
        X,
        y,
        sample_weight=None,
        split_select_weights=None,
        always_split_features=None,
        categorical_features=None,
    ):
        """Fit the ranger random forest using training data.

        :param array2d X: training input features
        :param array1d y: training input targets
        :param array1d sample_weight: optional weights for input samples
        :param list split_select_weights: Vector of weights between 0 and 1 of
            probabilities to select features for splitting. Can be a single vector or a
            vector of vectors with one vector per tree.
        :param list always_split_features:  Features which should always be selected for
            splitting. A list of column index values.
        :param list categorical_features: A list of column index values which should be
            considered categorical, or unordered.
        """
        self.tree_type_ = 3  # tree_type, TREE_REGRESSION

        # Check input
        X, y = self._validate_data(X, y)

        # Check the init parameters
        self._validate_parameters(X, y, sample_weight)

        # Set X info
        self.feature_names_ = [str(c).encode() for c in range(X.shape[1])]
        self._check_n_features(X, reset=True)

        # Check weights
        sample_weight, use_sample_weight = self._check_sample_weight(sample_weight, X)
        (
            always_split_features,
            use_always_split_features,
        ) = self._check_always_split_features(always_split_features)

        (
            categorical_features,
            use_categorical_features,
        ) = self._check_categorical_features(categorical_features)

        (
            split_select_weights,
            use_split_select_weights,
        ) = self._check_split_select_weights(split_select_weights)

        # Fit the forest
        self.ranger_forest_ = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray(np.atleast_2d(y).astype("float64").transpose()),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            True,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            split_select_weights,
            use_split_select_weights,
            always_split_features,  # always_split_feature_names
            bool(always_split_features),  # use_always_split_feature_names
            False,  # prediction_mode
            {},  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            categorical_features,  # unordered_feature_names
            use_categorical_features,  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            sample_weight,  # case_weights
            use_sample_weight,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            self.sample_fraction_,
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            False,  # use_regularization_factor
            self.regularization_usedepth,
        )
        sample_weight = sample_weight if len(sample_weight) > 0 else np.ones(len(X))

        # build the leaf samples
        terminal_node_forest = self._get_terminal_node_forest(X)
        terminal_nodes = np.atleast_2d(terminal_node_forest["predictions"]).astype(int)

        if self.quantiles:
            self.random_node_values_ = np.empty(
                (np.max(terminal_nodes) + 1, self.n_estimators)
            )
            self.random_node_values_[:] = np.nan
            for tree in range(self.n_estimators):
                idx = np.arange(X.shape[0])
                np.random.shuffle(idx)
                self.random_node_values_[terminal_nodes[idx, tree], tree] = y[idx]

        if self.enable_tree_details:
            self._set_leaf_samples(terminal_nodes)
            self._set_node_values(y, sample_weight)
            self._set_n_classes()
        return self

[docs]    def predict_quantiles(self, X, quantiles):
        """Predict quantile regression target for X.

        :param array2d X: prediction input features
        :param list(float) quantiles: a list of quantiles on which to predict.
          If the list contains a single quantile, the result will be a 1darray.
          If there are multiple quantiles, the result will be a 2darray with
          columns corresponding to respective quantiles.
        """
        if not hasattr(self, "random_node_values_"):
            raise ValueError("Must set quantiles = True for quantile predictions.")
        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False)

        forest = self._get_terminal_node_forest(X)
        terminal_nodes = np.array(forest["predictions"]).astype(int)
        terminal_nodes = np.atleast_2d(terminal_nodes)
        node_values = 0.0 * terminal_nodes
        for tree in range(self.n_estimators):
            node_values[:, tree] = self.random_node_values_[
                terminal_nodes[:, tree], tree
            ]
        quantile_predictions = np.quantile(node_values, quantiles, axis=1)
        if len(quantiles) == 1:
            return np.squeeze(quantile_predictions)
        return quantile_predictions.T

[docs]    def predict(self, X, quantiles=None):
        """Predict regression target for X.

        If quantiles are passed, predict quantiles instead.

        :param array2d X: prediction input features
        :param list(float) quantiles: a list of quantiles on which to predict.
          If the list contains a single quantile, the result will be a 1darray.
          If there are multiple quantiles, the result will be a 2darray with
          columns corresponding to respective quantiles. If quantiles are not provided
          the result is the regression target estimate.
        """
        if quantiles is not None:
            return self.predict_quantiles(X, quantiles)
        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False)

        result = ranger.ranger(
            self.tree_type_,
            np.asfortranarray(X.astype("float64")),
            np.asfortranarray([[]]),
            self.feature_names_,  # variable_names
            self.mtry_,
            self.n_estimators,  # num_trees
            self.verbose,
            self.seed,
            self.n_jobs_,  # num_threads
            False,  # write_forest
            self.importance_mode_,
            self.min_node_size,
            self.split_select_weights or [],
            bool(self.split_select_weights),  # use_split_select_weights
            [],  # always_split_feature_names
            False,  # use_always_split_feature_names
            True,  # prediction_mode
            self.ranger_forest_["forest"],  # loaded_forest
            self.replace,  # sample_with_replacement
            False,  # probability
            [],  # unordered_feature_names
            False,  # use_unordered_features
            self.save_memory,
            self.split_rule_,
            [],  # case_weights
            False,  # use_case_weights
            {},  # class_weights
            False,  # predict_all
            self.keep_inbag,
            [1],  # sample_fraction
            self.alpha,
            self.minprop,
            self.holdout,
            1,  # prediction_type
            self.num_random_splits,
            self.oob_error,
            self.max_depth,
            self.inbag or [],
            bool(self.inbag),  # use_inbag
            self.regularization_factor_,
            self.use_regularization_factor_,
            self.regularization_usedepth,
        )
        return np.array(result["predictions"])

    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": "zero sample_weight is not equivalent to removing samples",
            }
        }