Source code for skranger.tree._tree

import numpy as np
from sklearn.tree._tree import csr_matrix


class Tree:
    """The low-level tree interface.

    Tree objects can be accessed using the ``tree_`` attribute on fitted
    decision tree estimators. Instances of ``Tree`` provide methods and
    properties describing the underlying structure and attributes of the
    tree.

    All structure is read from index ``0`` of the per-tree lists stored in
    ``ranger_forest["forest"]``. Ranger marks a missing child with ``0``
    (the root can never be a child); the properties below translate that to
    the scikit-learn sentinels ``-1`` (children) and ``-2`` (feature and
    threshold).

    :param dict ranger_forest: the forest structure; must provide
        ``"n_classes"`` and a ``"forest"`` mapping with the keys
        ``child_node_ids``, ``split_var_ids``, ``split_values``,
        ``leaf_samples``, ``leaf_weights`` and ``node_values``
    """

    def __init__(self, *, ranger_forest):
        self.ranger_forest = ranger_forest

    # ------------------------------------------------------------------
    # raw structure access
    # ------------------------------------------------------------------
    def _raw_children(self, side):
        """Return ranger's child ids for one side (0 = left, 1 = right)."""
        return np.asarray(
            self.ranger_forest["forest"]["child_node_ids"][0][side], dtype=np.intp
        )

    @staticmethod
    def _as_2d_array(X):
        """Convert ``X`` (array-like or DataFrame) to a 2d array of samples.

        :raises ValueError: if ``X`` is not two dimensional
        """
        rows = np.asarray(X)
        if rows.ndim != 2:
            raise ValueError(
                f"Expected 2D input of shape (n_samples, n_features), "
                f"got {rows.ndim}D input instead."
            )
        return rows

    @staticmethod
    def _walk(x, left, right, feature, threshold, start=0):
        """Return the node ids visited by sample ``x``, from ``start`` to a leaf.

        The structure arrays are passed in so that callers processing many
        samples only have to build them once.
        """
        node = int(start)
        visited = [node]
        while left[node] != -1:
            if x[feature[node]] <= threshold[node]:
                node = int(left[node])
            else:
                node = int(right[node])
            visited.append(node)
        return visited

    # ------------------------------------------------------------------
    # structure properties
    # ------------------------------------------------------------------
    @property
    def node_count(self):
        """The quantity of nodes in the tree."""
        return len(self.ranger_forest["forest"]["child_node_ids"][0][0])

    @property
    def capacity(self):
        """The capacity of the node array."""
        return self.node_count

    @property
    def children_left(self):
        """Left children nodes."""
        # sklearn uses -1, ranger uses 0
        ids = self._raw_children(0)
        return np.where(ids == 0, -1, ids)

    @property
    def children_right(self):
        """Right children nodes."""
        # sklearn uses -1, ranger uses 0
        ids = self._raw_children(1)
        return np.where(ids == 0, -1, ids)

    @property
    def n_outputs(self):
        """The quantity of outputs of the tree."""
        # single output only
        return 1

    @property
    def n_classes(self):
        """The quantity of classes."""
        return np.array([self.ranger_forest["n_classes"]])

    @property
    def max_depth(self):
        """Max depth of the tree."""
        return self.get_depth()

    @property
    def feature(self):
        """Variables on which nodes are split."""
        # sklearn uses -2 for leaves; a leaf is a node whose left child is 0
        is_leaf = self._raw_children(0) == 0
        split_vars = np.asarray(
            self.ranger_forest["forest"]["split_var_ids"][0], dtype=np.intp
        )
        return np.where(is_leaf, -2, split_vars)

    @property
    def threshold(self):
        """Threshold values on which nodes are split."""
        # sklearn uses -2 for leaves; a leaf is a node whose left child is 0
        is_leaf = self._raw_children(0) == 0
        split_values = np.asarray(
            self.ranger_forest["forest"]["split_values"][0], dtype=np.float64
        )
        return np.where(is_leaf, -2.0, split_values)

    # ------------------------------------------------------------------
    # depth / leaves
    # ------------------------------------------------------------------
    def get_depth(self):
        """Calculate the maximum depth of the tree."""
        return self._get_depth(self.children_left, self.children_right, 0)

    def _get_depth(self, left, right, idx):
        """Return the depth of the subtree rooted at ``idx`` (a leaf has depth 0)."""
        # Explicit stack instead of recursion so deep trees cannot hit the
        # interpreter's recursion limit.
        deepest = 0
        pending = [(idx, 0)]
        while pending:
            node, depth = pending.pop()
            if left[node] == -1:
                deepest = max(deepest, depth)
            else:
                pending.append((left[node], depth + 1))
                pending.append((right[node], depth + 1))
        return deepest

    def get_n_leaves(self):
        """Calculate the number of leaves of the tree."""
        return self._get_n_leaves(self.children_left, self.children_right, 0)

    def _get_n_leaves(self, left, right, idx):
        """Return the number of leaves in the subtree rooted at ``idx``."""
        n_leaves = 0
        pending = [idx]
        while pending:
            node = pending.pop()
            if left[node] == -1:
                n_leaves += 1
            else:
                pending.append(left[node])
                pending.append(right[node])
        return n_leaves

    # ------------------------------------------------------------------
    # prediction helpers
    # ------------------------------------------------------------------
    def apply(self, X):
        """Calculate the leaf index for each sample.

        :param array2d X: training input features
        :returns: 1d integer array with the id of the leaf each sample ends in
        :raises ValueError: if ``X`` is not two dimensional
        """
        rows = self._as_2d_array(X)
        # Build the structure arrays once rather than once per visited node.
        left, right = self.children_left, self.children_right
        feature, threshold = self.feature, self.threshold
        leaves = [self._walk(x, left, right, feature, threshold)[-1] for x in rows]
        return np.array(leaves, dtype=np.intp)

    def _apply(self, x, idx=None):
        """Return the leaf reached by sample ``x`` starting at ``idx`` (root if None)."""
        start = 0 if idx is None else idx
        return self._walk(
            x,
            self.children_left,
            self.children_right,
            self.feature,
            self.threshold,
            start,
        )[-1]

    def decision_path(self, X):
        """Calculate the decision path through the tree for each sample.

        :param array2d X: training input features
        :returns: sparse indicator matrix of shape ``(n_samples, node_count)``
            where a nonzero entry ``(i, j)`` means sample ``i`` passes
            through node ``j``
        :raises ValueError: if ``X`` is not two dimensional
        """
        rows = self._as_2d_array(X)
        left, right = self.children_left, self.children_right
        feature, threshold = self.feature, self.threshold
        paths = [self._walk(x, left, right, feature, threshold) for x in rows]

        lengths = np.asarray([len(path) for path in paths], dtype=np.intp)
        row_idx = np.repeat(np.arange(len(paths), dtype=np.intp), lengths)
        col_idx = np.array(
            [node for path in paths for node in path], dtype=np.intp
        )
        data = np.ones(len(col_idx), dtype=int)
        # Pass the shape explicitly: inferring it from the indices would drop
        # trailing columns whenever the highest-numbered nodes are not visited.
        return csr_matrix(
            (data, (row_idx, col_idx)), shape=(len(paths), self.node_count)
        )

    def _decision_path(self, x, idx=None):
        """Return the node ids visited by sample ``x``.

        With ``idx=None`` the path starts at, and includes, the root. With an
        explicit ``idx`` only the nodes visited *after* ``idx`` are returned.
        """
        start = 0 if idx is None else idx
        path = self._walk(
            x,
            self.children_left,
            self.children_right,
            self.feature,
            self.threshold,
            start,
        )
        return path if idx is None else path[1:]

    # ------------------------------------------------------------------
    # per-node statistics
    # ------------------------------------------------------------------
    @property
    def n_node_samples(self):
        """The number of samples reaching each node."""
        counts = [
            len(node) if node else 0
            for node in self.ranger_forest["forest"]["leaf_samples"][0]
        ]
        self._get_n_node_samples(self.children_left, self.children_right, 0, counts)
        return np.array(counts)

    def _get_n_node_samples(self, left, right, idx, n_samples):
        """Fill in the totals of internal nodes below ``idx``, in place.

        ``n_samples`` must hold the per-leaf values on entry. Leaf entries are
        left untouched; every internal node is overwritten with the sum of its
        children.

        :returns: the total for node ``idx``
        """
        # Pre-order listing: every node appears before all of its descendants.
        order = []
        pending = [int(idx)]
        while pending:
            node = pending.pop()
            order.append(node)
            for child in (left[node], right[node]):
                if child != -1:
                    pending.append(int(child))

        # Walking the listing backwards guarantees children are final before
        # their parent is summed.
        for node in reversed(order):
            children = [int(c) for c in (left[node], right[node]) if c != -1]
            if children:
                n_samples[node] = sum(n_samples[c] for c in children)
        return n_samples[idx]

    @property
    def weighted_n_node_samples(self):
        """The sum of the weights of the samples reaching each node."""
        # Work on a copy so the stored leaf weights are not modified.
        weighted = list(self.ranger_forest["forest"]["leaf_weights"][0])
        self._get_n_node_samples(
            self.children_left,
            self.children_right,
            0,
            weighted,
        )
        return np.array(weighted)

    @property
    def value(self):
        """The constant prediction value of each node."""
        values = self.ranger_forest["forest"]["node_values"][0]
        return np.reshape(values, (len(values), 1, self.n_classes[0]))