# Source code for skranger.tree._tree

import numpy as np
from sklearn.tree._tree import csr_matrix


class Tree:
    """The low-level tree interface.

    Tree objects can be accessed using the ``tree_`` attribute on fitted
    decision tree estimators. Instances of ``Tree`` provide methods and
    properties describing the underlying structure and attributes of the tree.

    All structural properties are derived on access from the first tree stored
    in ``ranger_forest["forest"]``; nothing is cached on the instance.
    """

    def __init__(self, *, ranger_forest):
        self.ranger_forest = ranger_forest

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    def _mask_leaves(self, values, sentinel):
        """Return ``values`` as an array with leaf positions set to ``sentinel``.

        A node is a leaf when its left child id is 0 in the ranger forest.
        """
        left_ids = np.asarray(self.ranger_forest["forest"]["child_node_ids"][0][0])
        return np.where(left_ids == 0, sentinel, np.asarray(values))

    def _split_arrays(self):
        """Build the per-node arrays once so a traversal does not rebuild them."""
        return self.children_left, self.children_right, self.feature, self.threshold

    @staticmethod
    def _as_rows(X):
        """Return ``X`` as something that iterates over sample rows."""
        if hasattr(X, "values"):  # pd.DataFrame
            return X.values
        return np.asarray(X)

    @staticmethod
    def _trace(x, start, left, right, feature, threshold):
        """Return the node ids visited by sample ``x`` from ``start`` to a leaf."""
        visited = [start]
        node = start
        while left[node] != -1:
            if x[feature[node]] <= threshold[node]:
                node = left[node]
            else:
                node = right[node]
            visited.append(node)
        return visited

    # ------------------------------------------------------------------
    # structure
    # ------------------------------------------------------------------
    @property
    def node_count(self):
        """The quantity of nodes in the tree."""
        return len(self.ranger_forest["forest"]["child_node_ids"][0][0])

    @property
    def capacity(self):
        """The capacity of the node array."""
        return self.node_count

    @property
    def children_left(self):
        """Left children nodes."""
        # sklearn uses -1, ranger uses 0
        return self._mask_leaves(
            self.ranger_forest["forest"]["child_node_ids"][0][0], -1
        )

    @property
    def children_right(self):
        """Right children nodes."""
        # sklearn uses -1, ranger uses 0
        right_ids = np.asarray(self.ranger_forest["forest"]["child_node_ids"][0][1])
        return np.where(right_ids == 0, -1, right_ids)

    @property
    def n_outputs(self):
        """The quantity of outputs of the tree."""
        # single output only
        return 1

    @property
    def n_classes(self):
        """The quantity of classes."""
        return np.array([self.ranger_forest["n_classes"]])

    def get_depth(self):
        """Calculate the maximum depth of the tree."""
        return self._get_depth(self.children_left, self.children_right, 0)

    def _get_depth(self, left, right, idx):
        """Return the depth of the subtree rooted at ``idx`` (a leaf has depth 0)."""
        # An explicit stack keeps deep trees clear of the recursion limit.
        deepest = 0
        pending = [(idx, 0)]
        while pending:
            node, depth = pending.pop()
            if left[node] == -1:
                deepest = max(deepest, depth)
            else:
                pending.append((left[node], depth + 1))
                pending.append((right[node], depth + 1))
        return deepest

    def get_n_leaves(self):
        """Calculate the number of leaves of the tree."""
        return self._get_n_leaves(self.children_left, self.children_right, 0)

    def _get_n_leaves(self, left, right, idx):
        """Return the number of leaves in the subtree rooted at ``idx``."""
        n_leaves = 0
        pending = [idx]
        while pending:
            node = pending.pop()
            if left[node] == -1:
                n_leaves += 1
            else:
                pending.append(left[node])
                pending.append(right[node])
        return n_leaves

    # ------------------------------------------------------------------
    # traversal
    # ------------------------------------------------------------------
    def apply(self, X):
        """Calculate the leaf index for each sample.

        :param array2d X: training input features
        :returns: 1d integer array holding one leaf node id per row of ``X``;
            empty when ``X`` has no rows
        """
        arrays = self._split_arrays()
        return np.array(
            [self._trace(x, 0, *arrays)[-1] for x in self._as_rows(X)],
            dtype=int,
        )

    def _apply(self, x, idx=None):
        """Return the leaf reached by sample ``x`` starting at ``idx`` (root if None)."""
        start = 0 if idx is None else idx
        return self._trace(x, start, *self._split_arrays())[-1]

    def decision_path(self, X):
        """Calculate the decision path through the tree for each sample.

        :param array2d X: training input features
        :returns: sparse indicator matrix of shape ``(n_samples, node_count)``
            where a one at ``(i, j)`` means sample ``i`` passes through node ``j``
        """
        arrays = self._split_arrays()
        paths = [self._trace(x, 0, *arrays) for x in self._as_rows(X)]
        rows = np.array(
            [sample for sample, path in enumerate(paths) for _ in path], dtype=int
        )
        cols = np.array([node for path in paths for node in path], dtype=int)
        data = np.ones(len(rows), dtype=int)
        # Without an explicit shape the column count is inferred from the
        # largest visited node id, dropping trailing nodes no sample reaches.
        return csr_matrix((data, (rows, cols)), shape=(len(paths), len(arrays[0])))

    def _decision_path(self, x, idx=None):
        """Return the node ids visited by ``x``.

        With ``idx=None`` the path starts at, and includes, the root. With an
        explicit ``idx`` only the nodes visited *below* ``idx`` are returned.
        """
        if idx is None:
            return self._trace(x, 0, *self._split_arrays())
        return self._trace(x, idx, *self._split_arrays())[1:]

    @property
    def max_depth(self):
        """Max depth of the tree."""
        return self.get_depth()

    @property
    def feature(self):
        """Variables on which nodes are split."""
        # sklearn uses -2, ranger uses 0
        return self._mask_leaves(self.ranger_forest["forest"]["split_var_ids"][0], -2)

    @property
    def threshold(self):
        """Threshold values on which nodes are split."""
        # sklearn uses -2, ranger uses 0
        return self._mask_leaves(self.ranger_forest["forest"]["split_values"][0], -2)

    # ------------------------------------------------------------------
    # per-node statistics
    # ------------------------------------------------------------------
    @property
    def n_node_samples(self):
        """The number of samples reaching each node."""
        n_samples = [
            len(node) if node else 0
            for node in self.ranger_forest["forest"]["leaf_samples"][0]
        ]
        self._get_n_node_samples(self.children_left, self.children_right, 0, n_samples)
        return np.array(n_samples)

    def _get_n_node_samples(self, left, right, idx, n_samples):
        """Fill internal-node totals of ``n_samples`` in place; return the total at ``idx``.

        ``n_samples`` must already hold the per-leaf values. Leaf entries are
        left untouched; every internal node below ``idx`` (inclusive) is set
        to the sum of its two children.
        """
        # Collect internal nodes parent-first, then fold them child-first so
        # each node's children are final before the node itself is summed.
        internal = []
        pending = [idx]
        while pending:
            node = pending.pop()
            if left[node] != -1:
                internal.append(node)
                pending.append(left[node])
                pending.append(right[node])
        for node in reversed(internal):
            n_samples[node] = n_samples[left[node]] + n_samples[right[node]]
        return n_samples[idx]

    @property
    def weighted_n_node_samples(self):
        """The sum of the weights of the samples reaching each node."""
        # Copy so the in-place accumulation does not alter the stored forest.
        weighted_n_samples = self.ranger_forest["forest"]["leaf_weights"][0].copy()
        self._get_n_node_samples(
            self.children_left,
            self.children_right,
            0,
            weighted_n_samples,
        )
        return np.array(weighted_n_samples)

    @property
    def value(self):
        """The constant prediction value of each node."""
        values = self.ranger_forest["forest"]["node_values"][0]
        return np.reshape(values, (len(values), 1, self.n_classes[0]))