Demo entry 2823831

DecisionTree

   

Submitted by anonymous on Oct 06, 2015 at 17:51
Language: Python. Code size: 3.2 kB.

class DecisionTree(object):
  '''
  An ID3-style decision tree classifier.

  Training data is a sequence of (point, label) pairs, where each point is
  an indexable feature vector and each label is a hashable value.  Tree
  nodes are DecisionNode instances (declared elsewhere in this project).
  '''

  def __init__(self, data):
    # Every feature index of the first point is a candidate split feature.
    self.root = self.__build__(data, set(range(len(data[0][0]))), DecisionNode())

  def __build__(self, data, features, node):
    '''
    Recursively build the subtree rooted at node from data, splitting only
    on the feature indices in features.  Returns node.
    '''
    if self.__homogeneous__(data):
      # All labels agree: make a pure leaf.
      node.class_counts = { data[0][1]: len(data) }
      node.label = data[0][1]
      return node
    if not features:
      # No features left to split on: majority-vote leaf.
      return self.__vote__(data, node)
    # Compute each feature's gain once (the original recomputed the
    # winner's gain a second time just to test it against zero).
    gains = { idx: self.__gain__(data, idx) for idx in features }
    best = max(gains, key = gains.get)
    if gains[best] == 0:
      # Splitting cannot separate the labels any further.
      return self.__vote__(data, node)
    node.split_feature = best
    for subset in self.__split__(data, best):
      child = DecisionNode(parent = node)
      child.split_value = subset[0][0][best]
      node.children.append(child)
      self.__build__(subset, features - { best }, child)
    return node

  def __distribution__(self, data):
    '''
    Return the empirical probability of each distinct label in data as a
    list of floats summing to 1.  The order of the entries is unspecified
    (entropy does not depend on it).
    '''
    total = float(len(data))
    counts = {}
    # Single O(n) counting pass instead of list.count per unique label.
    for _, label in data:
      counts[label] = counts.get(label, 0) + 1
    return [ count / total for count in counts.values() ]

  def __entropy__(self, dist):
    '''
    Shannon entropy (in bits) of the probability distribution dist.
    dist must not contain zero probabilities (log(0) is undefined);
    __distribution__ never produces them.
    '''
    import math  # local import keeps this snippet self-contained
    return -sum(p * math.log(p, 2) for p in dist)

  def __gain__(self, data, idx):
    '''
    Information gain of splitting data on the feature at index idx:

        H(data) - sum_v (|subset_v| / |data|) * H(subset_v)

    Fix: each subset's entropy is weighted by its relative size, as the
    ID3 gain formula requires.  The original subtracted the raw subset
    entropies, which can drive the gain negative and select bad splits.
    '''
    total = float(len(data))
    gain = self.__entropy__(self.__distribution__(data))
    for subset in self.__split__(data, idx):
      weight = len(subset) / total
      gain -= weight * self.__entropy__(self.__distribution__(subset))
    return gain

  def __homogeneous__(self, data):
    '''
    Return True if every example in data carries the same label (or data
    is empty), and False otherwise.
    '''
    return len({ label for _, label in data }) <= 1

  def __split__(self, data, idx):
    '''
    Yield, for each distinct value of the feature at index idx, the subset
    of data whose points take that value.  Subsets are non-empty and
    partition data.
    '''
    values = { point[idx] for point, _ in data }
    for value in values:
      yield [ (point, label) for point, label in data
              if point[idx] == value ]

  def __vote__(self, data, node):
    '''
    Label node with the majority class in data, store the per-class counts
    on node.class_counts, and return node.  Ties are broken arbitrarily.
    '''
    counts = {}
    for _, label in data:
      counts[label] = counts.get(label, 0) + 1
    node.class_counts = counts
    node.label = max(counts, key = counts.get)
    return node

  def predict(self, data):
    '''
    Classify data (a feature vector) by walking the tree from the root.
    Returns the leaf's label, or None (after printing a diagnostic) when
    the point carries a feature value never seen during training.

    Fix: Python 3 print function instead of Python 2 print statements, and
    the "no matching child" case is handled explicitly instead of via a
    broad except clause that swallowed every error in the recursion.
    '''
    def walk(node, point):
      if not node.children:
        return node.label
      value = point[node.split_feature]
      for child in node.children:
        if child.split_value == value:
          return walk(child, point)
      # No branch for this feature value (unseen during training).
      print('Error: ', point)
      return None
    return walk(self.root, data)

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).