I've decided to implement the ID3 Decision Tree algorithm in Python based on what I've learned from George F. Luger's textbook on AI (and other secondary readings). As far as I know, the code is working correctly:
import math
from random import choice
from collections import defaultdict, namedtuple
class TreeNode:
"""Used to implement the tree structure."""
def __init__(self, type = None, value = None):
#type may be "property" or "leaf".
self.type = type
#The edges of the tree are represented as dict keys
#and the nodes as the dict values.
self.branches = dict()
#This variable contains the name of the property represented by the node
#or the label, if it's a leaf node.
self.value = value
class DecisionTree:
"""This class implements a simple Decision Tree for classification tasks.
After providing a dataset, the train method should be used to create the tree.
After that, the classify method may be used to classify a new example
by traversing the tree."""
def __init__(self, dataset):
"""The dataset must be a sequence of named tuples representing each
sample of the training set, which must contain the values for each attribute
and also their label. Attributes might have any name but the samples must
have a 'label' attribute."""
self.dataset = dataset
self.tree = None
self.labels = None
def train(self):
"""Create the decision tree based on the dataset and stores it in self.tree"""
if len(self.dataset) == 0:
print("The dataset was not provided.")
return False
#Find out the list of properties (assuming all examples have the same structure)
properties = list(dataset[0]._fields)
properties.remove("label")
#Find out the possible labels for each sample.
self.labels = set([getattr(x, "label") for x in self.dataset])
self.tree = self._induce_tree(self.dataset, properties)
def classify(self, example):
"""Return a string (or a list of strings) containing the label
(or possible labels) of the given example."""
tree = self.tree
while tree.type != "leaf":
tree = tree.branches[getattr(example, tree.value)]
return tree.value
def _calc_entropy(self, probabilities):
"""Return the entropy of a given random variable given the
probabilities (in the form of a list/tuple) of each of its possible outcomes"""
entropy = 0
for p in probabilities:
entropy -= 0 if p == 0 else p * math.log(p, 2)
return entropy
def _calc_expected_entropy(self, partition):
"""Return the expected information needed to complete the tree"""
#Get the total number of elements (population)
total_pop = 0
for p in partition.values():
total_pop += len(p)
expectation = 0
for p in partition.values():
p_pop = len(p)
p_labels = [getattr(s, "label") for s in p]
            #Get the probability of each label occurrence in the given
#partition: No. of elements with the label divided by
#the total no. of elements of the partitions.
probs = []
for label in self.labels:
probs.append(p_labels.count(label)/p_pop)
#Expected information needed =
#Sigma |Partition_i| / Total population * Entropy of Partition_i
#for i from 1 to n (no. of partitions).
expectation += (p_pop/total_pop) * self._calc_entropy(probs)
return expectation
def _choose_property(self, example_set, properties):
rank = list()
for p in properties:
#Partition the example set in groups of samples that
#share the same value for the property p.
partition = defaultdict(lambda: list())
for s in example_set:
partition[getattr(s, p)].append(s)
rank.append((p,
self._calc_expected_entropy(partition),
partition))
#Sort by expected entropy in increasing order.
rank.sort(key = lambda x: x[1])
#Since rank is a list of tuples containing the property,
#the expected entropy and partition for a subtree with that property
#rank[0][0] in the sorted rank will be the property
#with the least expected entropy (or with the most significant
#information gain) and rank[0][2] will be the partition of
#the example set for that property.
return (rank[0][0], rank[0][2])
def _induce_tree(self, example_set, properties):
"""Recursive algorithm for inducing a decision tree based in ID3
algorithm as explained by George F. Luger's textbook (2009).
example_set: a list named tuples containing the samples
properties: a list of strings containing the remaining properties
to be chosen in the tree induction"""
#If all samples belong to the same class, then
#produce a leaf node for that class (label).
labels = set([getattr(s, "label") for s in example_set])
if len(labels) == 1:
return TreeNode(type = "leaf", value = list(labels)[0])
#Else if there are no more properties to evaluate
#but there are still examples with different labels
#pick one at random and produce a leaf node with it.
if not properties:
            return TreeNode(type = "leaf", value = getattr(choice(example_set), "label"))
        #Else, choose a property and make it the root of a new subtree.
p, partition = self._choose_property(example_set, properties)
new_root = TreeNode(type = "property", value = p)
new_properties = list(properties)
new_properties.remove(p)
#Create a new branch in the subtree for each partition.
for value, block in partition.items():
new_root.branches[value] = self._induce_tree(block, new_properties)
return new_root
#This is just auxiliary code...
def generate_pydot_graph(graph, node, edge_name = "none"):
"""A recursive function to convert the generated tree
into a pydot graph"""
node_name = edge_name + '-' + node.value
if node.type == "leaf":
shape = "box"
else:
shape = "ellipse"
graph.add_node(pydot.Node(node_name, label = node.value, shape = shape))
if node.type == "property":
for edge, sibling in node.branches.items():
sibling_name = edge+'-'+sibling.value
graph.add_edge(pydot.Edge(node_name, sibling_name, label = edge))
generate_pydot_graph(graph, sibling, edge)
if __name__ == "__main__":
Sample = namedtuple("Sample", ['label', 'credit_history', 'debit', 'collateral', 'income'])
dataset = list()
dataset.append(Sample('high', 'bad', 'high', 'none', '0-15k'))
dataset.append(Sample('high', 'unknown', 'high', 'none', '15-35k'))
dataset.append(Sample('moderate', 'unknown', 'low', 'none', '15-35k'))
dataset.append(Sample('high', 'unknown', 'low', 'none', '0-15k'))
dataset.append(Sample('low', 'unknown', 'low', 'none', '35k+'))
dataset.append(Sample('low', 'unknown', 'low', 'adequate', '35k+'))
dataset.append(Sample('high', 'bad', 'low', 'none', '0-15k'))
dataset.append(Sample('moderate', 'bad', 'low', 'adequate', '35k+'))
dataset.append(Sample('low', 'good', 'low', 'none', '35k+'))
dataset.append(Sample('low', 'good', 'high', 'adequate', '35k+'))
dataset.append(Sample('high', 'good', 'high', 'none', '0-15k'))
dataset.append(Sample('moderate', 'good', 'high', 'none', '15-35k'))
dataset.append(Sample('low', 'good', 'high', 'none', '35k+'))
dataset.append(Sample('high', 'bad', 'high', 'none', '15-35k'))
decision_tree = DecisionTree(dataset)
decision_tree.train()
#Testing the tree...
inst_fields = list(Sample._fields)
inst_fields.remove("label")
NewInstance = namedtuple("NewInstance", inst_fields)
tests = list()
tests.append(NewInstance("unknown", "low", "none", "15-35k")) #moderate
tests.append(NewInstance("good", "node", "none", "0-15k")) #high
tests.append(NewInstance("unknown", "high", "adequate", "15-35k")) #high
for t in tests:
print(decision_tree.classify(t))
#Generate a PNG image with a graphical representation of the tree
#to compare with the one in Luger's book :)
import pydot
graph = pydot.Dot("decision_tree", graph_type = "graph")
generate_pydot_graph(graph, decision_tree.tree)
graph.write_png("output.png")
I'm particularly interested in feedback on the choices I've made concerning the data structures used (sequences of namedtuples for the dataset, dicts for partitions and so on). I'd also like to know if I commented and documented the code properly and if there are "more pythonic" ways of doing things.
My main goal was to implement the algorithm in a way that would be easy to read and understand, so performance was not a primary concern of mine. Nonetheless, I'd appreciate tips on that too. Thank you!
1 Answer
You need to type-hint your code. Some of this will be made difficult by the dynamic nature of your field lookups, but you can get 90% of the way there.
property and type are poor variable name choices because they shadow built-ins.
Don't default type and value to None; you always know what these are going to be, so there's no point in leaking nullity into those variables.
dict() can just be the literal {}.
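Taken together, those points turn TreeNode into something like this (essentially what the full rewrite below does):

from typing import Literal

class TreeNode:
    def __init__(self, type: Literal['property', 'leaf'], value: str) -> None:
        self.type = type
        self.value = value
        # Edges are dict keys, child nodes are dict values.
        self.branches: dict[str, TreeNode] = {}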
DecisionTree is troubled by the separation of train from its constructor. There's basically never utility in separating these two methods, since it's expected that train() will always be called after construction, and in the interim the object is in an invalid state. Combining them reveals that the class doesn't actually need to hold onto a dataset at all.
Don't return False if the dataset is missing; raise an exception.
Your self.labels = set([...]) is better represented by a set comprehension.
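Sketching just the constructor, the last three points combine into something like this (the full, type-annotated version is in the suggested code below):

class DecisionTree:
    def __init__(self, dataset):
        if len(dataset) == 0:
            raise ValueError("The dataset was not provided.")
        # All fields except 'label' are candidate properties for splitting.
        properties = [f for f in dataset[0]._fields if f != 'label']
        # Set comprehension instead of set([...]).
        self.labels = {sample.label for sample in dataset}
        # Build the tree immediately; the raw dataset never needs to be stored.
        self.tree = self._induce_tree(dataset, properties)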
_calc_entropy can convert its loop into a sum() over a generator. Also, don't unconditionally subtract a term that might be 0; subtract conditionally instead.
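For example (math is already imported at the top of the module), the method collapses to:

    def _calc_entropy(self, probabilities):
        # Zero-probability outcomes contribute nothing, so skip them rather
        # than subtracting a conditional 0.
        return -sum(p * math.log(p, 2) for p in probabilities if p != 0)

math.log2(p) is equivalent here and reads slightly better.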
Don't getattr(s, "label")
; just write s.label
.
Don't write defaultdict(lambda: list()); list itself is callable and does not need a lambda.
(rank[0][0], rank[0][2]) will be made more clear by tuple-unpacking rank[0].
Avoid declaring your Sample and NewInstance in the local namespace; declare them globally with explicit types.
Don't import pydot locally; do this at the top.
Your __main__ guard is not enough. All of those symbols are still global, and need to be moved into a function. When you do that, a bug is revealed: train() referred to the global dataset when it should have used self.dataset.
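Concretely, this line in train() only works because a module-level dataset happens to exist:

        properties = list(dataset[0]._fields)

and was presumably meant to be:

        properties = list(self.dataset[0]._fields)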
Don't sort followed by [0]; this is equivalent to min().
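Combined with the tuple-unpacking mentioned above, the end of _choose_property becomes, roughly:

        # min() with a key picks the lowest-entropy entry directly,
        # without reordering the whole list.
        prop, _entropy, partition = min(rank, key=lambda x: x[1])
        return prop, partition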
Suggested
import math
from collections import defaultdict
from random import choice
from typing import Collection, Iterable, Literal, NamedTuple, Sequence
import pydot
class Sample(NamedTuple):
label: str
credit_history: str
debit: str
collateral: str
income: str
class NewInstance(NamedTuple):
credit_history: str
debit: str
collateral: str
income: str
class TreeNode:
def __init__(self, type: Literal['property', 'leaf'], value: str) -> None:
self.type = type
# The edges of the tree are represented as dict keys
# and the nodes as the dict values.
self.branches: dict[str, TreeNode] = {}
# This variable contains the name of the property represented by the node
# or the label, if it's a leaf node.
self.value = value
class DecisionTree:
"""This class implements a simple Decision Tree for classification tasks.
After providing a dataset, the train method should be used to create the tree.
After that, the classify method may be used to classify a new example
by traversing the tree."""
def __init__(self, dataset: Sequence[Sample]) -> None:
"""The dataset must be a sequence of named tuples representing each
sample of the training set, which must contain the values for each attribute
and also their label. Attributes might have any name but the samples must
have a 'label' attribute."""
if len(dataset) == 0:
raise ValueError("The dataset was not provided.")
# Find out the list of properties (assuming all examples have the same structure)
properties = [f for f in dataset[0]._fields if f != 'label']
# Find out the possible labels for each sample.
self.labels = {sample.label for sample in dataset}
        # Create the decision tree based on the dataset and store it in self.tree
self.tree = self._induce_tree(dataset, properties)
def classify(self, example: NewInstance) -> str:
"""Return a string (or a list of strings) containing the label
(or possible labels) of the given example."""
tree = self.tree
while tree.type != "leaf":
tree = tree.branches[getattr(example, tree.value)]
return tree.value
@staticmethod
def _calc_entropy(probabilities: Iterable[float]) -> float:
"""Return the entropy of a given random variable given the
probabilities (in the form of a list/tuple) of each of its possible outcomes"""
entropy = -sum(
p * math.log(p, 2)
for p in probabilities
if p != 0
)
return entropy
def _calc_expected_entropy(self, partition: Collection[Collection[Sample]]) -> float:
"""Return the expected information needed to complete the tree"""
# Get the total number of elements (population)
total_pop = sum(len(p) for p in partition)
expectation = 0
for p in partition:
p_pop = len(p)
p_labels = [s.label for s in p]
# Get the probability of each label occurrence in the given
# partition: No. of elements with the label divided by
# the total no. of elements of the partitions.
probs = [
p_labels.count(label) / p_pop
for label in self.labels
]
# Expected information needed =
# Sigma |Partition_i| / Total population * Entropy of Partition_i
# for i from 1 to n (no. of partitions).
expectation += p_pop / total_pop * self._calc_entropy(probs)
return expectation
def _choose_property(
self,
example_set: Collection[Sample],
properties: Iterable[str],
) -> tuple[
str, # property
dict[str, list[Sample]], # partition
]:
rank = []
for p in properties:
# Partition the example set in groups of samples that
# share the same value for the property p.
partition = defaultdict(list)
for s in example_set:
partition[getattr(s, p)].append(s)
rank.append((p,
self._calc_expected_entropy(partition.values()),
partition))
        # Each rank entry is a tuple of the property, its expected entropy,
        # and the corresponding partition of the example set. Picking the
        # entry with the lowest expected entropy (i.e. the largest
        # information gain) is a job for min(), not sort()[0].
        prop, _entropy, partition = min(rank, key=lambda x: x[1])
        return prop, partition
def _induce_tree(
self,
example_set: Sequence[Sample],
properties: Collection[str],
) -> TreeNode:
"""Recursive algorithm for inducing a decision tree based in ID3
algorithm as explained by George F. Luger's textbook (2009).
example_set: a list named tuples containing the samples
properties: a list of strings containing the remaining properties
to be chosen in the tree induction"""
# If all samples belong to the same class, then
# produce a leaf node for that class (label).
labels = {s.label for s in example_set}
if len(labels) == 1:
only_label, = labels
return TreeNode(type="leaf", value=only_label)
# Else if there are no more properties to evaluate
# but there are still examples with different labels
# pick one at random and produce a leaf node with it.
if not properties:
return TreeNode(type="leaf", value=choice(example_set).label)
        # Else, choose a property and make it the root of a new subtree.
p, partition = self._choose_property(example_set, properties)
new_root = TreeNode(type="property", value=p)
new_properties = [prop for prop in properties if prop != p]
# Create a new branch in the subtree for each partition.
for value, block in partition.items():
new_root.branches[value] = self._induce_tree(block, new_properties)
return new_root
def generate_pydot_graph(graph: pydot.Dot, node: TreeNode, edge_name: str = "none") -> None:
"""A recursive function to convert the generated tree
into a pydot graph"""
node_name = edge_name + '-' + node.value
if node.type == "leaf":
shape = "box"
else:
shape = "ellipse"
graph.add_node(pydot.Node(node_name, label=node.value, shape=shape))
if node.type == "property":
for edge, sibling in node.branches.items():
sibling_name = edge + '-' + sibling.value
graph.add_edge(pydot.Edge(node_name, sibling_name, label=edge))
generate_pydot_graph(graph, sibling, edge)
def main() -> None:
dataset = (
Sample( 'high', 'bad', 'high', 'none', '0-15k'),
Sample( 'high', 'unknown', 'high', 'none', '15-35k'),
Sample('moderate', 'unknown', 'low', 'none', '15-35k'),
Sample( 'high', 'unknown', 'low', 'none', '0-15k'),
Sample( 'low', 'unknown', 'low', 'none', '35k+'),
Sample( 'low', 'unknown', 'low', 'adequate', '35k+'),
Sample( 'high', 'bad', 'low', 'none', '0-15k'),
Sample('moderate', 'bad', 'low', 'adequate', '35k+'),
Sample( 'low', 'good', 'low', 'none', '35k+'),
Sample( 'low', 'good', 'high', 'adequate', '35k+'),
Sample( 'high', 'good', 'high', 'none', '0-15k'),
Sample('moderate', 'good', 'high', 'none', '15-35k'),
Sample( 'low', 'good', 'high', 'none', '35k+'),
Sample( 'high', 'bad', 'high', 'none', '15-35k'),
)
decision_tree = DecisionTree(dataset)
tests = (
NewInstance("unknown", "low", "none", "15-35k"), # moderate
NewInstance( "good", "node", "none", "0-15k"), # high
NewInstance("unknown", "high", "adequate", "15-35k"), # high
)
for t in tests:
print(decision_tree.classify(t))
# Generate a PNG image with a graphical representation of the tree
# to compare with the one in Luger's book :)
graph = pydot.Dot("decision_tree", graph_type="graph")
generate_pydot_graph(graph, decision_tree.tree)
graph.write_png("output.png")
if __name__ == "__main__":
main()