
I've decided to implement the ID3 Decision Tree algorithm in Python based on what I've learned from George F. Luger's textbook on AI (and other secondary readings). As far as I know, the code is working correctly:

import math
from random import choice
from collections import defaultdict, namedtuple


class TreeNode:
    """Used to implement the tree structure."""
    def __init__(self, type = None, value = None):
        #type may be "property" or "leaf".
        self.type = type

        #The edges of the tree are represented as dict keys
        #and the nodes as the dict values.
        self.branches = dict()

        #This variable contains the name of the property represented by the node
        #or the label, if it's a leaf node.
        self.value = value


class DecisionTree:
    """This class implements a simple Decision Tree for classification tasks.
    After providing a dataset, the train method should be used to create the tree.
    After that, the classify method may be used to classify a new example
    by traversing the tree."""

    def __init__(self, dataset):
        """The dataset must be a sequence of named tuples representing each
        sample of the training set, which must contain the values for each attribute
        and also their label. Attributes might have any name but the samples must
        have a 'label' attribute."""

        self.dataset = dataset
        self.tree = None
        self.labels = None

    def train(self):
        """Create the decision tree based on the dataset and stores it in self.tree"""
        if len(self.dataset) == 0:
            print("The dataset was not provided.")
            return False

        #Find out the list of properties (assuming all examples have the same structure)
        properties = list(dataset[0]._fields)
        properties.remove("label")

        #Find out the possible labels for each sample.
        self.labels = set([getattr(x, "label") for x in self.dataset])
        self.tree = self._induce_tree(self.dataset, properties)

    def classify(self, example):
        """Return a string (or a list of strings) containing the label
        (or possible labels) of the given example."""
        tree = self.tree
        while tree.type != "leaf":
            tree = tree.branches[getattr(example, tree.value)]
        return tree.value

    def _calc_entropy(self, probabilities):
        """Return the entropy of a given random variable given the
        probabilities (in the form of a list/tuple) of each of its possible outcomes"""

        entropy = 0
        for p in probabilities:
            entropy -= 0 if p == 0 else p * math.log(p, 2)
        return entropy

    def _calc_expected_entropy(self, partition):
        """Return the expected information needed to complete the tree"""

        #Get the total number of elements (population)
        total_pop = 0
        for p in partition.values():
            total_pop += len(p)

        expectation = 0
        for p in partition.values():
            p_pop = len(p)
            p_labels = [getattr(s, "label") for s in p]

            #Get the probability of each label ocurrency in the given
            #partition: No. of elements with the label divided by
            #the total no. of elements of the partitions.
            probs = []
            for label in self.labels:
                probs.append(p_labels.count(label)/p_pop)

            #Expected information needed =
            #Sigma |Partition_i| / Total population * Entropy of Partition_i
            #for i from 1 to n (no. of partitions).
            expectation += (p_pop/total_pop) * self._calc_entropy(probs)
        return expectation

    def _choose_property(self, example_set, properties):
        rank = list()
        for p in properties:
            #Partition the example set in groups of samples that
            #share the same value for the property p.
            partition = defaultdict(lambda: list())
            for s in example_set:
                partition[getattr(s, p)].append(s)
            rank.append((p,
                         self._calc_expected_entropy(partition),
                         partition))

        #Sort by expected entropy in increasing order.
        rank.sort(key = lambda x: x[1])

        #Since rank is a list of tuples containing the property,
        #the expected entropy and partition for a subtree with that property
        #rank[0][0] in the sorted rank will be the property
        #with the least expected entropy (or with the most significant
        #information gain) and rank[0][2] will be the partition of
        #the example set for that property.
        return (rank[0][0], rank[0][2])

    def _induce_tree(self, example_set, properties):
        """Recursive algorithm for inducing a decision tree based in ID3
        algorithm as explained by George F. Luger's textbook (2009).

        example_set: a list named tuples containing the samples
        properties: a list of strings containing the remaining properties
        to be chosen in the tree induction"""

        #If all samples belong to the same class, then
        #produce a leaf node for that class (label).
        labels = set([getattr(s, "label") for s in example_set])
        if len(labels) == 1:
            return TreeNode(type = "leaf", value = list(labels)[0])

        #Else if there are no more properties to evaluate
        #but there are still examples with different labels
        #pick one at random and produce a leaf node with it.
        if not properties:
            return TreeNode(type = "leaf", value = choice(example_set).getatrr("label"))

        #Else, chose a property and make it the root of a new subtree.
        p, partition = self._choose_property(example_set, properties)
        new_root = TreeNode(type = "property", value = p)
        new_properties = list(properties)
        new_properties.remove(p)

        #Create a new branch in the subtree for each partition.
        for value, block in partition.items():
            new_root.branches[value] = self._induce_tree(block, new_properties)
        return new_root


#This is just auxiliary code...
def generate_pydot_graph(graph, node, edge_name = "none"):
    """A recursive function to convert the generated tree
    into a pydot graph"""
    node_name = edge_name + '-' + node.value
    if node.type == "leaf":
        shape = "box"
    else:
        shape = "ellipse"
    graph.add_node(pydot.Node(node_name, label = node.value, shape = shape))
    if node.type == "property":
        for edge, sibling in node.branches.items():
            sibling_name = edge+'-'+sibling.value
            graph.add_edge(pydot.Edge(node_name, sibling_name, label = edge))
            generate_pydot_graph(graph, sibling, edge)


if __name__ == "__main__":
    Sample = namedtuple("Sample", ['label', 'credit_history', 'debit', 'collateral', 'income'])

    dataset = list()
    dataset.append(Sample('high', 'bad', 'high', 'none', '0-15k'))
    dataset.append(Sample('high', 'unknown', 'high', 'none', '15-35k'))
    dataset.append(Sample('moderate', 'unknown', 'low', 'none', '15-35k'))
    dataset.append(Sample('high', 'unknown', 'low', 'none', '0-15k'))
    dataset.append(Sample('low', 'unknown', 'low', 'none', '35k+'))
    dataset.append(Sample('low', 'unknown', 'low', 'adequate', '35k+'))
    dataset.append(Sample('high', 'bad', 'low', 'none', '0-15k'))
    dataset.append(Sample('moderate', 'bad', 'low', 'adequate', '35k+'))
    dataset.append(Sample('low', 'good', 'low', 'none', '35k+'))
    dataset.append(Sample('low', 'good', 'high', 'adequate', '35k+'))
    dataset.append(Sample('high', 'good', 'high', 'none', '0-15k'))
    dataset.append(Sample('moderate', 'good', 'high', 'none', '15-35k'))
    dataset.append(Sample('low', 'good', 'high', 'none', '35k+'))
    dataset.append(Sample('high', 'bad', 'high', 'none', '15-35k'))

    decision_tree = DecisionTree(dataset)
    decision_tree.train()

    #Testing the tree...
    inst_fields = list(Sample._fields)
    inst_fields.remove("label")
    NewInstance = namedtuple("NewInstance", inst_fields)

    tests = list()
    tests.append(NewInstance("unknown", "low", "none", "15-35k")) #moderate
    tests.append(NewInstance("good", "node", "none", "0-15k")) #high
    tests.append(NewInstance("unknown", "high", "adequate", "15-35k")) #high

    for t in tests:
        print(decision_tree.classify(t))

    #Generate a PNG image with a graphical representation of the tree
    #to compare with the one in Luger's book :)
    import pydot
    graph = pydot.Dot("decision_tree", graph_type = "graph")
    generate_pydot_graph(graph, decision_tree.tree)
    graph.write_png("output.png")
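
For reviewers who don't have Luger's book at hand, here is my understanding of the two quantities the helper methods compute (the standard ID3 definitions): the entropy of the label distribution of a sample set S, and the expected entropy after partitioning S on a property P. The algorithm picks the property that minimises the latter, which is equivalent to maximising information gain.

$$H(S) = -\sum_i p_i \log_2 p_i \qquad\qquad E(S, P) = \sum_{v \in \mathrm{values}(P)} \frac{|S_v|}{|S|}\, H(S_v)$$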

I'm particularly interested in feedback on the choices I've made concerning the data structures used (sequences of namedtuples for the dataset, dicts for partitions and so on). I'd also like to know if I commented and documented the code properly and if there are "more pythonic" ways of doing things.

My main goal was to implement the algorithm in a way that it would be easy to read and understand, so performance was not a primary concern of mine. Nonetheless, I'd appreciate tips on that too. Thank you!

asked May 3, 2022 at 4:07

1 Answer


You need to type-hint your code. Some of this will be made difficult by the dynamic nature of your field lookups, but you can get 90% of the way there.

property and type are poor variable name choices because they shadow built-ins.

Don't default type and value as None; you always know what these are going to be so there's no point in leaking nullity into those variables.
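
For example, one sketch that addresses both points (the rename to kind is illustrative only; the suggested code further down keeps the name type):

from typing import Literal

class TreeNode:
    def __init__(self, kind: Literal["property", "leaf"], value: str) -> None:
        # kind and value are always known at construction time, so no None defaults
        self.kind = kind
        self.value = value
        # edges are dict keys, child nodes are dict values
        self.branches: dict[str, "TreeNode"] = {}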

dict() can just be the literal {}.

DecisionTree is troubled by the separation of train from its constructor. There's basically never utility in separating these two methods, since it's expected that train() will always be called after construction, and in the interim the object is in an invalid state. Combining them reveals that the class doesn't actually need to hold onto a dataset at all.
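
For instance, the constructor can do the work train used to do, so a DecisionTree is usable as soon as it exists (the full version is in the suggested code below):

class DecisionTree:
    def __init__(self, dataset: Sequence[Sample]) -> None:
        # the tree is built immediately, so the object is never half-initialised
        properties = [f for f in dataset[0]._fields if f != "label"]
        self.labels = {sample.label for sample in dataset}
        self.tree = self._induce_tree(dataset, properties)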

Don't return False if the dataset is missing; raise an exception.

Your self.labels = set([ is better represented by a set comprehension.

_calc_entropy can convert its loop into a sum() with a generator. Also, rather than subtracting a literal 0 when p == 0, filter out the zero probabilities and sum only the nonzero terms.
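
For example (this is essentially what the suggested code below does):

@staticmethod
def _calc_entropy(probabilities: Iterable[float]) -> float:
    # p * log2(p) is taken to be 0 when p == 0, so zero terms are simply skipped
    return -sum(p * math.log(p, 2) for p in probabilities if p != 0)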

Don't getattr(s, "label"); just write s.label.

Don't write defaultdict(lambda: list()); list itself is callable and does not need a lambda.

(rank[0][0], rank[0][2]) will be clearer if you tuple-unpack rank[0].

Avoid declaring your Sample and NewInstance in the local namespace; declare them globally with explicit types.

Don't import pydot locally; do this at the top.

Your __main__ guard is not enough. All of those symbols are still global, and need to be moved into a function. When you do that, a bug is revealed: train referenced the global dataset (in list(dataset[0]._fields)) when it should have used self.dataset.

Don't sort followed by [0]; this is equivalent to min().
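
For example, the end of _choose_property reduces to a single min call (variable names here are illustrative):

best_property, _entropy, best_partition = min(rank, key=lambda entry: entry[1])
return best_property, best_partition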

Suggested

import math
from collections import defaultdict
from random import choice
from typing import Collection, Iterable, Literal, NamedTuple, Sequence

import pydot


class Sample(NamedTuple):
    label: str
    credit_history: str
    debit: str
    collateral: str
    income: str


class NewInstance(NamedTuple):
    credit_history: str
    debit: str
    collateral: str
    income: str


class TreeNode:
    def __init__(self, type: Literal['property', 'leaf'], value: str) -> None:
        self.type = type

        # The edges of the tree are represented as dict keys
        # and the nodes as the dict values.
        self.branches: dict[str, TreeNode] = {}

        # This variable contains the name of the property represented by the node
        # or the label, if it's a leaf node.
        self.value = value


class DecisionTree:
    """This class implements a simple Decision Tree for classification tasks.
    After providing a dataset, the train method should be used to create the tree.
    After that, the classify method may be used to classify a new example
    by traversing the tree."""

    def __init__(self, dataset: Sequence[Sample]) -> None:
        """The dataset must be a sequence of named tuples representing each
        sample of the training set, which must contain the values for each attribute
        and also their label. Attributes might have any name but the samples must
        have a 'label' attribute."""
        if len(dataset) == 0:
            raise ValueError("The dataset was not provided.")

        # Find out the list of properties (assuming all examples have the same structure)
        properties = [f for f in dataset[0]._fields if f != 'label']

        # Find out the possible labels for each sample.
        self.labels = {sample.label for sample in dataset}

        # Create the decision tree based on the dataset and stores it in self.tree
        self.tree = self._induce_tree(dataset, properties)

    def classify(self, example: NewInstance) -> str:
        """Return a string (or a list of strings) containing the label
        (or possible labels) of the given example."""
        tree = self.tree
        while tree.type != "leaf":
            tree = tree.branches[getattr(example, tree.value)]
        return tree.value

    @staticmethod
    def _calc_entropy(probabilities: Iterable[float]) -> float:
        """Return the entropy of a given random variable given the
        probabilities (in the form of a list/tuple) of each of its possible outcomes"""
        entropy = -sum(
            p * math.log(p, 2)
            for p in probabilities
            if p != 0
        )
        return entropy

    def _calc_expected_entropy(self, partition: Collection[Collection[Sample]]) -> float:
        """Return the expected information needed to complete the tree"""
        # Get the total number of elements (population)
        total_pop = sum(len(p) for p in partition)

        expectation = 0
        for p in partition:
            p_pop = len(p)
            p_labels = [s.label for s in p]

            # Get the probability of each label occurrence in the given
            # partition: No. of elements with the label divided by
            # the total no. of elements of the partitions.
            probs = [
                p_labels.count(label) / p_pop
                for label in self.labels
            ]

            # Expected information needed =
            # Sigma |Partition_i| / Total population * Entropy of Partition_i
            # for i from 1 to n (no. of partitions).
            expectation += p_pop / total_pop * self._calc_entropy(probs)
        return expectation

    def _choose_property(
        self,
        example_set: Collection[Sample],
        properties: Iterable[str],
    ) -> tuple[
        str,                      # property
        dict[str, list[Sample]],  # partition
    ]:
        rank = []
        for p in properties:
            # Partition the example set in groups of samples that
            # share the same value for the property p.
            partition = defaultdict(list)
            for s in example_set:
                partition[getattr(s, p)].append(s)
            rank.append((p,
                         self._calc_expected_entropy(partition.values()),
                         partition))

        # Sort by expected entropy in increasing order.
        # Since rank is a list of tuples containing the property,
        # the expected entropy and partition for a subtree with that property
        # rank[0][0] in the sorted rank will be the property
        # with the least expected entropy (or with the most significant
        # information gain) and rank[0][2] will be the partition of
        # the example set for that property.
        property, entropy, partition = min(rank, key=lambda x: x[1])
        return property, partition

    def _induce_tree(
        self,
        example_set: Sequence[Sample],
        properties: Collection[str],
    ) -> TreeNode:
        """Recursive algorithm for inducing a decision tree based in ID3
        algorithm as explained by George F. Luger's textbook (2009).

        example_set: a list named tuples containing the samples
        properties: a list of strings containing the remaining properties
        to be chosen in the tree induction"""

        # If all samples belong to the same class, then
        # produce a leaf node for that class (label).
        labels = {s.label for s in example_set}
        if len(labels) == 1:
            only_label, = labels
            return TreeNode(type="leaf", value=only_label)

        # Else if there are no more properties to evaluate
        # but there are still examples with different labels
        # pick one at random and produce a leaf node with it.
        if not properties:
            return TreeNode(type="leaf", value=choice(example_set).label)

        # Else, chose a property and make it the root of a new subtree.
        p, partition = self._choose_property(example_set, properties)
        new_root = TreeNode(type="property", value=p)
        new_properties = [prop for prop in properties if prop != p]

        # Create a new branch in the subtree for each partition.
        for value, block in partition.items():
            new_root.branches[value] = self._induce_tree(block, new_properties)
        return new_root


def generate_pydot_graph(graph: pydot.Dot, node: TreeNode, edge_name: str = "none") -> None:
    """A recursive function to convert the generated tree
    into a pydot graph"""
    node_name = edge_name + '-' + node.value
    if node.type == "leaf":
        shape = "box"
    else:
        shape = "ellipse"
    graph.add_node(pydot.Node(node_name, label=node.value, shape=shape))
    if node.type == "property":
        for edge, sibling in node.branches.items():
            sibling_name = edge + '-' + sibling.value
            graph.add_edge(pydot.Edge(node_name, sibling_name, label=edge))
            generate_pydot_graph(graph, sibling, edge)


def main() -> None:
    dataset = (
        Sample(    'high',     'bad', 'high',     'none',  '0-15k'),
        Sample(    'high', 'unknown', 'high',     'none', '15-35k'),
        Sample('moderate', 'unknown',  'low',     'none', '15-35k'),
        Sample(    'high', 'unknown',  'low',     'none',  '0-15k'),
        Sample(     'low', 'unknown',  'low',     'none',   '35k+'),
        Sample(     'low', 'unknown',  'low', 'adequate',   '35k+'),
        Sample(    'high',     'bad',  'low',     'none',  '0-15k'),
        Sample('moderate',     'bad',  'low', 'adequate',   '35k+'),
        Sample(     'low',    'good',  'low',     'none',   '35k+'),
        Sample(     'low',    'good', 'high', 'adequate',   '35k+'),
        Sample(    'high',    'good', 'high',     'none',  '0-15k'),
        Sample('moderate',    'good', 'high',     'none', '15-35k'),
        Sample(     'low',    'good', 'high',     'none',   '35k+'),
        Sample(    'high',     'bad', 'high',     'none', '15-35k'),
    )
    decision_tree = DecisionTree(dataset)

    tests = (
        NewInstance("unknown",  "low",     "none", "15-35k"),  # moderate
        NewInstance(   "good", "node",     "none",  "0-15k"),  # high
        NewInstance("unknown", "high", "adequate", "15-35k"),  # high
    )
    for t in tests:
        print(decision_tree.classify(t))

    # Generate a PNG image with a graphical representation of the tree
    # to compare with the one in Luger's book :)
    graph = pydot.Dot("decision_tree", graph_type="graph")
    generate_pydot_graph(graph, decision_tree.tree)
    graph.write_png("output.png")


if __name__ == "__main__":
    main()

Output

[image: output graph]

answered May 4, 2022 at 15:43