Manual Regression Tree using Python

Question 1

I wrote a code to create a regression tree for a synthetic train data of size Np. The idea is, first I have the source node (which consists of all set of points) represented as a dictionary {'points':..., 'avg':..., 'left_node':..., 'right_node', 'split_point': }. The left and right nodes are the leafs after the splitting process of the whole data (source). split_point is for information about the best split. Then I loop to get deeper tree with maximum number of nodes specified before, also I set that a node must have more than 5 points in order it can be split.

This way, If I want to predict a point (x',y'), I can just start from source node source and check which region the point lies (left_node or right_node), ..and then continuing down the tree. Because all left_nodes and right_nodes values have the same structure as source....

Also, the form function is used to find the best split, the best split is the one with the smallest form(reg_1, avg1, reg_2, avg2). This is a greedy algorithm to find the best split.

I would like to know better ways to perform it..without external modules. But this is intended to be taught to high school students.

Full code:

import random
import matplotlib.pyplot as plt
def form(region_1, av1, region_2, av2):
 return sum([(i[1]-av1)**2 for i in region_1]) \
 + sum([(i[1]-av2)**2 for i in region_2])
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
value = [abs(random.gauss(4, 0.5)) for i in range(Np)]
data = [((i,j), k) for i,j,k in zip(x_data, y_data, value)]
fig, ax = plt.subplots()
ax.plot(x_data, y_data, 'o')
fig.show()
###### Splitting from the source node (all data)
source = {'points': data, 'avg': sum([i[1] for i in data])/Np, \
 'split_point': None, 'left_node': None, 'right_node': None}
forms = []
for x in x_data:
 var = x
 region_1 = [j for j in data if j[0][0] <= var]
 region_2 = [j for j in data if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('x', var), f) )
for y in y_data:
 var = y
 region_1 = [j for j in data if j[0][1] <= var]
 region_2 = [j for j in data if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('y', var), f) )
sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
source['split_point'] = best_split[2]
source['left_node'] = best_split[0]
source['right_node'] = best_split[1]
##### Splitting from the 2 leafs and so on..
leafs = [source['left_node'], source['right_node']]
all_nodes = [leafs[0], leafs[1]]
max_nodes = 1000
while len(all_nodes) <= max_nodes:
 next_leafs = []
 for leaf in leafs:
 if (len(leaf['points']) > 5):
 xx = [i[0][0] for i in leaf['points']]
 yy = [i[0][1] for i in leaf['points']]
 rr = [i[1] for i in leaf['points']]
 vv = [((i,j), k) for i,j,k in zip(xx, yy, rr)]
 forms = []
 for x in xx:
 var = x
 region_1 = [j for j in vv if j[0][0] <= var]
 region_2 = [j for j in vv if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('x', var), f) )
 for y in yy:
 var = y
 region_1 = [j for j in vv if j[0][1] <= var]
 region_2 = [j for j in vv if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('y', var), f) )
 sorted_f = sorted(forms, key = lambda x: x[3])
 best_split = sorted_f[0]
 leaf['split_point'] = best_split[2]
 leaf['left_node'] = best_split[0]
 leaf['right_node'] = best_split[1]
 print(leaf['split_point'])
 next_leafs.append(leaf['left_node'])
 next_leafs.append(leaf['right_node'])
 print("\n")
 leafs = next_leafs
 all_nodes.extend(leafs)
 if len(leafs) == 0:
 break

Question 2

Here is my updated version, It looks simpler and better by creating a class, Node.

import random
def formula(region_1, av1, region_2, av2):
 return sum([(i[1]-av1)**2 for i in region_1]) \
 + sum([(i[1]-av2)**2 for i in region_2])
def average(data):
 return sum([d[2] for d in data])/len(data)
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
z_data = [abs(random.gauss(4, 0.5)) for i in range(Np)]
class Node:
 def __init__(self, x_data, y_data, z_data):
 self.x_data = x_data
 self.y_data = y_data
 self.z_data = z_data
 self.points = [(i, j, k) for i, j, k in zip(x_data, y_data, z_data)]
 self.avg = average(self.points)
 def split(self):
 #Finding the best split:
 candidates = []
 for x in self.x_data:
 split_point = x
 region_1 = [i for i in self.points if i[0] <= split_point]
 region_2 = [i for i in self.points if i not in region_1]
 if (region_1 != []) and (region_2 != []):
 leaf_1 = Node([i[0] for i in region_1], \
 [i[1] for i in region_1], \
 [i[2] for i in region_1])
 leaf_2 = Node([i[0] for i in region_2], \
 [i[1] for i in region_2], \
 [i[2] for i in region_2])
 f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
 candidates.append( (leaf_1, leaf_2, ('x', split_point), f) )
 for y in self.y_data:
 split_point = y
 region_1 = [i for i in self.points if i[1] <= split_point]
 region_2 = [i for i in self.points if i not in region_1]
 if (region_1 != []) and (region_2 != []):
 leaf_1 = Node([i[0] for i in region_1], \
 [i[1] for i in region_1], \
 [i[2] for i in region_1])
 leaf_2 = Node([i[0] for i in region_2], \
 [i[1] for i in region_2], \
 [i[2] for i in region_2])
 f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
 candidates.append( (leaf_1, leaf_2, ('y', split_point), f) )
 sorted_f = sorted(candidates, key = lambda x: x[3])
 best_split = sorted_f[0]
 #The result:
 self.split_point = best_split[2]
 self.left_node = best_split[0]
 self.right_node = best_split[1]
#Source node and 1st split
source = Node(x_data, y_data, z_data)
source.split()
#Generate Binary Tree
result_nodes = [source.left_node, source.right_node]
all_nodes = [source.left_node, source.right_node]
min_nodes = 1000
min_points = 5
while len(all_nodes) <= min_nodes:
 next_nodes = []
 for node in result_nodes:
 if (len(node.points) > min_points):
 node.split()
 next_nodes.append(node.left_node)
 next_nodes.append(node.right_node)
 result_nodes = next_nodes
 all_nodes.extend(result_nodes)
 if len(result_nodes) == 0:
 break

Redsbefall Redsbefall 1,2122 gold badges11 silver badges23 bronze badges · Answer 1 · 2019-07-03 04:42:41Z

Here is my updated version, It looks simpler and better by creating a class, Node.

import random
def formula(region_1, av1, region_2, av2):
 return sum([(i[1]-av1)**2 for i in region_1]) \
 + sum([(i[1]-av2)**2 for i in region_2])
def average(data):
 return sum([d[2] for d in data])/len(data)
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
z_data = [abs(random.gauss(4, 0.5)) for i in range(Np)]
class Node:
 def __init__(self, x_data, y_data, z_data):
 self.x_data = x_data
 self.y_data = y_data
 self.z_data = z_data
 self.points = [(i, j, k) for i, j, k in zip(x_data, y_data, z_data)]
 self.avg = average(self.points)
 def split(self):
 #Finding the best split:
 candidates = []
 for x in self.x_data:
 split_point = x
 region_1 = [i for i in self.points if i[0] <= split_point]
 region_2 = [i for i in self.points if i not in region_1]
 if (region_1 != []) and (region_2 != []):
 leaf_1 = Node([i[0] for i in region_1], \
 [i[1] for i in region_1], \
 [i[2] for i in region_1])
 leaf_2 = Node([i[0] for i in region_2], \
 [i[1] for i in region_2], \
 [i[2] for i in region_2])
 f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
 candidates.append( (leaf_1, leaf_2, ('x', split_point), f) )
 for y in self.y_data:
 split_point = y
 region_1 = [i for i in self.points if i[1] <= split_point]
 region_2 = [i for i in self.points if i not in region_1]
 if (region_1 != []) and (region_2 != []):
 leaf_1 = Node([i[0] for i in region_1], \
 [i[1] for i in region_1], \
 [i[2] for i in region_1])
 leaf_2 = Node([i[0] for i in region_2], \
 [i[1] for i in region_2], \
 [i[2] for i in region_2])
 f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
 candidates.append( (leaf_1, leaf_2, ('y', split_point), f) )
 sorted_f = sorted(candidates, key = lambda x: x[3])
 best_split = sorted_f[0]
 #The result:
 self.split_point = best_split[2]
 self.left_node = best_split[0]
 self.right_node = best_split[1]
#Source node and 1st split
source = Node(x_data, y_data, z_data)
source.split()
#Generate Binary Tree
result_nodes = [source.left_node, source.right_node]
all_nodes = [source.left_node, source.right_node]
min_nodes = 1000
min_points = 5
while len(all_nodes) <= min_nodes:
 next_nodes = []
 for node in result_nodes:
 if (len(node.points) > min_points):
 node.split()
 next_nodes.append(node.left_node)
 next_nodes.append(node.right_node)
 result_nodes = next_nodes
 all_nodes.extend(result_nodes)
 if len(result_nodes) == 0:
 break

Stack Exchange Network

Manual Regression Tree using Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Manual Regression Tree using Python

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions