I wrote a code to create a regression tree for a synthetic train data of size Np
. The idea is, first I have the source node (which consists of all set of points) represented as a dictionary {'points':..., 'avg':..., 'left_node':..., 'right_node', 'split_point': }
. The left and right nodes are the leafs after the splitting process of the whole data (source). split_point
is for information about the best split. Then I loop to get deeper tree with maximum number of nodes specified before, also I set that a node must have more than 5 points in order it can be split.
This way, If I want to predict a point (x',y')
, I can just start from source node source
and check which region the point lies (left_node
or right_node
), ..and then continuing down the tree. Because all left_node
s and right_node
s values have the same structure as source
....
Also, the form
function is used to find the best split, the best split is the one with the smallest form(reg_1, avg1, reg_2, avg2)
. This is a greedy algorithm to find the best split.
I would like to know better ways to perform it..without external modules. But this is intended to be taught to high school students.
Full code:
import random
import matplotlib.pyplot as plt
def form(region_1, av1, region_2, av2):
return sum([(i[1]-av1)**2 for i in region_1]) \
+ sum([(i[1]-av2)**2 for i in region_2])
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
value = [abs(random.gauss(4, 0.5)) for i in range(Np)]
data = [((i,j), k) for i,j,k in zip(x_data, y_data, value)]
fig, ax = plt.subplots()
ax.plot(x_data, y_data, 'o')
fig.show()
###### Splitting from the source node (all data)
source = {'points': data, 'avg': sum([i[1] for i in data])/Np, \
'split_point': None, 'left_node': None, 'right_node': None}
forms = []
for x in x_data:
var = x
region_1 = [j for j in data if j[0][0] <= var]
region_2 = [j for j in data if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('x', var), f) )
for y in y_data:
var = y
region_1 = [j for j in data if j[0][1] <= var]
region_2 = [j for j in data if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('y', var), f) )
sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
source['split_point'] = best_split[2]
source['left_node'] = best_split[0]
source['right_node'] = best_split[1]
##### Splitting from the 2 leafs and so on..
leafs = [source['left_node'], source['right_node']]
all_nodes = [leafs[0], leafs[1]]
max_nodes = 1000
while len(all_nodes) <= max_nodes:
next_leafs = []
for leaf in leafs:
if (len(leaf['points']) > 5):
xx = [i[0][0] for i in leaf['points']]
yy = [i[0][1] for i in leaf['points']]
rr = [i[1] for i in leaf['points']]
vv = [((i,j), k) for i,j,k in zip(xx, yy, rr)]
forms = []
for x in xx:
var = x
region_1 = [j for j in vv if j[0][0] <= var]
region_2 = [j for j in vv if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('x', var), f) )
for y in yy:
var = y
region_1 = [j for j in vv if j[0][1] <= var]
region_2 = [j for j in vv if j not in region_1]
if len(region_1) > 0 and len(region_2) > 0:
av1 = sum([i[1] for i in region_1])/len(region_1)
av2 = sum([i[1] for i in region_2])/len(region_2)
f = form(region_1, av1, region_2, av2)
leaf_1 = {'points': region_1, 'avg': av1}
leaf_2 = {'points': region_2, 'avg': av2}
forms.append( (leaf_1, leaf_2, ('y', var), f) )
sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
leaf['split_point'] = best_split[2]
leaf['left_node'] = best_split[0]
leaf['right_node'] = best_split[1]
print(leaf['split_point'])
next_leafs.append(leaf['left_node'])
next_leafs.append(leaf['right_node'])
print("\n")
leafs = next_leafs
all_nodes.extend(leafs)
if len(leafs) == 0:
break
1 Answer 1
Here is my updated version, It looks simpler and better by creating a class, Node
.
import random
def formula(region_1, av1, region_2, av2):
return sum([(i[1]-av1)**2 for i in region_1]) \
+ sum([(i[1]-av2)**2 for i in region_2])
def average(data):
return sum([d[2] for d in data])/len(data)
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
z_data = [abs(random.gauss(4, 0.5)) for i in range(Np)]
class Node:
def __init__(self, x_data, y_data, z_data):
self.x_data = x_data
self.y_data = y_data
self.z_data = z_data
self.points = [(i, j, k) for i, j, k in zip(x_data, y_data, z_data)]
self.avg = average(self.points)
def split(self):
#Finding the best split:
candidates = []
for x in self.x_data:
split_point = x
region_1 = [i for i in self.points if i[0] <= split_point]
region_2 = [i for i in self.points if i not in region_1]
if (region_1 != []) and (region_2 != []):
leaf_1 = Node([i[0] for i in region_1], \
[i[1] for i in region_1], \
[i[2] for i in region_1])
leaf_2 = Node([i[0] for i in region_2], \
[i[1] for i in region_2], \
[i[2] for i in region_2])
f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
candidates.append( (leaf_1, leaf_2, ('x', split_point), f) )
for y in self.y_data:
split_point = y
region_1 = [i for i in self.points if i[1] <= split_point]
region_2 = [i for i in self.points if i not in region_1]
if (region_1 != []) and (region_2 != []):
leaf_1 = Node([i[0] for i in region_1], \
[i[1] for i in region_1], \
[i[2] for i in region_1])
leaf_2 = Node([i[0] for i in region_2], \
[i[1] for i in region_2], \
[i[2] for i in region_2])
f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
candidates.append( (leaf_1, leaf_2, ('y', split_point), f) )
sorted_f = sorted(candidates, key = lambda x: x[3])
best_split = sorted_f[0]
#The result:
self.split_point = best_split[2]
self.left_node = best_split[0]
self.right_node = best_split[1]
#Source node and 1st split
source = Node(x_data, y_data, z_data)
source.split()
#Generate Binary Tree
result_nodes = [source.left_node, source.right_node]
all_nodes = [source.left_node, source.right_node]
min_nodes = 1000
min_points = 5
while len(all_nodes) <= min_nodes:
next_nodes = []
for node in result_nodes:
if (len(node.points) > min_points):
node.split()
next_nodes.append(node.left_node)
next_nodes.append(node.right_node)
result_nodes = next_nodes
all_nodes.extend(result_nodes)
if len(result_nodes) == 0:
break
Explore related questions
See similar questions with these tags.