2
\$\begingroup\$

I wrote a code to create a regression tree for a synthetic train data of size Np. The idea is, first I have the source node (which consists of all set of points) represented as a dictionary {'points':..., 'avg':..., 'left_node':..., 'right_node', 'split_point': }. The left and right nodes are the leafs after the splitting process of the whole data (source). split_point is for information about the best split. Then I loop to get deeper tree with maximum number of nodes specified before, also I set that a node must have more than 5 points in order it can be split.

This way, If I want to predict a point (x',y'), I can just start from source node source and check which region the point lies (left_node or right_node), ..and then continuing down the tree. Because all left_nodes and right_nodes values have the same structure as source....

Also, the form function is used to find the best split, the best split is the one with the smallest form(reg_1, avg1, reg_2, avg2). This is a greedy algorithm to find the best split.


I would like to know better ways to perform it..without external modules. But this is intended to be taught to high school students.


Full code:

import random
import matplotlib.pyplot as plt
def form(region_1, av1, region_2, av2):
 return sum([(i[1]-av1)**2 for i in region_1]) \
 + sum([(i[1]-av2)**2 for i in region_2])
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
value = [abs(random.gauss(4, 0.5)) for i in range(Np)]
data = [((i,j), k) for i,j,k in zip(x_data, y_data, value)]
fig, ax = plt.subplots()
ax.plot(x_data, y_data, 'o')
fig.show()
###### Splitting from the source node (all data)
source = {'points': data, 'avg': sum([i[1] for i in data])/Np, \
 'split_point': None, 'left_node': None, 'right_node': None}
forms = []
for x in x_data:
 var = x
 region_1 = [j for j in data if j[0][0] <= var]
 region_2 = [j for j in data if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('x', var), f) )
for y in y_data:
 var = y
 region_1 = [j for j in data if j[0][1] <= var]
 region_2 = [j for j in data if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('y', var), f) )
sorted_f = sorted(forms, key = lambda x: x[3])
best_split = sorted_f[0]
source['split_point'] = best_split[2]
source['left_node'] = best_split[0]
source['right_node'] = best_split[1]
##### Splitting from the 2 leafs and so on..
leafs = [source['left_node'], source['right_node']]
all_nodes = [leafs[0], leafs[1]]
max_nodes = 1000
while len(all_nodes) <= max_nodes:
 next_leafs = []
 for leaf in leafs:
 if (len(leaf['points']) > 5):
 xx = [i[0][0] for i in leaf['points']]
 yy = [i[0][1] for i in leaf['points']]
 rr = [i[1] for i in leaf['points']]
 vv = [((i,j), k) for i,j,k in zip(xx, yy, rr)]
 forms = []
 for x in xx:
 var = x
 region_1 = [j for j in vv if j[0][0] <= var]
 region_2 = [j for j in vv if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('x', var), f) )
 for y in yy:
 var = y
 region_1 = [j for j in vv if j[0][1] <= var]
 region_2 = [j for j in vv if j not in region_1]
 if len(region_1) > 0 and len(region_2) > 0:
 av1 = sum([i[1] for i in region_1])/len(region_1)
 av2 = sum([i[1] for i in region_2])/len(region_2)
 f = form(region_1, av1, region_2, av2)
 leaf_1 = {'points': region_1, 'avg': av1}
 leaf_2 = {'points': region_2, 'avg': av2}
 forms.append( (leaf_1, leaf_2, ('y', var), f) )
 sorted_f = sorted(forms, key = lambda x: x[3])
 best_split = sorted_f[0]
 leaf['split_point'] = best_split[2]
 leaf['left_node'] = best_split[0]
 leaf['right_node'] = best_split[1]
 print(leaf['split_point'])
 next_leafs.append(leaf['left_node'])
 next_leafs.append(leaf['right_node'])
 print("\n")
 leafs = next_leafs
 all_nodes.extend(leafs)
 if len(leafs) == 0:
 break
asked Jul 2, 2019 at 9:48
\$\endgroup\$

1 Answer 1

1
\$\begingroup\$

Here is my updated version, It looks simpler and better by creating a class, Node.

import random
def formula(region_1, av1, region_2, av2):
 return sum([(i[1]-av1)**2 for i in region_1]) \
 + sum([(i[1]-av2)**2 for i in region_2])
def average(data):
 return sum([d[2] for d in data])/len(data)
Np = 400
x_data = [abs(random.gauss(5, 0.2) + random.gauss(8, 0.5)) for i in range(Np)]
y_data = [abs(random.gauss(10, 0.2) + random.uniform(0, 10)) for i in range(Np)]
z_data = [abs(random.gauss(4, 0.5)) for i in range(Np)]
class Node:
 def __init__(self, x_data, y_data, z_data):
 self.x_data = x_data
 self.y_data = y_data
 self.z_data = z_data
 self.points = [(i, j, k) for i, j, k in zip(x_data, y_data, z_data)]
 self.avg = average(self.points)
 def split(self):
 #Finding the best split:
 candidates = []
 for x in self.x_data:
 split_point = x
 region_1 = [i for i in self.points if i[0] <= split_point]
 region_2 = [i for i in self.points if i not in region_1]
 if (region_1 != []) and (region_2 != []):
 leaf_1 = Node([i[0] for i in region_1], \
 [i[1] for i in region_1], \
 [i[2] for i in region_1])
 leaf_2 = Node([i[0] for i in region_2], \
 [i[1] for i in region_2], \
 [i[2] for i in region_2])
 f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
 candidates.append( (leaf_1, leaf_2, ('x', split_point), f) )
 for y in self.y_data:
 split_point = y
 region_1 = [i for i in self.points if i[1] <= split_point]
 region_2 = [i for i in self.points if i not in region_1]
 if (region_1 != []) and (region_2 != []):
 leaf_1 = Node([i[0] for i in region_1], \
 [i[1] for i in region_1], \
 [i[2] for i in region_1])
 leaf_2 = Node([i[0] for i in region_2], \
 [i[1] for i in region_2], \
 [i[2] for i in region_2])
 f = formula(region_1, leaf_1.avg, region_2, leaf_2.avg)
 candidates.append( (leaf_1, leaf_2, ('y', split_point), f) )
 sorted_f = sorted(candidates, key = lambda x: x[3])
 best_split = sorted_f[0]
 #The result:
 self.split_point = best_split[2]
 self.left_node = best_split[0]
 self.right_node = best_split[1]
#Source node and 1st split
source = Node(x_data, y_data, z_data)
source.split()
#Generate Binary Tree
result_nodes = [source.left_node, source.right_node]
all_nodes = [source.left_node, source.right_node]
min_nodes = 1000
min_points = 5
while len(all_nodes) <= min_nodes:
 next_nodes = []
 for node in result_nodes:
 if (len(node.points) > min_points):
 node.split()
 next_nodes.append(node.left_node)
 next_nodes.append(node.right_node)
 result_nodes = next_nodes
 all_nodes.extend(result_nodes)
 if len(result_nodes) == 0:
 break
answered Jul 3, 2019 at 4:42
\$\endgroup\$

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.