I've been working on a random forest algorithm for classification that uses roulette selection to find the best splits. I came up with this homebrew based on this article https://machinelearningmastery.com/implement-random-forest-scratch-python/.
It works well enough, but I have a major problem with execution time. It's reasonably fast for 1-6 trees, but it completely chokes on 7 or more. I'm looking for ways to make it more efficient so basic tests don't take a month to finish. I'm a beginner and I don't know much about optimizing code, so I'd appreciate any suggestions on how I could improve. Here's the link to the GitHub repo: https://github.com/pdabrow11/Random-Forest-Roulette. It contains all the .py files and the datasets used for classification. Here's the raw code:
- data_load.py
import csv


def read_file(file):
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data


def load_data(data, var):
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if var == 1:
            dataset.append(row.split(","))
        if var == 2:
            dataset.append(row.split(" "))
        if var == 3:
            dataset.append(row.split(";"))
    return dataset


def process_dat_data(data):
    dataset = list()
    for row in data:
        row_temp = list()
        for i in range(1, len(row)):
            res = row[i].find(':')
            row_temp.append(row[i][res+1:])
        row_temp.append(row[0])
        dataset.append(row_temp)
    return dataset


def str2int(dataset):
    for i in range(0, len(dataset[0])):
        class_values = [row[i] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for j, value in enumerate(unique):
            lookup[value] = float(j)
        for row in dataset:
            row[i] = lookup[row[i]]


def str2float(dataset):
    for row in dataset:
        for i in range(0, len(row)):
            row[i] = float(row[i])
    return dataset


def write_data(data):
    with open("tests_res.csv", 'a') as file:
        f = csv.writer(file, delimiter=',', lineterminator='\n')
        f.writerows(data)
- random_forest.py
from math import sqrt
from random import seed
from random import randrange
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
import numpy as np
import numpy.random as npr


def cross_validation_split(dataset, folds_num):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds_num)
    for i in range(folds_num):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split


def calc_accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0


def evaluate_algorithm(dataset, folds_num, trees_num, *args):
    attributes_num = int(sqrt(len(dataset[0])-1))
    folds = cross_validation_split(dataset, folds_num)
    scores = list()
    scores_sklearn = list()
    res = RandomForestClassifier(n_estimators=trees_num)
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        y_test_set = list()
        X, y = list(), list()
        for row in fold:
            #row_copy = list(row)
            #row_copy[-1] = None
            test_set.append(row[:-1])
            y_test_set.append(row[-1])
        predicted = random_forest(train_set, test_set, trees_num, attributes_num, *args)
        actual = [row[-1] for row in fold]
        accuracy = calc_accuracy(actual, predicted)
        scores.append(accuracy)
        # Sklearn
        for row in train_set:
            X.append(row[:-1])
            y.append(row[-1])
        #res = clone(res)
        res.fit(X, y)
        accuracy_sklearn = res.score(test_set, y_test_set)*100
        scores_sklearn.append(accuracy_sklearn)
    return scores, scores_sklearn


def split_attribute(dataset, index, value):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


def subset(dataset, ratio):
    sample = list()
    sample_num = round(len(dataset) * ratio)
    while len(sample) < sample_num:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample


def gini_index(groups, classes):
    examples_num = int(sum([len(group) for group in groups]))
    gini = 0.0
    for group in groups:
        size = int(len(group))
        if size == 0:
            continue
        P, E = 0.0, 1.0
        for single_class in classes:
            P = [row[-1] for row in group].count(single_class) / size
            E -= P ** 2
        gini += (size / examples_num) * E
    return gini


def roulette_split(dataset, attributes_num, threshold):
    classes = list(set(row[-1] for row in dataset))
    index_list, val_list, group_list, fit = [], [], [], []
    attributes = list()
    while len(attributes) < attributes_num:
        index = randrange(len(dataset[0])-1)
        if index not in attributes:
            attributes.append(index)
    counter = 0
    for index in attributes:
        for row in dataset:
            groups = split_attribute(dataset, index, row[index])
            gini = gini_index(groups, classes)
            index_list.append(index)
            val_list.append(row[index])
            group_list.append(groups)
            fit.append(1-gini)
            counter += 1
    wheel_size = 0
    fit_args_sorted = np.argsort(fit)
    for i in range(0, int(threshold*counter)):
        fit[fit_args_sorted[i]] = 0
    for i in range(0, counter):
        wheel_size += fit[i]
    selection_probs = [fit[i]/wheel_size for i in range(0, counter)]
    winner = int(npr.choice(np.arange(counter), 1, p=selection_probs))
    return {'val': val_list[winner], 'groups': group_list[winner], 'index': index_list[winner]}


def terminal(group):
    results = [row[-1] for row in group]
    return max(results, key=results.count)


def one_class(node):
    res = True
    for i in range(0, len(node)):
        if node[0][-1] != node[i][-1]:
            res = False
            return res
    return res


def are_the_same(node):
    res = True
    for i in range(0, len(node[0])-1):
        for j in range(0, len(node)):
            for k in range(0, len(node)):
                if node[j][i] != node[k][i]:
                    res = False
                    return res
    return res


def split(node, attributes_num, depth, threshold):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:
        node['left'] = node['right'] = terminal(left + right)
        return
    if one_class(left) or are_the_same(left):
        node['left'] = terminal(left)
    else:
        node['left'] = roulette_split(left, attributes_num, threshold)
        split(node['left'], attributes_num, depth+1, threshold)
    if one_class(right) or are_the_same(right):
        node['right'] = terminal(right)
    else:
        node['right'] = roulette_split(right, attributes_num, threshold)
        split(node['right'], attributes_num, depth+1, threshold)


def build_tree(train, attributes_num, threshold):
    root = roulette_split(train, attributes_num, threshold)
    split(root, attributes_num, 1, threshold)
    return root


def predict(node, row):
    if row[node['index']] < node['val']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']


def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)


def random_forest(train, test, attributes_num, trees_num, sample_size, threshold):
    trees = list()
    for i in range(trees_num):
        sample = subset(train, sample_size)
        tree = build_tree(sample, attributes_num, threshold)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return(predictions)
- launch.py
from asyncore import write
from random_forest import *
from data_load import *
from random_forest import *
from data_load import *


def main():
    data_absence = read_file("datasets_v2/Absenteeism_at_work.csv")
    dataset_absence = load_data(data_absence, 3)
    dataset_absence = dataset_absence[1:]
    str2float(dataset_absence)
    data_car = read_file("datasets_v2/car.data")
    dataset_car = load_data(data_car, 1)
    str2int(dataset_car)
    data_opt = read_file("datasets_v2/batch_optdigits.tra")
    dataset_opt = load_data(data_opt, 1)
    str2float(dataset_opt)
    data_gas = read_file("datasets_v2/GasSensorDataset/batch1.dat")
    dataset_gas = load_data(data_gas, 2)
    dataset_gas = process_dat_data(dataset_gas)
    str2float(dataset_gas)
    sk_scores = []
    for_scores = []
    col_1 = []
    col_2 = []
    col_3 = []
    col_4 = []
    col_5 = []
    col_6 = []
    col_7 = []
    col_8 = []
    for dataset in [dataset_car, dataset_absence, dataset_gas]:
        sample_size = 0.05
        folds_num = 5
        for threshold in [0.2, 0.5, 0.9]:
            print('For my information - threshold: ', threshold, '\n')
            for trees_num in range(1, 10):
                sk_scores.clear()
                for_scores.clear()
                for i in [1, 2, 3, 5]:
                    seed(i)
                    scores = evaluate_algorithm(dataset, folds_num, trees_num, sample_size, threshold)
                    score = sum(scores[0])/float(len(scores[0]))
                    for_scores.append(score)
                    sk_score = sum(scores[1])/float(len(scores[1]))
                    sk_scores.append(sk_score)
                if dataset == dataset_car:
                    col_1.append('SECOM Dataset')
                    col_1.append('SECOM Dataset')
                elif dataset == dataset_gas:
                    col_1.append('Gas Sensor Dataset')
                    col_1.append('Gas Sensor Dataset')
                else:
                    col_1.append('Absenteeism Dataset')
                    col_1.append('Absenteeism Dataset')
                for_mean = round(np.mean(for_scores), 2)
                for_stdev = round(np.std(for_scores), 2)
                for_best = round(np.amax(for_scores), 2)
                for_worst = round(np.amax(for_scores), 2)
                sk_mean = round(np.mean(sk_scores), 2)
                sk_stdev = round(np.std(sk_scores), 2)
                sk_best = round(np.amax(sk_scores), 2)
                sk_worst = round(np.amax(sk_scores), 2)
                col_2.append('Own implementation')
                col_2.append('Sklearn')
                col_3.append(trees_num)
                col_3.append(trees_num)
                col_4.append(threshold)
                col_4.append(threshold)
                col_5.append(for_mean)
                col_6.append(for_stdev)
                col_7.append(for_best)
                col_8.append(for_worst)
                col_5.append(sk_mean)
                col_6.append(sk_stdev)
                col_7.append(sk_best)
                col_8.append(sk_worst)
                print(trees_num)
    header = ['Dataset', 'Implementation', 'Number of trees', 'Selection threshold', 'Mean accuracy', 'Standard deviation', 'Best accuracy', 'Worst accuracy']
    file_data = np.column_stack((col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8))
    file_data = np.vstack((header, file_data))
    open('tests_res.csv', 'w').close()
    write_data(file_data)


if __name__ == "__main__":
    main()
1 Answer
Documentation
The PEP 8 style guide recommends adding docstrings for files and functions.
For example, for the data_load.py file, since "data" is a generic term, it would be good to describe what kind of data is being loaded.
For functions, it would be useful for the docstring to have details about input types and return types.
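For instance, a module docstring plus a function docstring for read_file in data_load.py could look like this (the wording is only illustrative, not taken from the repo):

"""Helpers for loading the classification datasets used by the random forest tests."""

def read_file(file):
    """Read a text file and return its raw lines.

    file: path to the dataset file.
    Returns a list of strings, one per line, still ending with a newline.
    """
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data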
Naming
There are many variables with the generic word "data". I suggest renaming some of them to be more specific about the type of data they represent.
The function named process_dat_data even has "dat" twice. That seems redundant, and one of them can be replaced with a better word.
In the load_data function, the var input does not convey much meaning. It might be better named separator_type.
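As a sketch of what those renames might look like (the exact names are only suggestions):

# Suggested renames, illustrative only
def process_sensor_data(raw_rows):           # was: process_dat_data(data)
    ...

def load_data(raw_lines, separator_type):    # was: load_data(data, var)
    ...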
Efficiency
In the load_data function's for loop, the 3 separate if statements should be combined into a single if/elif statement:
if var == 1:
    dataset.append(row.split(","))
elif var == 2:
    dataset.append(row.split(" "))
elif var == 3:
    dataset.append(row.split(";"))
The checks are mutually exclusive. This makes the code more efficient since you don't have to perform the 2nd check if the first is true, etc. Also, this more clearly shows the intent of the code.
An alternate approach is to avoid the intermediate var mapping variable and pass the separator in directly:
def load_data(data, separator):
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if separator in [",", " ", ";"]:
            dataset.append(row.split(separator))
    return dataset
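The call sites in launch.py would then pass the delimiter itself rather than a numeric code, for example (a sketch assuming the new signature above):

# Hypothetical call sites once load_data takes the separator directly
data_absence = read_file("datasets_v2/Absenteeism_at_work.csv")
dataset_absence = load_data(data_absence, ";")   # was: load_data(data_absence, 3)

data_car = read_file("datasets_v2/car.data")
dataset_car = load_data(data_car, ",")           # was: load_data(data_car, 1)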
DRY
In the launch.py file, these 2 lines are repeated:
from random_forest import *
from data_load import *
You should delete the copies.
In the main function, there are many repetitive calls to append:
if dataset == dataset_car:
    col_1.append('SECOM Dataset')
    col_1.append('SECOM Dataset')
elif dataset == dataset_gas:
    col_1.append('Gas Sensor Dataset')
    col_1.append('Gas Sensor Dataset')
else:
    col_1.append('Absenteeism Dataset')
    col_1.append('Absenteeism Dataset')
They can be factored out of the if/else:
text = ''
if dataset == dataset_car:
    text = 'SECOM'
elif dataset == dataset_gas:
    text = 'Gas Sensor'
else:
    text = 'Absenteeism'

for _ in range(2):
    col_1.append(f"{text} Dataset")
Simpler
In the random_forest.py file, the one_class function can be simplified by removing the res variable:
def one_class(node):
    for i in range(len(node)):
        if node[0][-1] != node[i][-1]:
            return False
    return True
The range start of 0 can also be removed.
The same can be done for the are_the_same function.
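A sketch of what that might look like (same behavior, just without the res flag):

def are_the_same(node):
    for i in range(len(node[0]) - 1):
        for j in range(len(node)):
            for k in range(len(node)):
                if node[j][i] != node[k][i]:
                    return False
    return True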