
I've been working on a random forest algorithm for classification that uses roulette selection to find the best splits. I came up with this homebrew implementation based on this article: https://machinelearningmastery.com/implement-random-forest-scratch-python/.

It works well enough, but I have a major problem with execution time. It's reasonably fast for 1-6 trees, but it completely chokes on 7 or more. I'm looking for ways to make it more efficient so that basic tests don't take a month to finish. I'm a beginner and don't know much about optimizing code, so I'd appreciate any suggestions on how I could improve. Here's the link to the GitHub repo: https://github.com/pdabrow11/Random-Forest-Roulette. It contains all the .py files and the datasets used for classification. Here's the raw code:

  • data_load.py

import csv

def read_file(file):
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data

def load_data(data, var):
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if var == 1:
            dataset.append(row.split(","))
        if var == 2:
            dataset.append(row.split(" "))
        if var == 3:
            dataset.append(row.split(";"))
    return dataset

def process_dat_data(data):
    dataset = list()
    for row in data:
        row_temp = list()
        for i in range(1, len(row)):
            res = row[i].find(':')
            row_temp.append(row[i][res+1:])
        row_temp.append(row[0])
        dataset.append(row_temp)
    return dataset

def str2int(dataset):
    for i in range(0, len(dataset[0])):
        class_values = [row[i] for row in dataset]
        unique = set(class_values)
        lookup = dict()
        for j, value in enumerate(unique):
            lookup[value] = float(j)
        for row in dataset:
            row[i] = lookup[row[i]]

def str2float(dataset):
    for row in dataset:
        for i in range(0, len(row)):
            row[i] = float(row[i])
    return dataset

def write_data(data):
    with open("tests_res.csv", 'a') as file:
        f = csv.writer(file, delimiter=',', lineterminator='\n')
        f.writerows(data)
  • random_forest.py

from math import sqrt
from random import seed
from random import randrange
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
import numpy as np
import numpy.random as npr

def cross_validation_split(dataset, folds_num):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds_num)
    for i in range(folds_num):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def calc_accuracy(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, folds_num, trees_num, *args):
    attributes_num = int(sqrt(len(dataset[0])-1))
    folds = cross_validation_split(dataset, folds_num)
    scores = list()
    scores_sklearn = list()
    res = RandomForestClassifier(n_estimators=trees_num)
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        y_test_set = list()
        X, y = list(), list()
        for row in fold:
            #row_copy = list(row)
            #row_copy[-1] = None
            test_set.append(row[:-1])
            y_test_set.append(row[-1])
        predicted = random_forest(train_set, test_set, trees_num, attributes_num, *args)
        actual = [row[-1] for row in fold]
        accuracy = calc_accuracy(actual, predicted)
        scores.append(accuracy)
        # Sklearn
        for row in train_set:
            X.append(row[:-1])
            y.append(row[-1])
        #res = clone(res)
        res.fit(X, y)
        accuracy_sklearn = res.score(test_set, y_test_set)*100
        scores_sklearn.append(accuracy_sklearn)
    return scores, scores_sklearn

def split_attribute(dataset, index, value):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

def subset(dataset, ratio):
    sample = list()
    sample_num = round(len(dataset) * ratio)
    while len(sample) < sample_num:
        index = randrange(len(dataset))
        sample.append(dataset[index])
    return sample

def gini_index(groups, classes):
    examples_num = int(sum([len(group) for group in groups]))
    gini = 0.0
    for group in groups:
        size = int(len(group))
        if size == 0:
            continue
        P, E = 0.0, 1.0
        for single_class in classes:
            P = [row[-1] for row in group].count(single_class) / size
            E -= P ** 2
        gini += (size / examples_num) * E
    return gini

def roulette_split(dataset, attributes_num, threshold):
    classes = list(set(row[-1] for row in dataset))
    index_list, val_list, group_list, fit = [], [], [], []
    attributes = list()
    while len(attributes) < attributes_num:
        index = randrange(len(dataset[0])-1)
        if index not in attributes:
            attributes.append(index)
    counter = 0
    for index in attributes:
        for row in dataset:
            groups = split_attribute(dataset, index, row[index])
            gini = gini_index(groups, classes)
            index_list.append(index)
            val_list.append(row[index])
            group_list.append(groups)
            fit.append(1-gini)
            counter += 1
    wheel_size = 0
    fit_args_sorted = np.argsort(fit)
    for i in range(0, int(threshold*counter)):
        fit[fit_args_sorted[i]] = 0
    for i in range(0, counter):
        wheel_size += fit[i]

    selection_probs = [fit[i]/wheel_size for i in range(0, counter)]
    winner = int(npr.choice(np.arange(counter), 1, p=selection_probs))
    return {'val': val_list[winner], 'groups': group_list[winner], 'index': index_list[winner]}

def terminal(group):
    results = [row[-1] for row in group]
    return max(results, key=results.count)

def one_class(node):
    res = True
    for i in range(0, len(node)):
        if node[0][-1] != node[i][-1]:
            res = False
            return res
    return res

def are_the_same(node):
    res = True
    for i in range(0, len(node[0])-1):
        for j in range(0, len(node)):
            for k in range(0, len(node)):
                if node[j][i] != node[k][i]:
                    res = False
                    return res
    return res

def split(node, attributes_num, depth, threshold):
    left, right = node['groups']
    del(node['groups'])
    if not left or not right:
        node['left'] = node['right'] = terminal(left + right)
        return
    if one_class(left) or are_the_same(left):
        node['left'] = terminal(left)
    else:
        node['left'] = roulette_split(left, attributes_num, threshold)
        split(node['left'], attributes_num, depth+1, threshold)
    if one_class(right) or are_the_same(right):
        node['right'] = terminal(right)
    else:
        node['right'] = roulette_split(right, attributes_num, threshold)
        split(node['right'], attributes_num, depth+1, threshold)

def build_tree(train, attributes_num, threshold):
    root = roulette_split(train, attributes_num, threshold)
    split(root, attributes_num, 1, threshold)
    return root

def predict(node, row):
    if row[node['index']] < node['val']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']

def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

def random_forest(train, test, attributes_num, trees_num, sample_size, threshold):
    trees = list()
    for i in range(trees_num):
        sample = subset(train, sample_size)
        tree = build_tree(sample, attributes_num, threshold)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return(predictions)
  • launch.py

from asyncore import write
from random_forest import *
from data_load import *
from random_forest import *
from data_load import *

def main():
    data_absence = read_file("datasets_v2/Absenteeism_at_work.csv")
    dataset_absence = load_data(data_absence, 3)
    dataset_absence = dataset_absence[1:]
    str2float(dataset_absence)
    data_car = read_file("datasets_v2/car.data")
    dataset_car = load_data(data_car, 1)
    str2int(dataset_car)

    data_opt = read_file("datasets_v2/batch_optdigits.tra")
    dataset_opt = load_data(data_opt, 1)
    str2float(dataset_opt)
    data_gas = read_file("datasets_v2/GasSensorDataset/batch1.dat")
    dataset_gas = load_data(data_gas, 2)
    dataset_gas = process_dat_data(dataset_gas)
    str2float(dataset_gas)

    sk_scores = []
    for_scores = []
    col_1 = []
    col_2 = []
    col_3 = []
    col_4 = []
    col_5 = []
    col_6 = []
    col_7 = []
    col_8 = []
    for dataset in [dataset_car, dataset_absence, dataset_gas]:
        sample_size = 0.05
        folds_num = 5
        for threshold in [0.2, 0.5, 0.9]:
            print('For my own information - threshold: ', threshold, '\n')
            for trees_num in range(1, 10):
                sk_scores.clear()
                for_scores.clear()
                for i in [1, 2, 3, 5]:
                    seed(i)
                    scores = evaluate_algorithm(dataset, folds_num, trees_num, sample_size, threshold)
                    score = sum(scores[0])/float(len(scores[0]))
                    for_scores.append(score)
                    sk_score = sum(scores[1])/float(len(scores[1]))
                    sk_scores.append(sk_score)
                if dataset == dataset_car:
                    col_1.append('SECOM Dataset')
                    col_1.append('SECOM Dataset')
                elif dataset == dataset_gas:
                    col_1.append('Gas Sensor Dataset')
                    col_1.append('Gas Sensor Dataset')
                else:
                    col_1.append('Absenteeism Dataset')
                    col_1.append('Absenteeism Dataset')
                for_mean = round(np.mean(for_scores), 2)
                for_stdev = round(np.std(for_scores), 2)
                for_best = round(np.amax(for_scores), 2)
                for_worst = round(np.amax(for_scores), 2)
                sk_mean = round(np.mean(sk_scores), 2)
                sk_stdev = round(np.std(sk_scores), 2)
                sk_best = round(np.amax(sk_scores), 2)
                sk_worst = round(np.amax(sk_scores), 2)
                col_2.append('Own implementation')
                col_2.append('Sklearn')
                col_3.append(trees_num)
                col_3.append(trees_num)
                col_4.append(threshold)
                col_4.append(threshold)
                col_5.append(for_mean)
                col_6.append(for_stdev)
                col_7.append(for_best)
                col_8.append(for_worst)
                col_5.append(sk_mean)
                col_6.append(sk_stdev)
                col_7.append(sk_best)
                col_8.append(sk_worst)
                print(trees_num)
    header = ['Dataset', 'Implementation', 'Number of trees', 'Selection threshold', 'Mean accuracy', 'Standard deviation', 'Best accuracy', 'Worst accuracy']
    file_data = np.column_stack((col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8))
    file_data = np.vstack((header, file_data))
    open('tests_res.csv', 'w').close()
    write_data(file_data)

if __name__ == "__main__":
    main()

1 Answer

Documentation

The PEP 8 style guide recommends adding docstrings for modules and functions.

For example, for the data_load.py file, since "data" is a generic term, it would be good to describe what kind of data is being loaded.

For functions, it would be useful for the docstring to have details about input types and return types.
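
For example, a minimal sketch for read_file (the exact wording is yours to choose):

def read_file(file):
    """Read a dataset file and return its lines as a list of strings.

    file: str path to a delimited text file.
    """
    with open(file, 'r') as myfile:
        data = myfile.readlines()
    return data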

Naming

There are many variables with the generic word "data". I suggest renaming some of them to be more specific about the type of data they represent.

The function named process_dat_data even has "dat" twice. That seems redundant, and one of them can be replaced with a better word.

In the load_data function, the var input does not convey much meaning. It might be better as separator_type.

Efficiency

In the load_data function's for loop, the 3 separate if statements should be combined into a single if/elif chain:

if var == 1:
    dataset.append(row.split(","))
elif var == 2:
    dataset.append(row.split(" "))
elif var == 3:
    dataset.append(row.split(";"))
The checks are mutually exclusive, so this makes the code more efficient: once the first check matches, the remaining checks are skipped. It also shows the intent of the code more clearly.

An alternate approach is to avoid the intermediate var mapping variable, and directly pass in the separator:

def load_data(data, separator):
    dataset = list()
    for row in data:
        row = row.rstrip("\n")
        if separator in [",", " ", ";"]:
            dataset.append(row.split(separator))
    return dataset
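
With that signature, the call sites in launch.py would pass the delimiter directly instead of a numeric code, for example:

dataset_absence = load_data(data_absence, ";")
dataset_car = load_data(data_car, ",")
dataset_gas = load_data(data_gas, " ")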

DRY

In the launch.py file, these 2 lines are repeated:

from random_forest import *
from data_load import *

You should delete the copies.

In the main function, there are many repetitive calls to append:

if dataset == dataset_car:
    col_1.append('SECOM Dataset')
    col_1.append('SECOM Dataset')
elif dataset == dataset_gas:
    col_1.append('Gas Sensor Dataset')
    col_1.append('Gas Sensor Dataset')
else:
    col_1.append('Absenteeism Dataset')
    col_1.append('Absenteeism Dataset')

They can be factored out of the if/else:

text = ''
if dataset == dataset_car:
    text = 'SECOM'
elif dataset == dataset_gas:
    text = 'Gas Sensor'
else:
    text = 'Absenteeism'
for _ in range(2):
    col_1.append(f"{text} Dataset")

Simpler

In the random_forest.py file, the one_class function can be simplified by removing the res variable:

def one_class(node):
    for i in range(len(node)):
        if node[0][-1] != node[i][-1]:
            return False
    return True

The start argument of 0 can also be removed from range, since it is the default.

The same can be done for the are_the_same function.
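
For example, a sketch that keeps the same behavior but returns early, and compares every row against the first one instead of checking all pairs:

def are_the_same(node):
    # All rows are "the same" if every attribute (excluding the class
    # label in the last column) matches the corresponding attribute of
    # the first row.
    for i in range(len(node[0]) - 1):
        for row in node:
            if row[i] != node[0][i]:
                return False
    return True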

