machine_learning/decision_tree.py · codenotsleep/Python

代码拉取完成,页面将自动刷新

扫描微信二维码支付

取消

支付完成

richgiteeai

Watch

不关注关注所有动态仅关注版本发行动态关注但不提醒动态

1 Star 0 Fork 324

codenotsleep/Python

forked from 编程语言算法集/Python

代码 Issues 0 Pull Requests 0 Wiki 统计流水线

服务

加入 Gitee

与超过 1400万开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)

免费加入

已有帐号? 立即登录

文件

master

分支 (76)

管理

master

fix-ruff

pipx-install-pre-commit-ruff

ruff-rule-ISC001

Re-enable-tests

Rename-is_palindrome.py-to-is_int_palindrome.py

Add-more-ruff-rules

pre-commit-ci-update-config

ruff

atbash.py-Tighten-up-the-benchmarks

prime_numbers.py-Tighten-up-the-benchmarks

dynamic_programming

maths/sum_of_digits.py-Streamline-benchmarks

maths/number_of_digits.py-Streamline-benchmarks

Python-3.11

7804-improve-the-maths-add-functionality-and-tests

7782-create-subtraction-method-in-maths-folder

quantum_random.py.DISABLED.txt

quantum_random.py.disabled

revert-7349-patch-4

克隆/下载

HTTPS SSH SVN SVN+SSH 下载ZIP

提示

下载代码请复制以下命令到终端执行

为确保你提交的代码身份被 Gitee 正确识别,请执行以下命令完成配置

git config --global user.name userName 
git config --global user.email userEmail

初次使用 SSH 协议进行代码克隆、推送等操作时,需按下述提示完成 SSH 配置

1 生成 RSA 密钥

2 获取 RSA 公钥内容,并配置到 SSH公钥中

在 Gitee 上使用 SVN,请访问使用指南

使用 HTTPS 协议时,命令行会出现如下账号密码验证步骤。基于安全考虑,Gitee 建议配置并使用私人令牌替代登录密码进行克隆、推送等操作

Username for 'https://gitee.com': userName

Password for 'https://userName@gitee.com': # 私人令牌

分支 76

标签 0

Python

machine_learning

decision_tree.py

decision_tree.py 6.00 KB

"""
Implementation of a basic regression decision tree.
Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""
import numpy as np

class DecisionTree:
    """
    Basic regression decision tree over a one dimensional input.

    A node is either a leaf, holding a constant ``prediction``, or an inner
    node holding a ``decision_boundary`` plus ``left`` and ``right`` subtrees.
    """

    def __init__(self, depth=5, min_leaf_size=5):
        """
        @param depth: remaining depth budget; a node whose depth is 1 never splits
        @param min_leaf_size: minimum number of samples on each side of a split
        """
        self.depth = depth
        self.decision_boundary = 0
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size
        self.prediction = None

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is used to
            estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        """
        # A wrong dimensionality is only reported; the value is still computed.
        if labels.ndim != 1:
            print("Error: Input labels must be one dimensional")

        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one dimensional numpy array
        @param y: a one dimensional numpy array.
        The contents of y are the labels for the corresponding X values

        Splits are taken by position (x[:i] / x[i:]) and x[i] becomes the
        decision boundary, so x is presumably expected to be sorted in
        ascending order.

        train does not have a return value
        """
        # Check that the inputs conform to our dimensionality constraints.
        # Invalid input is reported and the tree is left untouched.
        if x.ndim != 1:
            print("Error: Input data set must be one dimensional")
            return
        if len(x) != len(y):
            print("Error: X and y have different lengths")
            return
        if y.ndim != 1:
            print("Error: Data set labels must be one dimensional")
            return

        # Forget any earlier fit so a stale leaf value cannot shadow new subtrees.
        self.prediction = None
        self.left = None
        self.right = None

        # Leaf: too few samples for two valid children, or depth budget used up.
        if len(x) < 2 * self.min_leaf_size or self.depth == 1:
            self.prediction = np.mean(y)
            return

        # Loop over all possible splits and keep the best one. A split is only
        # accepted if its summed (unweighted) child error is below twice the
        # error of the unsplit node; otherwise the node becomes a leaf that
        # predicts the mean of all labels. Errors are measured on the labels y.
        best_split = 0
        min_error = self.mean_squared_error(y, np.mean(y)) * 2

        # Only indices leaving at least min_leaf_size (and at least one)
        # sample on each side are candidates.
        smallest_side = max(self.min_leaf_size, 1)
        for i in range(smallest_side, len(x) - smallest_side + 1):
            error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
            error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
            error = error_left + error_right
            if error < min_error:
                best_split = i
                min_error = error

        if best_split == 0:
            self.prediction = np.mean(y)
            return

        self.decision_boundary = x[best_split]
        self.left = DecisionTree(
            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
        )
        self.right = DecisionTree(
            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
        )
        self.left.train(x[:best_split], y[:best_split])
        self.right.train(x[best_split:], y[best_split:])

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        the prediction function works by recursively calling the predict function
        of the appropriate subtrees based on the tree's decision boundary
        return value: the predicted label, or None (after printing an error)
            when the tree has not been trained
        """
        if self.prediction is not None:
            return self.prediction
        if self.left is not None and self.right is not None:
            # Values equal to the boundary belong to the right subtree.
            if x >= self.decision_boundary:
                return self.right.predict(x)
            return self.left.predict(x)
        print("Error: Decision tree not yet trained")
        return None

class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: the mean of the squared differences between every label
            and the prediction, accumulated one element at a time
        """
        squared_errors = [(value - prediction) ** 2 for value in labels]

        running_total = 0.0
        for squared_error in squared_errors:
            running_total = running_total + squared_error

        mean_error = running_total / labels.size
        return float(mean_error)

def main():
    """
    In this demonstration we're generating a sample data set from the sin function in
    numpy. We then train a decision tree on the data set and use the decision tree to
    predict the label of 10 different test values. Then the mean squared error over
    this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)

    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    # Ten random inputs drawn from [-1, 1).
    test_cases = (np.random.rand(10) * 2) - 1
    predictions = np.array([tree.predict(case) for case in test_cases])
    # The tree was fitted to sin(x), so the error must be measured against the
    # true labels sin(test_cases), not against the inputs themselves.
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))

if __name__ == "__main__":
    import doctest

    # Run the demonstration first, then the doctests embedded in this module.
    main()
    # The name is only the label used in the doctest report.
    doctest.testmod(name="mean_squared_error", verbose=True)

一键复制编辑原始数据按行查看历史

Tianyi Zheng 提交于 2022年10月18日 17:57 +08:00 . Remove depreciated np.float (#7394)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181

"""
Implementation of a basic regression decision tree.
Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""
import numpy as np


class DecisionTree:
    """
    Basic regression decision tree over a one dimensional input.

    A node is either a leaf, holding a constant ``prediction``, or an inner
    node holding a ``decision_boundary`` plus ``left`` and ``right`` subtrees.
    """

    def __init__(self, depth=5, min_leaf_size=5):
        """
        @param depth: remaining depth budget; a node whose depth is 1 never splits
        @param min_leaf_size: minimum number of samples on each side of a split
        """
        self.depth = depth
        self.decision_boundary = 0
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size
        self.prediction = None

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is used to
            estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        """
        # A wrong dimensionality is only reported; the value is still computed.
        if labels.ndim != 1:
            print("Error: Input labels must be one dimensional")

        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one dimensional numpy array
        @param y: a one dimensional numpy array.
        The contents of y are the labels for the corresponding X values

        Splits are taken by position (x[:i] / x[i:]) and x[i] becomes the
        decision boundary, so x is presumably expected to be sorted in
        ascending order.

        train does not have a return value
        """
        # Check that the inputs conform to our dimensionality constraints.
        # Invalid input is reported and the tree is left untouched.
        if x.ndim != 1:
            print("Error: Input data set must be one dimensional")
            return
        if len(x) != len(y):
            print("Error: X and y have different lengths")
            return
        if y.ndim != 1:
            print("Error: Data set labels must be one dimensional")
            return

        # Forget any earlier fit so a stale leaf value cannot shadow new subtrees.
        self.prediction = None
        self.left = None
        self.right = None

        # Leaf: too few samples for two valid children, or depth budget used up.
        if len(x) < 2 * self.min_leaf_size or self.depth == 1:
            self.prediction = np.mean(y)
            return

        # Loop over all possible splits and keep the best one. A split is only
        # accepted if its summed (unweighted) child error is below twice the
        # error of the unsplit node; otherwise the node becomes a leaf that
        # predicts the mean of all labels. Errors are measured on the labels y.
        best_split = 0
        min_error = self.mean_squared_error(y, np.mean(y)) * 2

        # Only indices leaving at least min_leaf_size (and at least one)
        # sample on each side are candidates.
        smallest_side = max(self.min_leaf_size, 1)
        for i in range(smallest_side, len(x) - smallest_side + 1):
            error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
            error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
            error = error_left + error_right
            if error < min_error:
                best_split = i
                min_error = error

        if best_split == 0:
            self.prediction = np.mean(y)
            return

        self.decision_boundary = x[best_split]
        self.left = DecisionTree(
            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
        )
        self.right = DecisionTree(
            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
        )
        self.left.train(x[:best_split], y[:best_split])
        self.right.train(x[best_split:], y[best_split:])

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        the prediction function works by recursively calling the predict function
        of the appropriate subtrees based on the tree's decision boundary
        return value: the predicted label, or None (after printing an error)
            when the tree has not been trained
        """
        if self.prediction is not None:
            return self.prediction
        if self.left is not None and self.right is not None:
            # Values equal to the boundary belong to the right subtree.
            if x >= self.decision_boundary:
                return self.right.predict(x)
            return self.left.predict(x)
        print("Error: Decision tree not yet trained")
        return None


class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: the mean of the squared differences between every label
            and the prediction, accumulated one element at a time
        """
        squared_errors = [(value - prediction) ** 2 for value in labels]

        running_total = 0.0
        for squared_error in squared_errors:
            running_total = running_total + squared_error

        mean_error = running_total / labels.size
        return float(mean_error)


def main():
    """
    In this demonstration we're generating a sample data set from the sin function in
    numpy. We then train a decision tree on the data set and use the decision tree to
    predict the label of 10 different test values. Then the mean squared error over
    this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)

    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    # Ten random inputs drawn from [-1, 1).
    test_cases = (np.random.rand(10) * 2) - 1
    predictions = np.array([tree.predict(case) for case in test_cases])
    # The tree was fitted to sin(x), so the error must be measured against the
    # true labels sin(test_cases), not against the inputs themselves.
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))


if __name__ == "__main__":
    import doctest

    # Run the demonstration first, then the doctests embedded in this module.
    main()
    # The name is only the label used in the doctest report.
    doctest.testmod(name="mean_squared_error", verbose=True)