- 
  Notifications
 You must be signed in to change notification settings 
- Fork 3.8k
update forks #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
 
  Open
 
 
 
 
  Open
 update forks #51
Changes from all commits
 Commits
 
 
 Show all changes
 
 
 7 commits
 
 
 Select commit
 Hold shift + click to select a range
 
 bf211c0
 
 Merge pull request #1 from ddbourgin/master
 
 
 bruce1408 801dc40
 
 Merge pull request #2 from ddbourgin/master
 
 
 bruce1408 497aad3
 
 Merge pull request #3 from ddbourgin/master
 
 
 bruce1408 a41fc1d
 
 Merge pull request #4 from ddbourgin/master
 
 
 bruce1408 3bd5e00
 
 skl-tree-impl
 
 
 cca1baa
 
 new code info
 
 
 4cd83ce
 
 Merge pull request #5 from ddbourgin/master
 
 
 bruce1408 File filter
Filter by extension
Conversations
 Failed to load comments. 
 
 
 
  Loading
 
 Jump to
 
 Jump to file
 
 
 
 Failed to load files. 
 
 
 
  Loading
 
 Diff view
Diff view
There are no files selected for viewing
 
 
 
 4 changes: 3 additions & 1 deletion
 
 
 
 .gitignore
 
 
 
 
  
 
 
 
 
 
 
 This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
 Learn more about bidirectional Unicode characters
 
 
 
 
 
 
 
 
 96 changes: 96 additions & 0 deletions
 
 
 
 numpy_ml/trees/cart_tree_imp.py
 
 
 
 
  
 
 
 
 
 
 
 This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
 Learn more about bidirectional Unicode characters
 
 
 
 
 | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| from numpy import * | ||
|  | ||
|  | ||
# Load data
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Parameters
    ----------
    fileName : str or path-like
        Path of a text file holding one sample per line, with fields
        separated by tab characters.

    Returns
    -------
    list of list of float
        One inner list per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the handle even when parsing raises.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            # map() is lazy on Python 3, so materialise each row as a list.
            dataMat.append(list(map(float, stripped.split('\t'))))
    return dataMat
|  | ||
|  | ||
# Split the data set into two subsets
def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of ``dataSet`` around a threshold on one column.

    Parameters
    ----------
    dataSet : 2-D array or matrix
        Samples stored as rows.
    feature : int
        Index of the column to split on.
    value : scalar
        Threshold compared against the chosen column.

    Returns
    -------
    tuple
        ``(mat0, mat1)``: the rows whose ``feature`` entry is strictly
        greater than ``value``, then the rows whose entry is less than or
        equal to it.
    """
    column = dataSet[:, feature]
    # The first element of nonzero()'s result holds the matching row indices.
    rows_above = nonzero(column > value)[0]
    rows_not_above = nonzero(column <= value)[0]
    # NOTE: the original book listing appended a trailing ``[0]`` to each
    # selection, which raised "index 0 is out of bounds"; keep whole rows.
    return dataSet[rows_above, :], dataSet[rows_not_above, :]
|  | ||
|  | ||
# Tree node type: regression tree
def regLeaf(dataSet):
    """Build a regression-tree leaf: the mean of the last (target) column.

    Parameters
    ----------
    dataSet : 2-D array or matrix
        Samples stored as rows; the last column is used as the target.

    Returns
    -------
    float
        Mean of the last column.
    """
    targets = dataSet[:, -1]
    return mean(targets)
|  | ||
|  | ||
# Error function: regression error
def regErr(dataSet):
    """Return the total squared error of the last (target) column.

    Computed as the variance of the last column multiplied by the number
    of rows.

    Parameters
    ----------
    dataSet : 2-D array or matrix
        Samples stored as rows; the last column is used as the target.

    Returns
    -------
    float
        Variance of the last column times the row count.
    """
    n_samples = shape(dataSet)[0]
    target_variance = var(dataSet[:, -1])
    return target_variance * n_samples
|  | ||
|  | ||
# Binary split selection
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(0, 1)):
    """Pick the feature/value pair whose binary split minimises the error.

    Parameters
    ----------
    dataSet : numpy matrix or 2-D array
        Samples stored as rows; the last column is the target and every
        other column is a candidate split feature.
    leafType : callable
        Called as ``leafType(dataSet)`` to produce the leaf value when no
        split is made.
    errType : callable
        Called as ``errType(subset)`` to score a subset; lower is better.
    ops : sequence
        ``ops[0]`` (tolS) is the minimum error reduction a split must
        achieve; ``ops[1]`` (tolN) is the minimum number of rows allowed on
        each side of a split.

    Returns
    -------
    tuple
        ``(featureIndex, splitValue)`` when a worthwhile split exists,
        otherwise ``(None, leafValue)``.
    """
    tolS = ops[0]  # minimum required error reduction
    tolN = ops[1]  # minimum number of samples on each side of a split

    # A plain ndarray view gives one uniform way to read column values,
    # whether the caller passed an np.matrix or a regular 2-D array.
    values = asarray(dataSet)

    # Stop condition 1: every target value is identical, nothing to split.
    if len(set(values[:, -1].ravel().tolist())) == 1:
        return None, leafType(dataSet)

    n = shape(dataSet)[1]
    S = errType(dataSet)  # error of the unsplit data set
    bestS = inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):  # every column except the target
        # Try each distinct value of the feature as a threshold.
        for splitVal in set(values[:, featIndex].ravel().tolist()):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:  # keep the split with the smallest error
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS

    # Stop condition 2: the best split does not reduce the error enough.
    # This also covers "no admissible split found", where bestS is still inf.
    if (S - bestS) < tolS:
        return None, leafType(dataSet)

    # Stop condition 3: the chosen split leaves a side with too few samples.
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN):
        return None, leafType(dataSet)

    return bestIndex, bestValue  # feature index and threshold to split on
|  | ||
|  | ||
# Build the tree
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(0, 1)):
    """Recursively build a binary tree by repeatedly splitting ``dataSet``.

    Parameters
    ----------
    dataSet : numpy matrix
        Samples stored as rows, passed through to ``chooseBestSplit`` and
        ``binSplitDataSet``.
    leafType : callable
        Leaf-value builder forwarded to ``chooseBestSplit``.
    errType : callable
        Error function forwarded to ``chooseBestSplit``.
    ops : sequence
        Stopping parameters forwarded to ``chooseBestSplit``.

    Returns
    -------
    dict or leaf value
        A leaf value when ``chooseBestSplit`` reports no split; otherwise a
        dict with keys ``'spInd'`` (split feature index), ``'spVal'``
        (split threshold), ``'left'`` and ``'right'`` (sub-trees built from
        the first and second subsets returned by ``binSplitDataSet``).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # A None feature signals a stop condition; val is then the leaf value.
    if feat is None:
        return val

    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    # Recurse into the two subsets produced by the chosen split.
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
|  | ||
|  | ||
if __name__ == "__main__":
    # asmatrix replaces the `mat` alias, which NumPy 2.0 removed from the
    # main namespace (so `from numpy import *` no longer provides it).
    myDat = asmatrix(loadDataSet('train_data'))
    print(createTree(myDat))

    # Plot the data points: column 0 on the x-axis, column 1 on the y-axis.
    import matplotlib.pyplot as plt

    plt.plot(myDat[:, 0], myDat[:, 1], 'ro')
    plt.show()
|  | 
 
 
 
 40 changes: 40 additions & 0 deletions
 
 
 
 numpy_ml/trees/scikit_learn_tree.py
 
 
 
 
  
 
 
 
 
 
 
 This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
 Learn more about bidirectional Unicode characters
 
 
 
 
 | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| import numpy as np | ||
| import matplotlib.pyplot as plt | ||
| from sklearn.tree import DecisionTreeRegressor | ||
| from sklearn import linear_model | ||
|  | ||
# Data set: the sample taken from the official scikit-learn example.
x = np.array(list(range(1, 11))).reshape(-1, 1)
y = np.array([5.56, 5.70, 5.91, 6.40, 6.80, 7.05, 8.90, 8.70, 9.00, 9.05]).ravel()

# Fit regression models. Author's note: the result matches the hand
# calculation, a three-piece step function:
#   x <= 3.5        -> 5.72
#   3.5 < x <= 6.5  -> 6.75
#   6.5 < x         -> 8.91

model1 = DecisionTreeRegressor(max_depth=1)
model2 = DecisionTreeRegressor(max_depth=3, max_leaf_nodes=4, min_samples_leaf=3)
model3 = linear_model.LinearRegression()
model1.fit(x, y)
model2.fit(x, y)
model3.fit(x, y)

# Predict on a dense grid so the piecewise-constant tree output is visible.
X_test = np.arange(0.0, 10.0, 0.01)[:, np.newaxis]
y_1 = model1.predict(X_test)
y_2 = model2.predict(X_test)
y_3 = model3.predict(X_test)

# Plot the training samples together with the three fitted models.
plt.figure()
plt.scatter(x, y, s=20, edgecolor="black",
            c="darkorange", label="data")
plt.plot(X_test, y_1, color="cornflowerblue",
         label="max_depth=1", linewidth=2)
plt.plot(X_test, y_2, color="yellowgreen", label="max_depth=3", linewidth=2)
# Legend label corrected from the misspelled 'liner regression'.
plt.plot(X_test, y_3, color='red', label='linear regression', linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()
 
 
 
 10 changes: 10 additions & 0 deletions
 
 
 
 numpy_ml/trees/train_data
 
 
 
 
  
 
 
 
 
 
 
 This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
 Learn more about bidirectional Unicode characters
 
 
 
 
 | Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,10 @@ | ||
| 1 4.50 | ||
| 2 4.75 | ||
| 3 4.91 | ||
| 4 5.34 | ||
| 5 5.80 | ||
| 6 7.05 | ||
| 7 7.90 | ||
| 8 8.23 | ||
| 9 8.70 | ||
| 10 9.00 | 
 Add this suggestion to a batch that can be applied as a single commit.
 This suggestion is invalid because no changes were made to the code.
 Suggestions cannot be applied while the pull request is closed.
 Suggestions cannot be applied while viewing a subset of changes.
 Only one suggestion per line can be applied in a batch.
 Add this suggestion to a batch that can be applied as a single commit.
 Applying suggestions on deleted lines is not supported.
 You must change the existing code in this line in order to create a valid suggestion.
 Outdated suggestions cannot be applied.
 This suggestion has been applied or marked resolved.
 Suggestions cannot be applied from pending reviews.
 Suggestions cannot be applied on multi-line comments.
 Suggestions cannot be applied while the pull request is queued to merge.
 Suggestion cannot be applied right now. Please check back later.