diff --git a/Alibaba-PuHuiTi-Medium.ttf b/Alibaba-PuHuiTi-Medium.ttf
new file mode 100644
index 0000000..7dc8134
Binary files /dev/null and b/Alibaba-PuHuiTi-Medium.ttf differ
diff --git a/README.md b/README.md
index c0e828a..8526d25 100755
--- a/README.md
+++ b/README.md
@@ -12,4 +12,12 @@ Python当中的使用的一些模块、类和方法
 4. TK-Plots 一个使用 TK 做的公式图打印的程序
 5. notebook 在 JupyterNotebook 中可以直接使用的一些操作
 6. plt 对于 Matplotlib 的 plt 的一些常用封装
-7. img 关于 Image 的一些操作,主要是 PIL 中的
\ No newline at end of file
+7. img 关于 Image 的一些操作,主要是 PIL 中的
+
+## JupyterNotebook 中使用主题
+
+```shell
+pip install jupyterthemes --user
+jt -l
+jt -t grade3 -fs 95 -altp -tfs 11 -nfs 115 -cellw 95% -T
+```
\ No newline at end of file
diff --git a/datetime/__init__.py b/datetime/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/datetime/dt.py b/datetime/dt.py
new file mode 100644
index 0000000..f11f144
--- /dev/null
+++ b/datetime/dt.py
@@ -0,0 +1,31 @@
+import time,datetime
+def getTime():
+    """Return calendar dates and local-time epoch bounds around the current day.
+
+    The result is a 10-tuple ``(today, yesterday, tomorrow, acquire,
+    today_start_time, today_end_time, yesterday_start_time, yesterday_end_time,
+    tomorrow_start_time, tomorrow_end_time)``.  The first four items are
+    ``datetime.date`` objects (``acquire`` is the day after tomorrow); the rest
+    are integer Unix timestamps where a day runs from local midnight up to one
+    second before the next local midnight.
+    """
+    one_day = datetime.timedelta(days=1)
+    today = datetime.date.today()
+    yesterday = today - one_day
+    tomorrow = today + one_day
+    acquire = today + datetime.timedelta(days=2)
+
+    def midnight(day):
+        # str(date) is ISO 'YYYY-MM-DD'; mktime interprets it in the local zone.
+        return int(time.mktime(time.strptime(str(day), '%Y-%m-%d')))
+
+    yesterday_start_time = midnight(yesterday)
+    today_start_time = midnight(today)
+    tomorrow_start_time = midnight(tomorrow)
+    # Each day ends one second before the following day starts.
+    yesterday_end_time = today_start_time - 1
+    today_end_time = tomorrow_start_time - 1
+    tomorrow_end_time = midnight(acquire) - 1
+    return (today, yesterday, tomorrow, acquire,
+            today_start_time, today_end_time, yesterday_start_time, yesterday_end_time,
+            tomorrow_start_time, tomorrow_end_time)
\ No newline at end of file
diff --git a/demos/Hello.py b/demos/Hello.py
new file mode 100644
index 0000000..4287ca8
--- /dev/null
+++ b/demos/Hello.py
@@ -0,0 +1 @@
+#
\ No newline at end of file
diff --git a/demos/__init__.py b/demos/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/log/__init__.py b/log/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/log/graylog.py b/log/graylog.py
new file mode 100644
index 0000000..bbdb72b
--- /dev/null
+++ b/log/graylog.py
@@ -0,0 +1,10 @@
+import logging
+import graypy
+def log2gray(content,serverIP,ports =12201,inputName='GrayLogMadeByPython'):
+    """Send ``content`` at DEBUG level to the GELF UDP input at ``serverIP:ports``."""
+    my_logger = logging.getLogger('test_logger')
+    my_logger.setLevel(logging.DEBUG)
+    # The named logger is process-wide: register each target once, or every call stacks a handler.
+    if not any((getattr(h, 'host', None), getattr(h, 'port', None)) == (serverIP, ports) for h in my_logger.handlers):
+        my_logger.addHandler(graypy.GELFUDPHandler(serverIP, ports, localname=inputName))
+    my_logger.debug(content)
\ No newline at end of file
diff --git a/machine_learning/__init__.py b/machine_learning/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/machine_learning/demo.py b/machine_learning/demo.py
new file mode 100644
index 0000000..65fca9f
--- /dev/null
+++ b/machine_learning/demo.py
@@ -0,0 +1,6 @@
+import pandas as pd
+import numpy as np
+
+datas = pd.read_csv("../competition/LincolnTemp.csv")
+
+print(datas)
\ No newline at end of file
diff --git a/machine_learning/ml_classifier/__init__.py b/machine_learning/ml_classifier/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/machine_learning/ml_classifier/classification.py b/machine_learning/ml_classifier/classification.py
new file mode 100644
index 0000000..8decbba
--- /dev/null
+++ b/machine_learning/ml_classifier/classification.py
@@ -0,0 +1,266 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+from scipy.stats import ttest_ind
+from scipy.stats import f_oneway
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.linear_model import LogisticRegression
+
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import BernoulliNB
+from sklearn.naive_bayes import GaussianNB
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.svm import SVC
+from sklearn.tree import DecisionTreeClassifier
+from statsmodels.formula.api import ols
+from statsmodels.sandbox.stats.multicomp import MultiComparison
+from statsmodels.stats.anova import anova_lm
+from statsmodels.stats.multitest import multipletests
+
+
+import numpy as np
+
+
+# NOTE: the methods of this class are invoked by name (via getattr) from ml_result/ml_caller.py.
+class ClassifierCollection:
+
+ ## KNN
+ @staticmethod
+ def KNN(X, y, XX): # X,y 分别为训练数据集的数据和标签,XX为测试数据
+ """
+ KNN 分类
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = KNeighborsClassifier(n_neighbors=10) # 默认为5
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def SVM(X, y, XX):
+ """
+ SVM 分类
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = SVC(C=5.0)
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def LR(X, y, XX):
+ """
+ LogisticRegression
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = LogisticRegression(solver='lbfgs', max_iter=1000)
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def CART(X, y, XX):
+ """
+ 决策树(CART)
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = DecisionTreeClassifier()
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def CARTTuning(X, y, XX):
+ """
+ 决策树(CART)
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = DecisionTreeClassifier(random_state=42, max_depth=10, max_leaf_nodes=120)
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def RF(X, y, XX):
+ """
+ 随机森林
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = RandomForestClassifier()
+
+ model.fit(X, y)
+ # print("RandomForestClassifier 使用的分类器")
+ # print(model.estimators_)
+ # print("RandomForestClassifier 使用的分类器")
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def RFTuning(X, y, XX):
+ """
+ 随机森林
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = RandomForestClassifier(random_state=42, max_depth=10, max_leaf_nodes=120)
+
+ model.fit(X, y)
+ # print("RandomForestClassifier 使用的分类器")
+ # print(model.estimators_)
+ # print("RandomForestClassifier 使用的分类器")
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def GBDT(X, y, XX):
+ """
+ (Gradient Boosting Decision Tree)
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = GradientBoostingClassifier()
+ # https://blog.csdn.net/tuanzide5233/article/details/104234246
+
+ model.fit(X, y)
+ # print("GradientBoostingClassifier 使用的分类器")
+ # print(model.estimators_)
+ # print("GradientBoostingClassifier 使用的分类器")
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def GBDTTuning(X, y, XX):
+ """
+ (Gradient Boosting Decision Tree)
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = GradientBoostingClassifier(random_state=42,
+ max_depth=10,
+ max_leaf_nodes=120,
+ n_estimators=100)
+ # https://blog.csdn.net/tuanzide5233/article/details/104234246
+
+ model.fit(X, y)
+ # print("GradientBoostingClassifier 使用的分类器")
+ # print(model.estimators_)
+ # print("GradientBoostingClassifier 使用的分类器")
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def GNB(X, y, XX):
+ """
+ 基于高斯分布求概率
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = GaussianNB()
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def MNB(X, y, XX):
+ """
+ 基于多项式分布求概率
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = MultinomialNB()
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def BNB(X, y, XX):
+ """
+ 基于伯努利分布求概率
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ model = BernoulliNB()
+ model.fit(X, y)
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+ def AdaBoost(X, y, XX):
+ """
+ AdaBoost分类器
+ :param X:
+ :param y:
+ :param XX:
+ :return:
+ """
+ # 要把随机森林、GBDT、AdaBoost的弱分类器设置成CART 2023年02月17日 [DONE]
+ # ![soRpwZ](https://oss.images.shujudaka.com/uPic/soRpwZ.png)
+ model = AdaBoostClassifier(n_estimators=10, random_state=0)
+
+ # AdaBoostClassifier默认使用CART分类树DecisionTreeClassifier,
+ # 而AdaBoostRegressor默认使用CART回归树DecisionTreeRegressor。
+ model.fit(X, y)
+ # print("AdaBoostClassifier 使用的分类器")
+ # print(model.estimators_)
+ # print("AdaBoostClassifier 使用的分类器")
+ predicted = model.predict(XX)
+ return predicted, model
+
+ @staticmethod
+    def AdaBoostTuning(X, y, XX):
+        """
+        AdaBoost classifier with an explicitly configured CART weak learner.
+        :param X: training features
+        :param y: training labels
+        :param XX: features to predict
+        :return: tuple (predictions for XX, fitted model)
+        """
+        # Task (2023-02-17, done): use CART as the weak learner of RF, GBDT and AdaBoost.
+        # ![soRpwZ](https://oss.images.shujudaka.com/uPic/soRpwZ.png)
+        # The keyword is ``estimator``: scikit-learn 1.2 renamed ``base_estimator`` and
+        # later releases reject the old name.
+        model = AdaBoostClassifier(random_state=42,
+                                   estimator=DecisionTreeClassifier(max_depth=10,max_leaf_nodes=120))
+
+        # AdaBoostClassifier uses a CART classification tree (DecisionTreeClassifier) by default,
+        # AdaBoostRegressor a CART regression tree; here the tree is given explicitly to limit its size.
+        model.fit(X, y)
+        # model.estimators_ holds the fitted weak learners, should they need inspecting.
+        predicted = model.predict(XX)
+        return predicted, model
+
+
+
+
diff --git a/machine_learning/ml_regression/__init__.py b/machine_learning/ml_regression/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/machine_learning/ml_regression/regression.py b/machine_learning/ml_regression/regression.py
new file mode 100644
index 0000000..7521be3
--- /dev/null
+++ b/machine_learning/ml_regression/regression.py
@@ -0,0 +1,361 @@
+# 回归算法在scikit-learn中的使用方法:
+#
+# 主要参考网页:https://ster.im/py_sklearn_1/
+# 基础模型:
+#
+# * 线性回归(包含岭回归、Lasso回归、弹性网络回归)
+# * 树回归
+# * 支持向量机回归
+# * K近邻回归
+#
+# 集成模型:
+#
+# * 随机森林回归
+# * 极端随机树回归
+# * AdaBoost回归
+# * Gradient Boosting回归
+
+from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import RidgeCV
+from sklearn.linear_model import LassoCV
+from sklearn.linear_model import ElasticNetCV
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.svm import SVR
+from sklearn.neighbors import KNeighborsRegressor
+from sklearn.ensemble import BaggingRegressor
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.ensemble import AdaBoostRegressor
+from sklearn.ensemble import GradientBoostingRegressor
+
+
+
+# NOTE: the methods of this class are invoked by name (via getattr) from ml_result/ml_caller.py.
+class RegressorCollection:
+ @staticmethod
+ def Linear(X_train,y_train,X_test,y_test=None):
+ """
+ 最小二乘法线性回归
+ 最基本的线性回归法,它接收如下的几个参数:
+
+ fit_intercept:是否考察截距项b,默认为True。
+ normalize:是否先对数据进行Z-score标准化,默认为False。
+ copy_X:默认为True则复制X,否则直接在原X上覆写。
+ n_jobs:使用的处理器核数,默认None表示单核。
+ """
+ reg = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None)
+ reg.fit(X_train, y_train)
+ if y_test is None: # 如果为 None 的话,表示操作,否则是测试。
+ predicted = reg.predict(X_test)
+ return predicted,reg
+ else:
+ reg.score(X_test, y_test) # 回归模型score返回的是R方,下同
+ # 各特征的系数w
+ print("各特征的系数w")
+ print(reg.coef_)
+ # 截距b
+ print("截距b")
+ print(reg.intercept_)
+ return None,None
+
+ @staticmethod
+    def Ridge(X_train,y_train,X_test,y_test=None):
+        """
+        Ridge regression: linear regression with an L2 penalty of strength alpha.
+
+        RidgeCV is used instead of Ridge because it selects alpha from ``alphas``
+        by cross-validation (5 folds here), so no manual tuning is needed.
+
+        Without ``y_test`` returns ``(predictions for X_test, fitted model)``.
+        With ``y_test`` it scores on the test set, prints the chosen alpha and
+        returns ``(None, None)``.
+        """
+        # ``gcv_mode`` and ``store_cv_values`` only restated defaults and are left out;
+        # ``store_cv_values`` was renamed to ``store_cv_results`` in scikit-learn 1.5.
+        reg = RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True, scoring=None, cv=5)
+        reg.fit(X_train, y_train)
+        if y_test is None:
+            predicted = reg.predict(X_test)
+            return predicted, reg
+        else:
+            reg.score(X_test, y_test)  # R^2 on the test set; the value is not used
+            print("正则项系数alpha")
+            print(reg.alpha_)
+            return None,None
+
+ @staticmethod
+ def Lasso(X_train,y_train,X_test,y_test=None):
+ """
+ 带L1正则项的线性回归,常用来估计稀疏参数的高维线性模型。
+
+ 供有Lasso、LassoCV、LassoLars、LassoLarsCV、LassoLarsIC五种可供选择,
+ 带CV的即自动选择最优的正则项系数,带Lars的采用最小角回归法而不带Lars的采用坐标轴
+ 下降法进行损失函数优化。LassoLarsIC采用AIC(Akaike信息准则)或BIC(Bayes信息准则)
+ 确定正则项系数。在大多数回归任务中,首选LassoCV,次选LassoLarsCV。
+ """
+ reg = LassoCV(eps=0.001, n_alphas=100, alphas=None, fit_intercept=True,
+ precompute="auto", max_iter=1000, tol=0.0001,
+ copy_X=True, cv=5, verbose=False, n_jobs=None,
+ positive=False, random_state=None, selection="cyclic")
+ reg.fit(X_train, y_train)
+
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None,None
+
+ @staticmethod
+ def Elastic(X_train,y_train,X_test,y_test=None):
+
+ """
+ 弹性网络回归
+ 同时带有L1和L2正则项的线性回归,使用l1_ratio这一权重参数来分配L1和L2正则项的比重。
+ 常用ElasticNetCV,它会自动选择正则项系数和平衡权重。
+ """
+ reg = ElasticNetCV(l1_ratio=0.5, eps=0.001, n_alphas=100, alphas=None,
+ fit_intercept=True, precompute="auto",
+ max_iter=1000, tol=0.0001, cv=5, copy_X=True, verbose=0,
+ n_jobs=None, positive=False, random_state=None, selection="cyclic")
+ reg.fit(X_train, y_train)
+
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
+ @staticmethod
+ def DecisionTree(X_train,y_train,X_test,y_test=None):
+ """
+ 树回归
+ CART用于回归时,参数与分类器类似,它可以接收如下的参数:
+
+ criterion:分枝的标准,默认"mse"为均方差,可选"friedman_mse"(Friedman均方差)或者"mae"(绝对平均误差)。通常采用默认值。
+ splitter:分枝的策略,默认"best"在所有划分点中找出最优的划分点,适合样本量不大的情况。样本量巨大时建议选择"random",在部分划分点中找局部最优的划分点。
+ max_depth:限制树的最大深度,默认值为None。如果样本和特征很多时可以适当限制树的最大深度。
+ min_samples_split:分割一个节点所需的最小样本数,默认为2,当样本量非常大时可以增加这个值。
+ min_samples_leaf:叶节点上所需的最小样本数,叶节点样本数少于这个值时会被剪枝。默认为1,当样本量非常大时可以增加这个值。
+ min_weight_fraction_leaf:叶节点样本权重和所需的最小值,默认为0即视样本具有相同的权重。
+ max_features:分枝时考虑的特征数量最大值,默认"auto"即该值等于特征数量。可以指定整数或者浮点数(表示占特征总数的比例)。也可选"sqrt"(特征数的开根)、"log2"(特征数的对数)、None(等于特征数)。如果特征数较多可以考虑限制以加快模型拟合。
+ random_state:随机数种子。
+ max_leaf_nodes:叶节点数最大值,默认None不对叶节点数量做限制,如果特征较多可以加以限制。
+ min_impurity_decrease:默认为0.,如果分枝导致不纯度的减少大于等于该值,则节点将被分枝。
+ min_impurity_split:默认为1e-7,如果某节点的不纯度超过这个阈值,则该节会分枝,否则该节点为叶节点。
+ presort:是否对数据进行预排序,以加快寻找最佳分割点。默认为False。当使用小数据集或对深度作限制时,设置为True可能会加速训练,但对于大型数据集则反而会变慢。
+ 我们超参数调优的主要对象为max_depth、min_samples_split、min_samples_leaf、max_features。
+ """
+ # The 'criterion' parameter of DecisionTreeRegressor
+ # must be a str among
+ # {'friedman_mse', 'poisson', 'absolute_error', 'squared_error'}.
+ # Got 'mse' instead.
+ reg = DecisionTreeRegressor(criterion="friedman_mse", splitter="best", max_depth=None,
+ min_samples_split=2, min_samples_leaf=1,
+ min_weight_fraction_leaf=0.0, max_features=None,
+ random_state=None, max_leaf_nodes=None,
+ min_impurity_decrease=0.0)
+ reg.fit(X_train, y_train)
+
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
+ @staticmethod
+ def SVR(X_train,y_train,X_test,y_test=None):
+ """
+ 支持向量机回归
+ 部分参数如下:
+
+ kernel:核函数,默认使用"rbf"径向基函数,可选"linear"、"poly"、"sigmoid"、"precomputed"或者一个可调用的函数。
+ degree:多项式核函数的维度d,仅在核函数选择"poly"时有效。默认值为3。
+ gamma:"rbf"、"poly"、"sigmoid"的系数gamma,默认为"auto",取特征数量的倒数。
+ coef0:核函数中的独立项,仅在核函数选择"poly"、"sigmoid"时有效。默认值为0.0。
+ tol:停止训练的误差精度,默认值为1e-3。
+ C:惩罚系数C,默认值为1.0。
+ max_iter:最大迭代次数,默认为-1即无限制。
+ 最重要的两个调参对象是gamma和C。gamma越大,支持向量越少,gamma越小,支持向量越多。C可理解为逻辑回归中正则项系数lambda的倒数,C过大容易过拟合,C过小容易欠拟合。通常采用网格搜索法进行调参。
+ """
+ reg = SVR(kernel="rbf", degree=3, gamma="auto", coef0=0.0,
+ tol=0.001, C=1.0, epsilon=0.1, shrinking=True,
+ cache_size=200, verbose=False, max_iter=-1)
+ reg.fit(X_train, y_train)
+
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
+ @staticmethod
+ def KNN(X_train,y_train,X_test,y_test=None):
+ """
+ K近邻回归
+ 部分参数如下:
+
+ n_neighbors:最近邻单元的个数K。
+ weights:是否考虑邻居的权重,默认值"uniform"视每个邻居的权重相等,"distance"则给较近的单元更大的权重(取距离的倒数),也可以指定一个可调用的函数。
+ algorithm:计算最近邻的算法,默认"auto"自动挑选模型认为最合适的,可选"ball_tree"、"kd_tree"、"brute"。
+ leaf_size:叶节点数量,默认值30,只有在algorithm选择球树或者KD树时有效。
+ p:闵式距离的度量,p=1时为曼哈顿距离,p=2时为欧式距离(默认)。
+ n_neighbors是最需要关注的超参数,其次weights和p也可以适当调整。
+ """
+ reg = KNeighborsRegressor(n_neighbors=5, weights="uniform", algorithm="auto",
+ leaf_size=30, p=2, metric="minkowski", metric_params=None)
+ reg.fit(X_train, y_train)
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
+ @staticmethod
+    def Bagging(X_train,y_train,X_test,y_test=None):
+        """
+        Ensemble regression model: Bagging
+        Without ``y_test`` returns (predictions for X_test, fitted model); with it, scores and returns (None, None).
+        Parameters passed to BaggingRegressor:
+
+        estimator: base model; None selects a decision tree (the keyword was ``base_estimator`` before scikit-learn 1.2).
+        n_estimators: number of base models, 10 here.
+        max_samples: samples drawn from X_train per base model; an int is a count, a float a fraction (1.0 here).
+        max_features: features drawn from X_train per base model; an int is a count, a float a fraction (1.0 here).
+        bootstrap: whether samples are drawn with replacement (True here).
+        bootstrap_features: whether features are drawn with replacement (False here).
+        oob_score: whether out-of-bag samples are used to estimate the generalisation error.
+        warm_start: False here; if True the next fit starts from the previously fitted ensemble.
+        For every ensemble model n_estimators is the hyper-parameter to tune first (usually by grid search).
+        """
+        reg = BaggingRegressor(estimator=None, n_estimators=10, max_samples=1.0,
+                               max_features=1.0, bootstrap=True, bootstrap_features=False,
+                               oob_score=False, warm_start=False, random_state=None, verbose=0)
+        reg.fit(X_train, y_train)
+        if y_test is None:
+            predicted = reg.predict(X_test)
+            return predicted, reg
+        else:
+            reg.score(X_test, y_test)  # R^2 on the test set; the value is not used
+            return None, None
+
+ @staticmethod
+ def RF(X_train,y_train,X_test,y_test=None):
+ """
+ 随机森林回归
+ 参数:
+
+ n_estimators:树的数量,默认为10。
+ criterion:分枝的标准,默认"mse"为均方差,可选"mae"(绝对平均误差)。
+ max_depth:限制树的最大深度,默认值为None,表示一直分枝直到所有叶节点都是纯的,或者所有叶节点的样本数小于min_samples_split。
+ min_samples_split:分割一个节点所需的最小样本数,默认为2。
+ min_samples_leaf:叶节点上所需的最小样本数,叶节点样本数少于这个值时会被剪枝。默认为1。
+ min_weight_fraction_leaf:叶节点样本权重和所需的最小值,默认为0即视样本具有相同的权重。
+ max_features:分枝时考虑的特征数量最大值,默认"auto"即该值等于特征数量。可以指定整数或者浮点数(表示占特征总数的比例)。也可选"sqrt"(特征数的开根)、"log2"(特征数的对数)、None(等于特征数)。
+ max_leaf_nodes:叶节点数最大值,默认None不对叶节点数量做限制。
+ min_impurity_decrease:默认为0,如果分枝导致不纯度的减少大于等于该值,则节点将被分枝。
+ min_impurity_split:默认为1e-7,如果某节点的不纯度超过这个阈值,则该节会分枝,否则该节点为叶节点。
+ bootstrap:对于样本是否有放回抽样,默认为True。如果为False,则使用整个数据集构建每个树。
+ oob_score:是否使用包外样本估计R方。默认为False。
+ random_state:随机数种子。
+ warm_start:默认为False,如果选择True,下一次训练以上一次模型的参数为初始参数。
+ 除了n_estimators之外,还可以考虑适当调整max_depth、min_samples_split、min_samples_leaf、max_features这些决策树的参数。
+ """
+ # The 'criterion' parameter of RandomForestRegressor
+ # must be a str among {'friedman_mse', 'poisson', 'absolute_error', 'squared_error'}. Got 'mse' instead.
+ reg = RandomForestRegressor(n_estimators=10, criterion="friedman_mse", max_depth=None,
+ min_samples_split=2, min_samples_leaf=1,
+ min_weight_fraction_leaf=0.0,
+ max_leaf_nodes=None, min_impurity_decrease=0.0,
+ bootstrap=True, oob_score=False,
+ random_state=None, verbose=0, warm_start=False)
+ reg.fit(X_train, y_train)
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ # 各特征的重要性
+ reg.feature_importances_
+ print("# 各特征的重要性")
+ print(reg.feature_importances_)
+ return None, None
+
+ @staticmethod
+ def ExtraTree(X_train,y_train,X_test,y_test=None):
+ """
+ 极端随机树回归
+ Extra Tree和随机森林的区别较小,参数几乎一致。
+ """
+ # The 'criterion' parameter of ExtraTreesRegressor must be a str among
+ # {'poisson', 'squared_error', 'absolute_error', 'friedman_mse'}. Got 'mse' instead.
+ reg = ExtraTreesRegressor(n_estimators=10, criterion="friedman_mse", max_depth=None,
+ min_samples_split=2, min_samples_leaf=1,
+ min_weight_fraction_leaf=0.0,
+ max_leaf_nodes=None, min_impurity_decrease=0.0,
+ bootstrap=False, oob_score=False,
+ random_state=None, verbose=0, warm_start=False)
+ reg.fit(X_train, y_train)
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
+ @staticmethod
+ def ADA(X_train,y_train,X_test,y_test=None):
+ """
+ AdaBoost回归
+ 参数:
+
+ base_estimator:弱回归学习器,可指定为任意回归模型对象,默认为None,即DecisionTreeRegressor(max_depth=3)。
+ n_estimators:最大迭代次数,即弱学习器的最大个数,默认为50。
+ learning_rate:每个弱学习器的权重缩减系数,介于0.和1.之间,默认为1.。
+ loss:每次迭代后更新权重时采用的损失函数,默认为"linear",可选"square"、"exponential",通常使用默认值。
+ random_state:随机数种子。
+ n_estimators和learning_rate两个参数相互牵制,通常会一起进行调参。
+ """
+ reg = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=3), n_estimators=50,
+ learning_rate=1.0, loss="linear")
+ reg.fit(X_train, y_train)
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
+ @staticmethod
+ def GB(X_train,y_train,X_test,y_test=None):
+ """
+ Gradient Boosting回归
+ 其中决策树部分的参数不列举。
+
+ loss:损失函数,默认值"ls"代表最小二乘回归,可选"lad"(最小绝对偏差)、"huber"(前两者的结合)和"quantile"(分位数回归)。
+ learning_rate:每棵树的权重缩减系数,默认为0.1,与n_estimators相互牵制,是调参的重点。
+ n_estimators:最大迭代次数,默认为100。
+ subsample:子采样率,用于训练每棵树的样本占样本总数的比例,默认为1.0,如使用小于1.0的值,该模型就为随机梯度提升,会减少方差、增大偏差。
+ init:默认为None,可指定具有fit和predict方法的预测器对象,它用于初始化参数。
+ """
+ # The 'loss' parameter of GradientBoostingRegressor must be a str among
+ # {'squared_error', 'huber', 'quantile', 'absolute_error'}. Got 'ls' instead.
+ reg = GradientBoostingRegressor(loss="squared_error", learning_rate=0.1, n_estimators=100,
+ subsample=1.0, criterion="friedman_mse", min_samples_split=2,
+ min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
+ min_impurity_decrease=0.0, init=None,
+ random_state=None, max_features=None, alpha=0.9, verbose=0,
+ max_leaf_nodes=None, warm_start=False,
+ validation_fraction=0.1, n_iter_no_change=None, tol=0.0001)
+ reg.fit(X_train, y_train)
+ if y_test is None:
+ predicted = reg.predict(X_test)
+ return predicted, reg
+ else:
+ reg.score(X_test, y_test)
+ return None, None
+
diff --git a/machine_learning/ml_result/__init__.py b/machine_learning/ml_result/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/machine_learning/ml_result/ml_caller.py b/machine_learning/ml_result/ml_caller.py
new file mode 100644
index 0000000..3f79f88
--- /dev/null
+++ b/machine_learning/ml_result/ml_caller.py
@@ -0,0 +1,362 @@
+# 一个调度和输出的类
+
+from sklearn.model_selection import learning_curve # 导入学习曲线类
+from machine_learning.model_dump_load import dump_model
+from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib.font_manager import FontProperties
+# !wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Python/miniproject/01_draw_moon_cake/Alibaba-PuHuiTi-Medium.ttf
+from machine_learning.ml_classifier.classification import *
+from machine_learning.ml_regression.regression import *
+
+font_set = FontProperties(fname=r"../../Alibaba-PuHuiTi-Medium.ttf", size=12) ##可以自由下载字体使用
+
+
+# classification_model_methods = ['KNN', 'SVM', "LR", "CART", "RF", "GBDT", 'GNB', 'BNB' , 'AdaBoost']
+# classification_model_methods = ["CART", "RF", "GBDT", 'AdaBoost']
+
+classification_model_methods = ["CART","CARTTuning","RF","RFTuning","GBDT","GBDTTuning",'AdaBoost','AdaBoostTuning']
+
+regression_model_methods = ["Linear","Ridge","Lasso","Elastic","DecisionTree",'KNN','Bagging','RF','ExtraTree','ADA','GB'
+ # "SVR",
+ ]
+def classifier_selection(method, X_train, y_train, X_test):
+    """
+    Look up the ClassifierCollection method called ``method`` and run it.
+    :param method: name of a ClassifierCollection method, e.g. "CART"
+    :param X_train: training features
+    :param y_train: training labels
+    :param X_test: features to predict
+    :return: the selected method's result, a ``(predicted, model)`` tuple
+    :raises ValueError: if ClassifierCollection has no attribute of that name
+    """
+    if not hasattr(ClassifierCollection, method):  # fail here, not later when a caller unpacks None
+        raise ValueError("Unknown classification method: %r" % (method,))
+    return getattr(ClassifierCollection, method)(X_train, y_train, X_test)
+
+def regressor_selection(method, X_train, y_train, X_test):
+    """
+    Look up the RegressorCollection method called ``method`` and run it.
+    :param method: name of a RegressorCollection method, e.g. "Ridge"
+    :param X_train: training features
+    :param y_train: training targets
+    :param X_test: features to predict
+    :return: the selected method's result, a ``(predicted, model)`` tuple
+    :raises ValueError: if RegressorCollection has no attribute of that name
+    """
+    if not hasattr(RegressorCollection, method):  # fail here, not later when a caller unpacks None
+        raise ValueError("Unknown regression method: %r" % (method,))
+    return getattr(RegressorCollection, method)(X_train, y_train, X_test)
+
+def outputCLSScoreInConsole(accScoreDict, recallScoreDict, f1ScoreDict, precisionScoreDict):
+    """
+    Print the four classification score dictionaries to the console, one per line.
+    Each line consists of a fixed label immediately followed by the dictionary.
+    :param accScoreDict: accuracy per method
+    :param recallScoreDict: recall per method
+    :param f1ScoreDict: F1 per method
+    :param precisionScoreDict: precision per method
+    """
+    labelled_scores = (
+        ("准确率 ", accScoreDict),
+        ("召回率 ", recallScoreDict),
+        ("F1 ", f1ScoreDict),
+        ("精准率 ", precisionScoreDict),
+    )
+    for label, scores in labelled_scores:
+        # The label is written without a newline, so the dictionary follows on the same line.
+        print(label, end='')
+        print(scores)
+
+
+def cls_ml_scores(X_train, y_train, X_test, y_test):
+    """
+    Train each classifier named in classification_model_methods and score it on the test data.
+    Side effects: dumps every model, writes the score table as CSV, saves and shows a score chart.
+    :param X_train: training features
+    :param y_train: training labels
+    :param X_test: test features
+    :param y_test: test labels
+    :return: path of the saved chart image
+    """
+    import os  # local import, used only to create the output folders below
+    # https://blog.csdn.net/sinat_26917383/article/details/75199996
+    accScoreDict = {}
+    recallScoreDict = {}
+    f1ScoreDict = {}
+    precisionScoreDict = {}
+
+    # Run every configured classifier (selected by name) and compute its scores
+    for method in classification_model_methods:
+        print("Classfication method----->", method)
+        print("--" * 30)
+        print("X_train Shpae", X_train.shape)
+        print("X_test Shape", X_test.shape)
+        print("y_train Shape", y_train.shape)
+        print("y_test Shape", y_test.shape)
+        print("--"*30)
+
+        # Predictions for X_test and the fitted model
+        _predicted, model = classifier_selection(method, X_train, y_train, X_test)
+
+
+        # Persist the fitted model
+        dump_model(model, 'cls_'+method)
+
+        # Scores
+        # https://blog.csdn.net/lyb3b3b/article/details/84819931
+        accScoreDict[method] = accuracy_score(y_test, _predicted)  # accuracy
+        # average='micro' because the default 'binary' raises ValueError on multiclass targets.
+        # NOTE(review): with single-label targets micro recall/F1/precision equal accuracy - confirm intent.
+        recallScoreDict[method] = recall_score(y_test, _predicted, average='micro')  # recall
+        f1ScoreDict[method] = f1_score(y_test, _predicted, average='micro')  # F1
+        precisionScoreDict[method] = precision_score(y_test, _predicted, average='micro')  # precision
+        # The dictionaries collected so far are printed after every model
+
+        print("当前运行算法", method)
+        outputCLSScoreInConsole(accScoreDict, recallScoreDict, f1ScoreDict, precisionScoreDict)
+
+    # Score lists in the order of classification_model_methods
+    acc_scores = [accScoreDict[method] for method in classification_model_methods]
+    recall_scores = [recallScoreDict[method] for method in classification_model_methods]
+    f1_scores = [f1ScoreDict[method] for method in classification_model_methods]
+    precision_scores = [precisionScoreDict[method] for method in classification_model_methods]
+
+    # to_csv and savefig do not create missing folders, so make sure they exist
+    csv_dir = './results-storage/classification_results'
+    chart_dir = './results-storage/charts/score-barcharts'
+    os.makedirs(csv_dir, exist_ok=True)
+    os.makedirs(chart_dir, exist_ok=True)
+
+    scoresDF = pd.DataFrame().from_dict({
+        'method': classification_model_methods,
+        'acc_score': acc_scores,
+        'recall_score': recall_scores,
+        'f1_score': f1_scores,
+        'precision_score': precision_scores
+    })
+
+    scoresDF.sort_values('acc_score', inplace=True)
+
+    # Keep the per-method scores for the later comparison with ensemble learning.
+    scoresDF.to_csv(
+        csv_dir + '/classfication.csv')
+
+    # https://jakevdp.github.io/PythonDataScienceHandbook/04.01-simple-line-plots.html
+    # Draw plot
+    # import matplotlib.patches as patches
+    # import seaborn as sns
+    #
+    # plots = sns.barplot(x="method", y="score", data=scoresDF)
+    #
+    # # Iterating over the bars one-by-one
+    # for bar in plots.patches:
+    #     # Using Matplotlib's annotate function and
+    #     # passing the coordinates where the annotation shall be done
+    #     plots.annotate(format(bar.get_height(), '.2f'),
+    #                    (bar.get_x() + bar.get_width() / 2, bar.get_height()),
+    #                    ha='center', va='center',
+    #                    size=15, xytext=(0, 5),
+    #                    textcoords='offset points')
+    plt.ylim(0, 1.2)
+    plt.plot(scoresDF['method'], scoresDF['acc_score'], color='blue', label='acc_score')
+    plt.plot(scoresDF['method'], scoresDF['recall_score'], color='g', label='recall_score')
+    plt.plot(scoresDF['method'], scoresDF['f1_score'], color='#FFDD44', label='f1_score')
+    plt.plot(scoresDF['method'], scoresDF['precision_score'], color='0.75', label='precision_score')
+    plt.xlabel("Method")
+    plt.ylabel("Score")
+    plt.legend()  # without this call the label= arguments above are never displayed
+    # Title, Label, Ticks and Ylim
+    plt.title('Bar Chart for cls' ,fontdict={'size': 22})
+
+    # Add patches to color the X axis labels
+    # p1 = patches.Rectangle((.57, -0.005), width=.33, height=.13, alpha=.1, facecolor='green', transform=fig.transFigure)
+    # p2 = patches.Rectangle((.124, -0.005), width=.446, height=.13, alpha=.1, facecolor='red', transform=fig.transFigure)
+    # fig.add_artist(p1)
+    # fig.add_artist(p2)
+    fileName = chart_dir + '/BarChartfor-cls' + ".png"
+    plt.savefig(fileName)
+    plt.show()
+
+    return fileName
+
+
+def plot_learn_curve(task,X_train, y_train, X_test):
+    """
+    Fit every model configured for ``task`` and draw its learning curve via plot_lc.
+
+    A learning curve relates the number of training samples to the model's score
+    and helps to judge over- and under-fitting.  Typically both the training and
+    the validation error settle as more data is used; a small remaining gap
+    between the two curves indicates that the model has converged.
+
+    Side effects: prints a banner per model; plot_lc saves and shows each figure.
+
+    :param task: 'cls' runs classification_model_methods, 'rgs' runs
+        regression_model_methods; any other value runs nothing
+    :param X_train: training features
+    :param y_train: training targets
+    :param X_test: features passed to the model method for prediction
+    :return: None
+    """
+    # Choose the list of method names and the matching selector once, up front.
+    if task == 'cls':
+        methods, select = classification_model_methods, classifier_selection
+    elif task == 'rgs':
+        methods, select = regression_model_methods, regressor_selection
+    else:
+        methods, select = list(), None
+
+    for method in methods:
+        print("*" * 10, '方法', methods,
+              "--模型方法", method, " learn Curve", "*" * 10)
+        # The selector returns (predictions, fitted model); only the model is needed.
+        _predicted, model = select(method, X_train, y_train, X_test)
+
+        plot_lc(X_train, y_train, model, method)
+
+
+def plot_lc(x, y, model, class_model):
+    """
+    Plot, save and show the learning curve of ``model`` on ``(x, y)``.
+    :param x: features handed to ``learning_curve``
+    :param y: targets handed to ``learning_curve``
+    :param model: estimator to evaluate
+    :param class_model: model name, used in the x-axis label and the PNG file name
+    """
+    fig, ax = plt.subplots(1, 1, figsize=(6, 6))  # one 6x6 inch axes
+
+    train_sizes, train_scores, test_scores = learning_curve(model,x,y,cv=20,n_jobs=4)
+    # 20-fold cross-validation of the given model, run with 4 parallel jobs
+    ax.set_ylim((0.5, 1.1))  # y axis is fixed to the range 0.5 .. 1.1
+    ax.set_xlabel("模型" + class_model)
+    ax.set_ylabel("score")
+    ax.grid()
+    # x: number of training samples used; y: score averaged over the folds
+    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color='r', label='train score')
+    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color='g', label='test score')
+    ax.legend(loc='best')
+    plt.savefig("./results-storage/charts/learn_curve/" + class_model + "_learn_curve.png")
+    plt.show()
+    plt.close(fig)  # this runs once per model; without closing, figures pile up in pyplot
+
+def rgs_ml_scores(X_train, y_train, X_test, y_test):
+    """
+    Fit, save and evaluate every regressor listed in regression_model_methods.
+
+    :param X_train: training features
+    :param y_train: training targets
+    :param X_test: test features
+    :param y_test: test targets
+    :return: dict mapping each method name to the metric list returned by lin_regplot
+    """
+    rgs_results = {}
+    for method in regression_model_methods:
+        print("Regression method----->", method)
+        print("--"*30)
+        print("X_train Shpae", X_train.shape)
+        print("X_test Shape", X_test.shape)
+        print("y_train Shape", y_train.shape)
+        print("y_test Shape", y_test.shape)
+        print("--" * 30)
+        # Fit by method name; the predictions are not needed here, only the model.
+        _predicted, model = regressor_selection(method, X_train, y_train, X_test)
+        dump_model(model, 'rgs_'+method)  # persist the fitted model
+        # Residual plot plus MSE / R^2 / MAE on the train and test data.
+        rgs_results[method] = lin_regplot(X_train,y_train,X_test,y_test,model)
+
+    return rgs_results
+
+def lin_regplot(X_train, y_train,X_test,y_test, model):
+    """
+    Show a residual plot for a fitted regressor and report MSE, R^2 and MAE.
+    :param X_train: training features
+    :param y_train: training targets
+    :param X_test: test features
+    :param y_test: test targets
+    :param model: fitted estimator exposing ``predict``
+    :return: list ``[mse_train, mse_test, r2_train, r2_test, mae_train, mae_test]``
+    """
+    from sklearn.metrics import r2_score
+    from sklearn.metrics import mean_squared_error  # mean squared error regression loss
+    from sklearn.metrics import mean_absolute_error
+
+    y_train_pred = model.predict(X_train)
+    y_test_pred = model.predict(X_test)
+    # Residuals (prediction minus truth) plotted against the predicted value.
+    plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data')
+    plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data')
+    plt.xlabel('Predicted values')
+    plt.ylabel('Residuals')
+    plt.legend(loc='upper left')
+    # Fit the x-window to the predictions; a fixed -10..50 window hides data outside that range.
+    x_low = min(np.min(y_train_pred), np.min(y_test_pred))
+    x_high = max(np.max(y_train_pred), np.max(y_test_pred))
+    pad = 0.05 * (x_high - x_low) or 1.0  # keep a margin, also when all predictions coincide
+    plt.hlines(y = 0, xmin = x_low - pad, xmax = x_high + pad, lw = 2, color='red')
+    plt.xlim([x_low - pad, x_high + pad])
+    plt.tight_layout()
+    plt.show()
+
+    # MSE: mean of squared errors (lower is better).  R^2: goodness of fit (closer to 1 is better).
+    # MAE: mean of absolute errors (lower is better).  RMSE would be the square root of MSE.
+    mse_y_train = mean_squared_error(y_train, y_train_pred)
+    mse_y_test = mean_squared_error(y_test, y_test_pred)
+    r2_score_y_train = r2_score(y_train, y_train_pred)
+    r2_score_y_test = r2_score(y_test, y_test_pred)
+    mae_y_train = mean_absolute_error(y_train, y_train_pred)
+    mae_y_test = mean_absolute_error(y_test, y_test_pred)
+    print('MSE train: %.3f, test: %.3f' % (mse_y_train, mse_y_test))
+    print('R^2 train: %.3f, test: %.3f' % (r2_score_y_train, r2_score_y_test))
+    print('MAE train: %.3f, test: %.3f' % (mae_y_train, mae_y_test))
+    print("\n\n")
+    return [mse_y_train, mse_y_test, r2_score_y_train, r2_score_y_test, mae_y_train, mae_y_test]
+
+
+def outputRGSResults(rgs_result:dict):
+    """
+    Print the regression metrics of every model and plot each metric across models.
+
+    Side effects: prints to stdout, changes the global pandas display options
+    and shows a matplotlib figure with six sub-plots (3 rows x 2 columns).
+
+    :param rgs_result: maps a model name to its metric list in the order
+        MSE train, MSE test, R^2 train, R^2 test, MAE train, MAE test
+        (the list built by lin_regplot).
+    :return: None
+    """
+    tips = """
+# 评价线性回归模型的常用指标有以下几种:
+# 1. 均方误差(MSE):用于评估模型预测结果的误差大小,计算方法为平均预测值和真实值之差的平方和除以样本数量。MSE越小说明模型表现越好。
+# 2. 决定系数(R2):用于评估模型对数据的拟合程度,其值介于0到1之间。R2越接近1说明模型对数据的拟合程度越好,越接近0说明模型对数据的拟合程度越差。
+# 3. 均方根误差(RMSE):MSE的平方根,用于度量模型预测结果的标准差,即模型预测结果与真实值之间的平均差异。RMSE越小说明模型预测结果越准确。
+# 4. 平均绝对误差(MAE):用于评估模型预测结果的误差大小,计算方法为平均预测值和真实值之差的绝对值之和除以样本数量。MAE越小说明模型表现越好。
+# 通过对这些指标进行评价,可以对线性回归模型的表现和性能有一个较为全面的了解,进而对模型进行优化和改进。
+    """
+
+    print(tips)
+    # Widen the pandas display limits before the table is printed
+    pd.set_option('display.max_rows', 100)
+    pd.set_option('display.max_columns', 100)
+    pd.set_option('display.width', -1)
+    tempDF = pd.DataFrame.from_dict(rgs_result)  # one column per model, one row per metric
+
+    print(tempDF)
+
+    # Create the figure first and adjust it afterwards: adjusting before plt.figure() acts on
+    # a separate, implicitly created empty figure that plt.show() would display as well.
+    plt.figure(figsize=(24, 10))
+    plt.subplots_adjust(wspace=1, hspace=1)  # spacing between the sub-plots
+    titles = ['MSE-Y-Train', 'MSE-Y-Test', 'R^2-Y-Train', 'R^2-Y-Test',
+              'MAE-Y-Train', 'MAE-Y-Test']
+    for row, title in enumerate(titles):  # row i of tempDF holds metric i for every model
+        plt.subplot(3, 2, row + 1)
+        plt.title(title)
+        plt.plot(tempDF.iloc[row])
+
+    plt.show()
+
+
+
diff --git a/machine_learning/model_dump_load.py b/machine_learning/model_dump_load.py
new file mode 100644
index 0000000..0fe641e
--- /dev/null
+++ b/machine_learning/model_dump_load.py
@@ -0,0 +1,30 @@
+# 导入包,无需pip install
+import pickle
+import joblib
+
+def dump_model(model, modelFileName):
+    """
+    Save a model twice under ./models_dump/: once with pickle and once with joblib.
+
+    The directory is not created here; it has to exist already.
+
+    :param model: object to serialize
+    :param modelFileName: file name suffix; the files written are
+        ./models_dump/pickle_<modelFileName> and ./models_dump/joblib_<modelFileName>
+    :return: None
+    """
+    # "wb": the pickle stream is binary. The with-block closes the file handle
+    # even when pickling fails, instead of leaving it to the garbage collector.
+    with open("./models_dump/pickle_" + modelFileName, "wb") as pickle_file:
+        pickle.dump(model, pickle_file)
+    # joblib takes the target path directly.
+    joblib.dump(model, './models_dump/joblib_' + modelFileName)
+
+
+def load_model(modelFileName):
+    """
+    Load both copies of a model from ./models_dump/ by file name suffix.
+
+    :param modelFileName: suffix used in ./models_dump/pickle_<modelFileName>
+        and ./models_dump/joblib_<modelFileName>
+    :return: tuple (model loaded with pickle, model loaded with joblib)
+    """
+    # NOTE(review): unpickling can execute arbitrary code - only load files you trust.
+    # The with-block closes the file handle once the model has been read.
+    with open("./models_dump/pickle_" + modelFileName, "rb") as pickle_file:
+        loaded_model = pickle.load(pickle_file)
+    loaded_model2 = joblib.load("./models_dump/joblib_" + modelFileName)
+
+    return loaded_model, loaded_model2
diff --git "a/pandas_util/Python346円225円260円346円215円256円347円211円271円345円276円201円345円210円206円346円236円2201円-345円210円206円345円270円203円345円210円206円346円236円220円357円274円210円346円236円201円345円267円256円357円274円214円351円242円221円347円216円207円347円233円264円346円226円271円345円233円276円347円255円211円357円274円211円.md" "b/pandas_util/Python346円225円260円346円215円256円347円211円271円345円276円201円345円210円206円346円236円2201円-345円210円206円345円270円203円345円210円206円346円236円220円357円274円210円346円236円201円345円267円256円357円274円214円351円242円221円347円216円207円347円233円264円346円226円271円345円233円276円347円255円211円357円274円211円.md"
new file mode 100644
index 0000000..a1f860b
--- /dev/null
+++ "b/pandas_util/Python346円225円260円346円215円256円347円211円271円345円276円201円345円210円206円346円236円2201円-345円210円206円345円270円203円345円210円206円346円236円220円357円274円210円346円236円201円345円267円256円357円274円214円351円242円221円347円216円207円347円233円264円346円226円271円345円233円276円347円255円211円357円274円211円.md"
@@ -0,0 +1,198 @@
+## 数据特征分析分为以下部分:
+
+1. 分布分析
+2. 对比分析
+3. 统计分析
+4. 帕累托分析
+5. 正态性检验
+6. 相关性分析
+
+## 数据:
+
+![pedxoR](https://oss.images.shujudaka.com/uPic/pedxoR.jpg)
+
+## 分布分析
+
+分布分析 --> 研究数据的分布特征和分布类型,分定量数据、定性数据
+
+主要是:极差、频率分布情况、分组组距及组数
+
+```python
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import warnings
+
+warnings.filterwarnings('ignore')
+```
+
+```python
+#作散点图:横纵轴放经纬度,单价显示大小,总价显示颜色
+data = pd.read_csv('./datas/second_hand_ house.csv')
+data.head()
+# matplotlib.pyplot.scatter(x, y, s=20, c='b', marker='o', 
+# 	cmap=None, norm=None, vmin=None, vmax=None, alpha=None, 
+# 	linewidths=None, verts=None, hold=None, **kwargs)
+# x,y:表示的是shape大小为(n,)的数组,也就是我们即将绘制散点图的数据点,输入数据。
+# s:表示的是大小,是一个标量或者是一个shape大小为(n,)的数组,可选,默认20。
+# c:表示的是色彩或颜色序列,可选,默认蓝色’b’。但是c不应该是一个单一的RGB数字,也不应该是一个RGBA的序列,
+# 因为不便区分。c可以是一个RGB或RGBA二维行数组。
+# b---blue c---cyan g---green k---black
+# m---magenta r---red w---white y---yellow
+# marker:MarkerStyle,表示的是标记的样式,可选,默认’o’。
+# cmap:Colormap,标量或者是一个colormap的名字,cmap仅仅当c是一个浮点数数组的时候才使用。如果没有申明就是image.cmap,可选,默认None。
+# norm:Normalize,数据亮度在0-1之间,也是只有c是一个浮点数的数组的时候才使用。如果没有申明,就是默认None。
+# vmin,vmax:标量,当norm存在的时候忽略。用来进行亮度数据的归一化,可选,默认None。
+# alpha:标量,0-1之间,可选,默认None。
+# linewidths:标记点边缘线的宽度,默认None。
+
+
+plt.scatter(data['经度'],data['纬度'],
+ s = data['房屋单价']/500,
+ c = data['参考总价'],
+ alpha=0.4,
+ cmap = 'Reds')
+plt.grid()
+print(data.dtypes) #显示各列类型
+print('------\n数据长度%i条'%len(data)) #输出数据长度
+
+```
+
+
+![cvI5PY](https://oss.images.shujudaka.com/uPic/cvI5PY.jpg)
+
+## 极差–对定量字段
+
+```python
+#定义(可以求多列的极差)的函数
+def d_range(df,*cols):
+ krange = []
+ for col in cols:
+ crange = df[col].max()- df[col].min()
+ krange.append(crange)
+ return krange
+
+key1 = '参考首付'
+key2 = '参考总价'
+dr = d_range(data,key1,key2)
+print('%s的极差为%f \n%s的极差为%f'%(key1,dr[0],key2,dr[1]))
+```
+
+## 频率分布情况 - 对定量字段
+
+### 1. 通过直方图直接判断分组组数
+
+```python
+#分组做柱状图
+data[key2].hist(bins=10)
+#简单查看数据分布,确定分布组数 → 一般8-16即可.这里分10组
+```
+
+![CYH7wX](https://oss.images.shujudaka.com/uPic/CYH7wX.jpg)
+
+### 2. 求出分组区间
+
+pd.cut() 分箱
+
+`pd.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')`
+
+- x :一维数组
+- bins :整数、标量序列或者间隔索引,是进行分组的依据。
+  如果填入整数 n,则表示将 x 中的数值分成等宽的 n 份(即每一组内的最大值与最小值之差约相等);
+  如果是标量序列,序列中的数值表示用来分档的分界值
+- right :布尔值,默认为 True,表示包含最右侧的数值,即区间是左开右闭的
+
+**value_counts** 常用于数据表的计数及排序,计算每个不同值有在该列中的个数,同时还能根据需要进行排序。
+
+```python
+gcut = pd.cut(data[key2],10,right=False)
+gcut_count = gcut.value_counts(sort=False)
+#在这里不排序
+data['%s分组区间' % key2] = gcut.values
+#给原表多加一列,写每列数据在的区间
+print(gcut.head(),'\n------')
+print(gcut_count)
+data.head()
+
+```
+
+求出目标字段下频率分布的其他统计量 → 频数,频率,累计频率
+
+**pd.DataFrame()** 创建DataFrame格式
+
+DataFrame是Python中Pandas库中的一种数据结构,它类似excel,是一种二维表。
+
+```python
+r_zj = pd.DataFrame(gcut_count)
+r_zj.rename(columns ={gcut_count.name:'频数'}, inplace = True) # 修改频数字段名
+r_zj['频率'] = r_zj / r_zj['频数'].sum() # 计算频率
+r_zj['累计频率'] = r_zj['频率'].cumsum() # 计算累计频率
+r_zj['频率%'] = r_zj['频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示频率
+r_zj['累计频率%'] = r_zj['累计频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示累计频率
+r_zj.style.bar(subset=['频率','累计频率'], color='green',width=100)
+```
+从上表整理成如下表:
+
+![CIIAbN](https://oss.images.shujudaka.com/uPic/CIIAbN.jpg)
+
+
+## 绘制频率直方图
+
+```python
+r_zj['频率'].plot(kind = 'bar',width = 0.8,figsize = (12,2),rot=0,color = 'k',grid=True,alpha = 0.5)
+plt.title('参考总价分布频率直方图')
+
+x = len(r_zj)
+y = r_zj['频率']
+m = r_zj['频数']
+for i,j,k in zip(range(x),y,m):
+	plt.text(i-0.1,j+0.01,'%i'%k,color = 'k')
+#添加频数标签
+```
+
+![wALSIa](https://oss.images.shujudaka.com/uPic/wALSIa.jpg)
+
+
+## 频率分布情况 - 对定性字段
+
+### 1,通过计数统计判断不同类别的频率
+
+```python
+cx_g = data['朝向'].value_counts(sort=True)
+print(cx_g) # 统计频率,且排了序
+
+r_cx = pd.DataFrame(cx_g)
+r_cx.rename(columns ={cx_g.name:'频数'}, inplace = True) # 修改频数字段名
+r_cx['频率'] = r_cx / r_cx['频数'].sum() # 计算频率
+r_cx['累计频率'] = r_cx['频率'].cumsum() # 计算累计频率
+r_cx['频率%'] = r_cx['频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示频率
+r_cx['累计频率%'] = r_cx['累计频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示累计频率
+r_cx.style.bar(subset=['频率','累计频率'], color='#d65f5f',width=100)
+```
+
+![PHKbR5](https://oss.images.shujudaka.com/uPic/PHKbR5.jpg)
+
+### 2,绘制频率直方图、饼图
+
+```python
+plt.figure(num = 1,figsize = (12,2))
+r_cx['频率'].plot(kind = 'bar',
+ width = 0.8,
+ rot = 0,
+ color = 'k',
+ grid = True,
+ alpha = 0.5)
+plt.title('朝向分布频率直方图')
+# 绘制直方图
+
+plt.figure(num = 2)
+plt.pie(r_cx['频数'],
+ labels = r_cx.index,
+ autopct='%.2f%%',
+ shadow = True)
+plt.axis('equal')
+# 绘制饼图
+```
+
+![uoLzXZ](https://oss.images.shujudaka.com/uPic/uoLzXZ.jpg)
+
+![W4VLLF](https://oss.images.shujudaka.com/uPic/W4VLLF.jpg)
\ No newline at end of file
diff --git a/pandas_util/__init__.py b/pandas_util/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pandas_util/datas/second_hand_ house.csv b/pandas_util/datas/second_hand_ house.csv
new file mode 100644
index 0000000..0f2c84e
--- /dev/null
+++ b/pandas_util/datas/second_hand_ house.csv	
@@ -0,0 +1,76 @@
+﻿房屋编码,小区,朝向,房屋单价,参考首付,参考总价,经度,纬度
+605093949,大望新平村,南北,5434,15,50,114.180964,22.603698
+605768856,通宝楼,南北,3472,7.5,25,114.179298,22.56691
+606815561,罗湖区罗芳村,南北,5842,15.6,52,114.158869,22.547223
+605147285,兴华苑,南北,3829,10.8,36,114.15804,22.554343
+606030866,京基东方都会,西南,47222,51,170,114.149243,22.55437
+605610283,水库新村,南北,5897,13.8,46,114.1454697,22.57018661
+601250774,水库新村,南北,8295,21.9,73,114.1454697,22.57018661
+605525982,水库新村,南北,6145,17.7,59,114.1454697,22.57018661
+606810540,新天地名居,南,51282,60,200,114.1407852,22.55086327
+599540811,翠岭苑,南北,11160,30,100,114.1373593,22.59192018
+606693036,松泉公寓,东,38557,60,200,114.1354523,22.58370781
+606348908,钻石时代,南,45833,49.5,165,114.135184,22.54832501
+605140018,东门E公馆,南,11891,31.5,105,114.134773,22.56072914
+606590991,美园,北,51923,40.5,135,114.1346817,22.54956245
+596462998,京基东方华都,南,62500,60,200,114.1343842,22.55811119
+594298847,京基东方华都,东,52631,60,200,114.1343842,22.55811119
+605560931,长丰苑,西北,38888,42,140,114.1342954,22.54779651
+596278133,长丰苑,南,43023,55.5,185,114.1342954,22.54779651
+605665139,金丽豪苑,东,95238,60,200,114.1338196,22.56984901
+604613670,金丽豪苑,南,80000,60,200,114.1338196,22.56984901
+606637625,愉天小区,北,66574,57.9,193,114.1337433,22.57199097
+606637625,愉天小区,北,66574,57.9,193,114.1337433,22.57199097
+606252043,雅园公寓,南北,8928,15,50,114.1337363,22.55559386
+602026329,雅园公寓,东南,5714,12,40,114.1337363,22.55559386
+602117545,东门168,东西,52173,36,120,114.1334229,22.55591774
+606660644,东门168,西南,61000,36.6,122,114.1334229,22.55591774
+599004917,阳光新干线家园,南,48725,58.47,194.9,114.1334152,22.54417419
+605769039,培峰苑,南北,3835,8.4,28,114.1322949,22.59471036
+605769102,培峰苑,南北,3958,11.4,38,114.1322949,22.59471036
+605769039,培峰苑,南北,3835,8.4,28,114.1322949,22.59471036
+605906134,金色都汇,东,48888,52.8,176,114.131928,22.546667
+604329044,缤纷时代家园,南,63879,49.5,165,114.1311035,22.55740738
+603204276,嘉湖新都,东南,89523,56.4,188,114.1310808,22.57252346
+606779885,嘉湖新都,南,64516,60,200,114.1310808,22.57252346
+605628024,嘉湖新都,南,66000,59.4,198,114.1310808,22.57252346
+604870821,湖润大厦,南北,5058,12.9,43,114.1304169,22.55123329
+605702262,湖润大厦,南北,4545,12,40,114.1304169,22.55123329
+590392825,东门天下,东南,55714,58.5,195,114.1286697,22.55532455
+603513631,田贝花园,东南,9911,27,90,114.1281815,22.57090759
+606616471,田贝花园,南北,9693,28.5,95,114.1281815,22.57090759
+606616471,田贝花园,南北,9693,28.5,95,114.1281815,22.57090759
+598334198,田贝花园,南北,8363,13.8,46,114.1281815,22.57090759
+599340816,田贝花园,南北,9552,26.7,89,114.1281815,22.57090759
+599044788,银座金钻,南,41666,52.5,175,114.128142,22.547827
+604872669,置地逸轩,北,50000,58.5,195,114.1273651,22.54327393
+606625129,置地逸轩,南,50000,58.5,195,114.1273651,22.54327393
+601093683,罗湖村,西北,5113,13.5,45,114.125588,22.541119
+604870556,罗湖村,西北,4772,12.6,42,114.125588,22.541119
+606482810,罗湖村,南北,4545,13.5,45,114.125588,22.541119
+606355577,罗湖村,南北,5842,15.6,52,114.125588,22.541119
+601897164,海丰苑,西南,38000,57,190,114.1245873,22.54687355
+605532790,罗湖1号大楼,南北,4777,12.9,43,114.123972,22.546023
+605279416,罗湖1号大楼,南北,6588,16.8,56,114.123972,22.546023
+606729036,友谊大厦,南,36923,57.6,192,114.1237106,22.54456711
+597559191,金田大厦,南北,5582,15.24,50.8,114.1215574,22.54521646
+606578375,虹桥星座,东,47878,47.4,158,114.1205521,22.5762043
+600682443,虹桥星座,东,47878,47.4,158,114.1205521,22.5762043
+601845711,田心村,南北,5370,17.4,58,114.119935,22.573407
+598296969,时尚新居,南,41818,41.4,138,114.1188431,22.5744648
+603983611,祥福雅居,南,48387,45,150,114.1188049,22.57196808
+606543980,西湖大厦,东,30411,51,170,114.1164398,22.56119537
+606535099,风格名苑,东,55806,51.9,173,114.1159592,22.55708122
+595068607,幸福华府,南北,26388,57,190,114.1143646,22.5544281
+606799436,武警七支队大院,南北,3960,12,40,114.111282,22.557374
+594102300,新闻大厦,南,6052,13.8,46,114.1093938,22.54749824
+606719083,星湖花园三期,西南,41578,47.4,158,114.1058044,22.57372665
+603105329,武警家属大院,东南,4444,12,40,114.1005318,22.57568425
+605322083,恒通花园,南北,4245,13.5,45,114.097673,22.570293
+605244548,恒通花园,南,4128,9.66,32.2,114.097673,22.570293
+601116785,恒通花园,南北,3773,12,40,114.097673,22.570293
+598258845,三九花园,南,5833,12.6,42,114.0895386,22.57707977
+594221866,三九花园,南,5681,15,50,114.0895386,22.57707977
+606700179,城市春天,南北,3571,7.5,25,114.083405,22.5395049
+603950517,皇御苑,东北,59701,54,180,114.0817954,22.53139307
+605232094,晨晖家园,南,54285,57,190,114.0676249,22.52550815
diff --git a/pandas_util/feature_analytis.py b/pandas_util/feature_analytis.py
new file mode 100644
index 0000000..aee7ee4
--- /dev/null
+++ b/pandas_util/feature_analytis.py
@@ -0,0 +1,439 @@
+# 特征分析
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import warnings
+import seaborn as sns
+from scipy import stats
+
+warnings.filterwarnings('ignore')
+from chinese_calendar import is_workday, is_holiday
+
+
+def feature_distribution(datas, columnName):
+    """
+    Print and plot the distribution of one DataFrame column.
+
+    In order: descriptive statistics, value counts, a scatter plot against the row
+    position, a seaborn distplot, a KS normality test, the value range, histograms
+    with 10 and 50 bins, a 10-bin frequency table with bar and pie charts, and a box plot.
+
+    :param datas: DataFrame
+    :param columnName: name of the column to analyse (string; it is concatenated into the printed headings)
+    :return: None
+    """
+
+    columnOfDataFrame = datas[columnName]
+    print("########################## Column " + columnName + '##########################')
+    print('\n\n描述性统计信息,你也可以自己使用下面的方法来做探索')
+    print("""
+编号 函数 描述
+1 count() 非空观测数量
+2 sum() 所有值之和
+3 mean() 所有值的平均值
+4 median() 所有值的中位数
+5 mode() 值的模值
+6 std() 值的标准偏差
+7 min() 所有值中的最小值
+8 max() 所有值中的最大值
+9 abs() 绝对值
+10 prod() 数组元素的乘积
+11 cumsum() 累计总和
+12 cumprod() 累计乘积
+ """)
+    print("---" + columnName + " 描述性信息统计---")
+    print(columnOfDataFrame.describe(include='all'))
+
+    print('\n\n' + columnName + " 列中的唯一值和数量如下:\n")
+    print(datas[columnName].value_counts())
+
+    # Scatter plot: row position on the x axis, column value on the y axis.
+    # NOTE(review): cmap is passed without c, so it presumably has no effect - confirm.
+    plt.title('Scatter')
+    plt.scatter(np.arange(len(columnOfDataFrame)), columnOfDataFrame, alpha=0.4, cmap='Reds')
+    plt.grid()
+    plt.show()
+    # Histogram / density plot of the column.
+    plt.title("Hist")
+    sns.distplot(columnOfDataFrame)
+    plt.show()
+
+    # The bare string below is a no-op statement kept from the original; the same
+    # explanation of the KS test is printed right after it.
+    """
+ kstest方法:KS检验,参数分别是:待检验的数据,检验方法(这里设置成norm正态分布),均值与标准差
+ 结果返回两个值:statistic → D值,pvalue → P值
+ p值大于0.05,为正态分布
+ H0:样本符合 
+ H1:样本不符合 
+ 如何p>0.05接受H0 ,反之 
+ """
+
+    print("""
+kstest方法:KS检验,
+
+参数分别是:待检验的数据,检验方法(这里设置成norm正态分布),均值与标准差
+
+结果返回两个值:statistic → D值,pvalue → P值
+
+p 值大于0.05,为正态分布 H0:样本符合 H1:样本不符合 如何 p>0.05 接受H0 ,反之 
+ """)
+    # Best effort: for columns where mean/std/kstest or the %f formatting fail
+    # (e.g. non-numeric data) the error is printed and the analysis continues.
+    try:
+        u = columnOfDataFrame.mean()
+        std = columnOfDataFrame.std()
+        # KS test against a normal distribution with the column's own mean and std.
+        result = stats.kstest(columnOfDataFrame, 'norm', (u, std))
+        print(result)
+
+        # Range of the variable: max - min.
+        print("变量极差", end='\t')
+        print("Max(%f)-Min(%f) = %f" % (
+            columnOfDataFrame.max(), columnOfDataFrame.min(), columnOfDataFrame.max() - columnOfDataFrame.min()))
+    except Exception as e:
+        print(e)
+        pass
+    # Frequency distribution with a coarse (10) and a fine (50) binning.
+    print("---频率分布情况---")
+    plt.title("Frequency Distribution Bin10")
+    columnOfDataFrame.hist(bins=10)
+    plt.show()
+    plt.title("Frequency Distribution Bin50")
+    columnOfDataFrame.hist(bins=50)
+    plt.show()
+
+    # Split the column into 10 equal-width, left-closed intervals.
+    print("---分组区间---")
+    gcut = pd.cut(columnOfDataFrame, 10, right=False)
+    gcut_count = gcut.value_counts(sort=False)
+    # Counts are kept in interval order (not sorted by frequency).
+    # columnOfDataFrame['分组区间'] = gcut.values
+    # (would add the interval of every row as an extra column)
+    print(gcut.head(), '\n------')
+    print(gcut_count)
+    print(columnOfDataFrame.head())
+
+    # Frequency table: count, frequency, cumulative frequency and their percent strings.
+    r_zj = pd.DataFrame(gcut_count)
+    r_zj.rename(columns={gcut_count.name: '频数'}, inplace=True)  # rename the count column
+    r_zj['频率'] = r_zj / r_zj['频数'].sum()  # frequency
+    r_zj['累计频率'] = r_zj['频率'].cumsum()  # cumulative frequency
+    r_zj['频率%'] = r_zj['频率'].apply(lambda x: "%.2f%%" % (x * 100))  # frequency as a percentage string
+    r_zj['累计频率%'] = r_zj['累计频率'].apply(lambda x: "%.2f%%" % (x * 100))  # cumulative frequency as a percentage string
+    # NOTE(review): the Styler returned here is discarded, so this line has no visible effect.
+    r_zj.style.bar(subset=['频率', '累计频率'], color='green', width=100)
+    # pd.set_option("max_columns", None) # Showing only two columns
+    # pd.set_option("max_rows", None)
+    print("---输出频*表---")
+    print(r_zj)
+
+    # Bar chart of the frequencies, annotated with the absolute counts.
+    r_zj['频率'].plot(kind='bar', width=0.8, figsize=(12, 2), rot=0, color='k', grid=True, alpha=0.5)
+    plt.title('Distribution Hist')
+    x = len(r_zj)
+    y = r_zj['频率']
+    m = r_zj['频数']
+    for i, j, k in zip(range(x), y, m):
+        # Count label slightly above each bar.
+        plt.text(i - 0.1, j + 0.01, '%i' % k, color='k')
+    plt.show()
+
+    # Pie chart of the counts per interval.
+    plt.pie(r_zj['频数'],
+            labels=r_zj.index,
+            autopct='%.2f%%',
+            shadow=True)
+    plt.axis('equal')
+    plt.show()
+
+    # Box plot for a quick visual outlier check.
+    print("---箱线图---")
+    print("""
+简单直观的异常值检测方法:箱形图(箱线图)
+箱形图中,从上到下依次有 6 个数据节点,分别是上界、上四分位、均值、中位数、下四分位、下界。而那些超过上界的值就会被标记为离群点,也就是异常数据。
+ """)
+    # errors='coerce' turns values that cannot be parsed as numbers into NaN.
+    not_null = pd.to_numeric(columnOfDataFrame, errors='coerce')
+    print(not_null)
+    plt.boxplot(not_null)
+    plt.show()
+
+    print('\n\n')
+
+
+def plot_scatter(datas, colX, colY, colHue):
+    """
+    Draw a seaborn lmplot of colY against colX, coloured by colHue, without a regression line.
+
+    NOTE(review): a second function named plot_scatter is defined further down in
+    this module and replaces this one when the module is imported.
+
+    :param datas: DataFrame
+    :param colX: column name for the x axis
+    :param colY: column name for the y axis
+    :param colHue: column name used for the hue; it is concatenated into the title, so it must be a string
+    """
+    import seaborn as sns
+    import matplotlib.pyplot as plt
+    sns.lmplot(data=datas, x=colX, y=colY, hue=colHue, fit_reg=False)
+    plt.xlabel(colX)
+    plt.ylabel(colY)
+    heading = colHue + 'Scatter Plot for ' + colX + " & " + colY
+    plt.title(heading)
+    plt.show()
+
+
+def count_unique(datas, cols):
+    """
+    Print the value counts of every listed column.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names (strings; each is concatenated into the printed heading)
+    """
+    for name in cols:
+        heading = '\n\n' + name + " 列中的唯一值和数量如下:\n"
+        print(heading)
+        counts = datas[name].value_counts()
+        print(counts)
+
+
+# plot_bars(auto_prices, ['fuel_type'])
+# plot_cols = ['make', 'body_style', 'num_of_cylinders']
+# plot_bars(auto_prices, plot_cols)
+def plot_bars(datas, cols):
+    """
+    Show one bar chart per column with the number of rows in each unique category.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names
+    """
+    for name in cols:
+        axis = plt.figure(figsize=(6, 6)).gca()  # new 6x6 figure and its axes
+        category_counts = datas[name].value_counts()  # rows per unique category
+        category_counts.plot.bar(ax=axis, color='blue')
+        axis.set_title('Number of by' + name)
+        axis.set_xlabel(name)
+        axis.set_ylabel('Numbers')
+        plt.show()
+
+
+# num_cols = ['curb_weight', 'engine_size', 'city_mpg', 'price']
+# plot_histogram(auto_prices, num_cols)
+def plot_histogram(datas, cols, bins=10):
+    """
+    Show one histogram per column.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names
+    :param bins: forwarded to the pandas hist plot
+    """
+    for name in cols:
+        axis = plt.figure(figsize=(6, 6)).gca()  # new 6x6 figure and its axes
+        datas[name].plot.hist(ax=axis, bins=bins)
+        axis.set_title('Histogram of ' + name)
+        axis.set_xlabel(name)
+        axis.set_ylabel('Numbers')
+        plt.show()
+
+
+# plot_density_hist(auto_prices, num_cols, bins = 20, hist = True)
+def plot_density_hist(datas, cols, bins=10, hist=False):
+    """
+    Show one seaborn distplot (with rug marks) per column.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names
+    :param bins: forwarded to sns.distplot
+    :param hist: forwarded to sns.distplot; whether the histogram bars are drawn
+    """
+    for name in cols:
+        sns.set_style("whitegrid")
+        series = datas[name]
+        sns.distplot(series, bins=bins, rug=True, hist=hist)
+        plt.title('Histogram of ' + name)
+        plt.xlabel(name)
+        plt.ylabel('Numbers')
+        plt.show()
+
+
+# plot_scatter(auto_prices, ['horsepower'], 'engine_size')
+# num_cols = ['curb_weight', 'engine_size', 'horsepower', 'city_mpg']
+# plot_scatter(auto_prices, num_cols)
+def plot_scatter(datas, cols, col_y='price'):
+    """
+    Show one scatter plot of col_y against every column in cols.
+
+    NOTE(review): this definition replaces the earlier plot_scatter(datas, colX, colY, colHue)
+    defined above in this module, because both share the same name.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param col_y: column name for the y axis
+    """
+    for name in cols:
+        axis = plt.figure(figsize=(7, 6)).gca()  # new 7x6 figure and its axes
+        datas.plot.scatter(x=name, y=col_y, ax=axis)
+        axis.set_title('Scatter plot of ' + col_y + ' vs. ' + name)
+        axis.set_xlabel(name)
+        axis.set_ylabel(col_y)
+        plt.show()
+
+
+# plot_desity_2d(auto_prices, num_cols)
+# plot_desity_2d(auto_prices, num_cols, kind = 'hex')
+def plot_desity_2d(datas, cols, col_y='price', kind='kde'):
+    """
+    Show one seaborn joint plot of col_y against every column in cols.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param col_y: column name for the y axis
+    :param kind: forwarded to sns.jointplot (e.g. 'kde' or 'hex')
+    """
+    for col in cols:
+        sns.set_style("whitegrid")
+        # x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
+        sns.jointplot(x=col, y=col_y, data=datas, kind=kind)
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.show()
+
+
+# cat_cols = ['fuel_type', 'aspiration', 'num_of_doors', 'body_style',
+# 'drive_wheels', 'engine_location', 'engine_type', 'num_of_cylinders']
+# plot_box(auto_prices, cat_cols)
+def plot_box(datas, cols, col_y='price'):
+    """
+    Show one box plot of col_y grouped by every column in cols.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param col_y: column name for the y axis
+    """
+    for col in cols:
+        sns.set_style("whitegrid")
+        # x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
+        sns.boxplot(x=col, y=col_y, data=datas)
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.show()
+
+
+# plot_violin(auto_prices, cat_cols)
+def plot_violin(datas, cols, col_y='price'):
+    """
+    Show one violin plot of col_y grouped by every column in cols.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param col_y: column name for the y axis
+    """
+    for col in cols:
+        sns.set_style("whitegrid")
+        # x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
+        sns.violinplot(x=col, y=col_y, data=datas)
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.show()
+
+
+# num_cols = ['curb_weight', 'engine_size', 'horsepower', 'city_mpg']
+# plot_scatter_shape(auto_prices, num_cols)
+def plot_scatter_shape(datas, cols, shape_col='fuel_type', col_y='price', alpha=0.2):
+    """
+    Show scatter plots of col_y against every column in cols, one marker shape per category of shape_col.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param shape_col: categorical column that selects the marker shape
+    :param col_y: column name for the y axis
+    :param alpha: marker transparency
+    """
+    shapes = ['+', 'o', 's', 'x', '^']  # pick distinctive shapes
+    unique_cats = datas[shape_col].unique()
+    for col in cols:  # loop over the columns to plot
+        sns.set_style("whitegrid")
+        for i, cat in enumerate(unique_cats):  # loop over the unique categories
+            temp = datas[datas[shape_col] == cat]
+            # x/y by keyword: seaborn >= 0.12 rejects them as positional arguments.
+            # The shape index wraps around so more than five categories do not raise IndexError.
+            sns.regplot(x=col, y=col_y, data=temp, marker=shapes[i % len(shapes)], label=cat,
+                        scatter_kws={"alpha": alpha}, fit_reg=False, color='blue')
+        plt.title('Scatter plot of ' + col_y + ' vs. ' + col)  # Give the plot a main title
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.legend()
+        plt.show()
+
+
+# num_cols = ['engine_size', 'horsepower', 'city_mpg']
+# plot_scatter_size(auto_prices, num_cols)
+def plot_scatter_size(datas, cols, shape_col='fuel_type', size_col='curb_weight',
+                      size_mul=0.000025, col_y='price', alpha=0.2):
+    """
+    Like plot_scatter_shape, with the marker area additionally driven by size_col.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param shape_col: categorical column that selects the marker shape
+    :param size_col: numeric column; the marker size is size_mul * size_col ** 2
+    :param size_mul: scale factor for the marker size
+    :param col_y: column name for the y axis
+    :param alpha: marker transparency
+    """
+    shapes = ['+', 'o', 's', 'x', '^']  # pick distinctive shapes
+    unique_cats = datas[shape_col].unique()
+    for col in cols:  # loop over the columns to plot
+        sns.set_style("whitegrid")
+        for i, cat in enumerate(unique_cats):  # loop over the unique categories
+            temp = datas[datas[shape_col] == cat]
+            # x/y by keyword: seaborn >= 0.12 rejects them as positional arguments.
+            # The shape index wraps around so more than five categories do not raise IndexError.
+            sns.regplot(x=col, y=col_y, data=temp, marker=shapes[i % len(shapes)], label=cat,
+                        scatter_kws={"alpha": alpha, "s": size_mul * temp[size_col] ** 2},
+                        fit_reg=False, color='blue')
+        plt.title('Scatter plot of ' + col_y + ' vs. ' + col)  # Give the plot a main title
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.legend()
+        plt.show()
+
+
+# num_cols = ['engine_size', 'horsepower', 'city_mpg']
+# plot_scatter_shape_size_col(auto_prices, num_cols)
+def plot_scatter_shape_size_col(datas, cols, shape_col='fuel_type', size_col='curb_weight',
+                                size_mul=0.000025, color_col='aspiration', col_y='price', alpha=0.2):
+    """
+    Scatter plots of col_y against every column in cols, encoding three more columns:
+    marker shape (shape_col), marker size (size_col) and marker colour (color_col).
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param shape_col: categorical column that selects the marker shape
+    :param size_col: numeric column; the marker size is size_mul * size_col ** 2
+    :param size_mul: scale factor for the marker size
+    :param color_col: categorical column that selects the marker colour
+    :param col_y: column name for the y axis
+    :param alpha: marker transparency
+    """
+    shapes = ['+', 'o', 's', 'x', '^']  # pick distinctive shapes
+    colors = ['green', 'blue', 'orange', 'magenta', 'gray']  # specify distinctive colors
+    unique_cats = datas[shape_col].unique()
+    unique_colors = datas[color_col].unique()
+    for col in cols:  # loop over the columns to plot
+        sns.set_style("whitegrid")
+        for i, cat in enumerate(unique_cats):  # loop over the unique categories
+            for j, color in enumerate(unique_colors):
+                temp = datas[(datas[shape_col] == cat) & (datas[color_col] == color)]
+                # x/y by keyword: seaborn >= 0.12 rejects them as positional arguments.
+                # Shape and colour indices wrap around so more than five categories
+                # do not raise IndexError. The label concatenation expects string categories.
+                sns.regplot(x=col, y=col_y, data=temp, marker=shapes[i % len(shapes)],
+                            scatter_kws={"alpha": alpha, "s": size_mul * temp[size_col] ** 2},
+                            label=(cat + ' and ' + color), fit_reg=False, color=colors[j % len(colors)])
+        plt.title('Scatter plot of ' + col_y + ' vs. ' + col)  # Give the plot a main title
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.legend()
+        plt.show()
+
+
+# plot_violin_hue(auto_prices, cat_cols)
+def plot_violin_hue(datas, cols, col_y='price', hue_col='aspiration'):
+    """
+    Show one split violin plot of col_y grouped by every column in cols, with hue_col as the hue.
+
+    :param datas: DataFrame
+    :param cols: iterable of column names for the x axis
+    :param col_y: column name for the y axis
+    :param hue_col: column used as hue (split=True is passed to seaborn)
+    """
+    for col in cols:
+        sns.set_style("whitegrid")
+        # x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
+        sns.violinplot(x=col, y=col_y, data=datas, hue=hue_col, split=True)
+        plt.xlabel(col)  # Set text for the x axis
+        plt.ylabel(col_y)  # Set text for y axis
+        plt.show()
+
+
+# num_cols = ["curb_weight", "engine_size", "horsepower", "city_mpg", "price", "fuel_type"]
+def plot_scatter_pairplot(datas, num_cols, hue='fuel_type'):
+    """
+    Draw a seaborn pair plot of the given columns with KDE diagonals and KDE upper panels.
+
+    :param datas: DataFrame
+    :param num_cols: list of column names to plot; it must contain the hue column
+    :param hue: column used to colour the points (default 'fuel_type', the value that
+        used to be hard-coded)
+    """
+    # height replaces the size argument, which seaborn renamed.
+    sns.pairplot(datas[num_cols],
+                 hue=hue,
+                 palette="Set2",
+                 diag_kind="kde",
+                 height=2).map_upper(sns.kdeplot, cmap="Blues_d")
+
+
+## Define columns for making a conditioned histogram
+# plot_cols2 = ["length",
+# "curb_weight",
+# "engine_size",
+# "city_mpg",
+# "price"]
+#
+# cond_hists(auto_prices, plot_cols2, 'drive_wheels')
+## Function to plot conditioned histograms
+def cond_hists(df, plot_cols, grid_col):
+    """
+    Draw conditioned histograms: for every column in plot_cols, a FacetGrid with one
+    histogram panel per value of grid_col.
+
+    :param df: DataFrame
+    :param plot_cols: iterable of column names to draw histograms for
+    :param grid_col: column whose values define the grid panels
+    :return: grid_col, unchanged (NOTE(review): the grids themselves are not returned - confirm this is intended)
+    """
+    import matplotlib.pyplot as plt
+    import seaborn as sns
+    for column in plot_cols:
+        facets = sns.FacetGrid(df, col=grid_col)
+        facets.map(plt.hist, column, alpha=.7)
+    return grid_col
+
+
+def sigma3(x):
+    '''
+    Return the index labels of the values that violate the 3-sigma rule.
+
+    x is wrapped in a pandas Series; a value is flagged when it is smaller than
+    mean - 3 * std or larger than mean + 3 * std.
+
+    Background: for normally distributed data the probability of a value lying within
+    1 / 2 / 3 standard deviations of the mean is 0.6826 / 0.9544 / 0.9974, so less
+    than 0.3% of the values are expected outside (mean - 3 * std, mean + 3 * std).
+
+    https://www.guofei.site/2017/10/19/cleandata.html
+
+    :param x: data accepted by pd.Series
+    :return: index of the flagged entries
+    '''
+    x = pd.Series(x)
+    mean_ = x.mean()
+    std_ = x.std()
+    rules = (mean_ - 3 * std_ > x) | (mean_ + 3 * std_ < x)
+    indx = x[rules].index
+    # The outlier values themselves would be x[indx].
+    return indx
+
+
+def eda_profile(data):
+    """
+    Build a ydata_profiling report titled "Profiling Report" for data and write it
+    to data_analysis.html in the current working directory.
+    """
+    from ydata_profiling import ProfileReport
+    profile = ProfileReport(data, title="Profiling Report")
+    profile.to_file("data_analysis.html")
+
+
+def eda_pgw(data):
+    """Open data in pygwalker through pyg.walk."""
+    import pygwalker as pyg
+    gwalker = pyg.walk(data)
+
+
+## Date helpers
+def month_stage(x):
+    """
+    Map a day of the month to its third of the month.
+
+    :param x: day of month
+    :return: 0 for days 1-10, 1 for days 11-20, 2 otherwise
+    """
+    if x in range(1, 11):
+        return 0  # first third of the month
+    elif x in range(11, 21):
+        return 1  # middle third
+    else:
+        return 2  # last third
+
+
+# time
+def time_feature(data, col):
+    """
+    Derive calendar features from a date column and add them to data in place.
+
+    :param data: DataFrame to extend
+    :param col: name of the column holding the dates; it is parsed with pd.to_datetime
+    :return: the same DataFrame with the columns order_date, dayofmonth, dayofweek,
+        month, year, is_month_start, is_month_end, is_workday, is_holiday,
+        in_quarter and in_month_stage added
+    """
+    # Every feature is derived from the parsed dates rather than from data[col],
+    # so a column that still holds strings works as well.
+    dates = pd.to_datetime(data[col])
+    data['order_date'] = dates
+    data['dayofmonth'] = dates.dt.day
+    data['dayofweek'] = dates.dt.dayofweek
+    data['month'] = dates.dt.month
+    data['year'] = dates.dt.year
+    data['is_month_start'] = (dates.dt.is_month_start).astype(int)
+    data['is_month_end'] = (dates.dt.is_month_end).astype(int)
+    # is_workday / is_holiday come from the chinese_calendar package.
+    data['is_workday'] = (dates.apply(lambda x: is_workday(x))).astype(int)
+    data['is_holiday'] = (dates.apply(lambda x: is_holiday(x))).astype(int)
+    data['in_quarter'] = dates.dt.quarter
+    data['in_month_stage'] = data['dayofmonth'].apply(month_stage)
+    return data
+
+
+##
+
+def segments_bins_labels(datas, col, bins, labels):
+    """
+    Bin a column with pd.cut and store the result as a new column 'segements_<col>'.
+
+    :param datas: DataFrame, modified in place
+    :param col: name of the column to bin
+    :param bins: bin specification passed to pd.cut
+    :param labels: labels for the resulting bins
+    :return: the same DataFrame with the new column added
+    """
+
+    # labels has to be passed by keyword: the third positional parameter of pd.cut is `right`.
+    segments = pd.cut(datas[col], bins, labels=labels)
+    datas['segements_' + col] = segments
+    return datas
diff --git a/pandas_util/learning_curve.py b/pandas_util/learning_curve.py
new file mode 100644
index 0000000..df3ee48
--- /dev/null
+++ b/pandas_util/learning_curve.py
@@ -0,0 +1,60 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.model_selection import learning_curve
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+zhfont = mpl.font_manager.FontProperties(fname='/usr/share/fonts/truetype/liberation/simhei.ttf')
+# 
Use sklearn's learning_curve to get the training and cross-validation scores, then draw the learning curve with matplotlib
+
+def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
+                        train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True,zhfont = zhfont):
+    """
+    Compute, and optionally draw, the learning curve of an estimator on (X, y).
+
+    :param estimator: the estimator to evaluate
+    :param title: title of the figure
+    :param X: input features
+    :param y: target vector
+    :param ylim: tuple (ymin, ymax) limiting the y axis, or None
+    :param cv: forwarded to sklearn's learning_curve as cv (None leaves the choice to sklearn)
+    :param n_jobs: number of parallel jobs (default 1)
+    :param train_sizes: forwarded to learning_curve; by default 20 fractions from 0.05 to 1.0
+    :param verbose: forwarded to learning_curve
+    :param plot: draw the curve when True; the figure is created but not shown here
+    :param zhfont: matplotlib font properties used for the Chinese title, axis labels and legend
+    :return: (midpoint, diff) computed at the largest training size, where
+        diff = (train mean + train std) - (cv mean - cv std) and midpoint is the
+        average of those two bounds
+    """
+    train_sizes, train_scores, test_scores = learning_curve(
+        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose)
+
+    # Mean and standard deviation over the cross-validation folds, per training size.
+    train_scores_mean = np.mean(train_scores, axis=1)
+    train_scores_std = np.std(train_scores, axis=1)
+    test_scores_mean = np.mean(test_scores, axis=1)
+    test_scores_std = np.std(test_scores, axis=1)
+
+    if plot:
+        plt.figure()
+        plt.title(title, fontproperties=zhfont)
+        if ylim is not None:
+            plt.ylim(*ylim)
+        plt.xlabel(u"训练样本数", fontproperties=zhfont)
+        plt.ylabel(u"得分", fontproperties=zhfont)
+        # NOTE(review): the y axis is inverted here and inverted again after plotting,
+        # so the two calls presumably cancel each other - confirm before removing.
+        plt.gca().invert_yaxis()
+        plt.grid()
+
+        # Shaded bands: mean +/- one standard deviation.
+        plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std,
+                         alpha=0.1, color="b")
+        plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std,
+                         alpha=0.1, color="r")
+        plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分")
+        plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分")
+
+        plt.legend(loc="best",prop=zhfont)
+
+        # plt.draw()
+        # plt.show()
+        plt.gca().invert_yaxis()
+
+    midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2
+    diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1])
+
+    if diff < 0.05:
+        print("目前 两条曲线的 Diff = ",diff," 看起来还不错!")
+
+    return midpoint, diff
\ No newline at end of file
diff --git 
a/snowflake-gui/__init__.py b/snowflake-gui/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/snowflake-gui/exceptions.py b/snowflake-gui/exceptions.py
new file mode 100644
index 0000000..5091380
--- /dev/null
+++ b/snowflake-gui/exceptions.py
@@ -0,0 +1,6 @@
+class InvalidSystemClock(Exception):
+    """
+    Raised when the system clock has moved backwards.
+    """
+    pass
+
diff --git a/snowflake-gui/snowflake.py b/snowflake-gui/snowflake.py
new file mode 100644
index 0000000..6e0bdfa
--- /dev/null
+++ b/snowflake-gui/snowflake.py
@@ -0,0 +1,109 @@
+# https://www.cnblogs.com/oklizz/p/11865750.html
+
+# Twitter's Snowflake algorithm implementation which is used to generate distributed IDs.
+# https://github.com/twitter-archive/snowflake/blob/snowflake-2010/src/main/scala/com/twitter/service/snowflake/IdWorker.scala
+
+import time
+import logging
+
+from exceptions import InvalidSystemClock
+
+
+# Bit widths of the ID fields
+WORKER_ID_BITS = 5
+DATACENTER_ID_BITS = 5
+SEQUENCE_BITS = 12
+
+# Largest value each field can hold
+MAX_WORKER_ID = -1 ^ (-1 << WORKER_ID_BITS)  # 2**5-1 0b11111
+MAX_DATACENTER_ID = -1 ^ (-1 << DATACENTER_ID_BITS)
+
+# Left-shift offsets of the fields
+WOKER_ID_SHIFT = SEQUENCE_BITS
+DATACENTER_ID_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS
+TIMESTAMP_LEFT_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS + DATACENTER_ID_BITS
+
+# Mask that wraps the sequence number
+SEQUENCE_MASK = -1 ^ (-1 << SEQUENCE_BITS)
+
+# Twitter epoch in milliseconds
+TWEPOCH = 1288834974657
+
+
+logger = logging.getLogger('flask.app')
+
+
+class IdWorker(object):
+    """
+    Generates IDs.
+    """
+
+    def __init__(self, datacenter_id, worker_id, sequence=0):
+        """
+        Initialise the worker.
+        :param datacenter_id: data center (machine region) ID, 0..MAX_DATACENTER_ID
+        :param worker_id: machine ID, 0..MAX_WORKER_ID
+        :param sequence: initial sequence number
+        :raises ValueError: when worker_id or datacenter_id is out of range
+        """
+        # sanity check
+        if worker_id > MAX_WORKER_ID or worker_id < 0:
+            raise ValueError('worker_id值越界')
+
+        if datacenter_id > MAX_DATACENTER_ID or datacenter_id < 0:
+            raise ValueError('datacenter_id值越界')
+
+        self.worker_id = worker_id
+        self.datacenter_id = datacenter_id
+        self.sequence = sequence
+
+        self.last_timestamp = -1  # timestamp used for the previous ID
+
+    def _gen_timestamp(self):
+        """
+        Return the current time as an integer number of milliseconds.
+        :return: int timestamp
+        """
+        millis = time.time() * 1000
+        return int(millis)
+
+    def get_id(self):
+        """
+        Generate the next ID.
+
+        The ID is assembled from the millisecond timestamp relative to TWEPOCH, the
+        datacenter id, the worker id and the sequence number, each shifted by the
+        module-level *_SHIFT constants.
+
+        :return: int ID
+        :raises InvalidSystemClock: when the current timestamp is smaller than the one
+            used for the previous ID
+        """
+        timestamp = self._gen_timestamp()
+
+        # The clock moved backwards
+        if timestamp < self.last_timestamp:
+            # Logged through the module-level logger rather than the root logger.
+            logger.error('clock is moving backwards. Rejecting requests until {}'.format(self.last_timestamp))
+            raise InvalidSystemClock
+
+        if timestamp == self.last_timestamp:
+            # Same millisecond as the previous ID: advance the sequence. A wrap to 0
+            # means the sequence is exhausted, so wait for the next millisecond.
+            self.sequence = (self.sequence + 1) & SEQUENCE_MASK
+            if self.sequence == 0:
+                timestamp = self._til_next_millis(self.last_timestamp)
+        else:
+            self.sequence = 0
+
+        self.last_timestamp = timestamp
+
+        new_id = ((timestamp - TWEPOCH) << TIMESTAMP_LEFT_SHIFT) | (self.datacenter_id << DATACENTER_ID_SHIFT) | \
+                 (self.worker_id << WOKER_ID_SHIFT) | self.sequence
+        return new_id
+
+    def _til_next_millis(self, last_timestamp):
+        """
+        Busy-wait until the clock returns a timestamp greater than last_timestamp.
+        :param last_timestamp: timestamp that must be exceeded
+        :return: the first timestamp greater than last_timestamp
+        """
+        while True:
+            current = self._gen_timestamp()
+            if current > last_timestamp:
+                return current
+
+
+if __name__ == '__main__':
+    # One worker is reused for every ID: the protection against duplicates lives in
+    # its sequence / last_timestamp state, so creating a fresh worker with the same
+    # ids for each call could print identical IDs within one millisecond.
+    worker = IdWorker(1, 2, 0)
+    print(worker.get_id())
+
+    for i in range(10):
+        print(worker.get_id())
diff --git a/tests/pandas_test.py b/tests/pandas_test.py
new file mode 100644
index 0000000..cc374d9
--- /dev/null
+++ b/tests/pandas_test.py
@@ -0,0 +1,12 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+from pandas_util.feature_analytis import *
+
+datas = pd.read_csv('../pandas_util/datas/second_hand_ house.csv')
+# print(datas)
+
+# feature_distribution(datas, columnName) takes the whole DataFrame plus the name of
+# the column to analyse; passing only the column raises a TypeError.
+feature_distribution(datas, '房屋单价')
\ No newline at end of file
</div><div class="naked_ctrl">
<form action="/index.cgi/contrast" method="get" name="gate">
<p><a href="http://altstyle.alfasado.net">AltStyle</a> によって変換されたページ <a href="https://github.com/LicyCommunication/PythonUtils/compare/main...WangLaoShi:PythonUtils:main.diff">(-&gt;オリジナル)</a>
/ <label>アドレス: <input type="text" name="naked_post_url" value="https://github.com/LicyCommunication/PythonUtils/compare/main...WangLaoShi:PythonUtils:main.diff" size="22" /></label> <label>モード: <select name="naked_post_mode">
<option value="default">デフォルト</option>
<option value="speech">音声ブラウザ</option>
<option value="ruby">ルビ付き</option>
<option value="contrast" selected="selected">配色反転</option>
<option value="larger-text">文字拡大</option>
<option value="mobile">モバイル</option>
</select>
<input type="submit" value="表示" />
</p>
</form>
</div>