import datetime
import time

# NOTE(review): this module is added as ``datetime/dt.py`` next to a
# ``datetime/__init__.py``.  With the repository root on ``sys.path`` that
# package can shadow the standard-library ``datetime`` imported above --
# consider renaming the package.


def _midnight_timestamp(day):
    """Return the local-time Unix timestamp (whole seconds) of 00:00:00 on *day*."""
    # ``timetuple()`` carries tm_isdst=-1, so ``mktime`` resolves DST by itself,
    # exactly like the former ``strptime(str(day), '%Y-%m-%d')`` round trip.
    return int(time.mktime(day.timetuple()))


def getTime(today=None):
    """Return the dates around *today* and their local-time day boundaries.

    :param today: ``datetime.date`` to use as the reference day.  Defaults to
        ``datetime.date.today()``; passing it explicitly makes the result
        reproducible.
    :return: a 10-tuple, in this order:
        ``today, yesterday, tomorrow, acquire`` (``acquire`` is the day after
        tomorrow), then the integer Unix timestamps
        ``today_start_time, today_end_time, yesterday_start_time,
        yesterday_end_time, tomorrow_start_time, tomorrow_end_time``.
        A "start" is local midnight of that day; an "end" is one second before
        local midnight of the following day.
    """
    if today is None:
        today = datetime.date.today()

    one_day = datetime.timedelta(days=1)
    yesterday = today - one_day
    tomorrow = today + one_day
    acquire = today + 2 * one_day  # day after tomorrow

    yesterday_start_time = _midnight_timestamp(yesterday)
    today_start_time = _midnight_timestamp(today)
    tomorrow_start_time = _midnight_timestamp(tomorrow)

    # Each day ends one second before the next one starts.
    yesterday_end_time = today_start_time - 1
    today_end_time = tomorrow_start_time - 1
    tomorrow_end_time = _midnight_timestamp(acquire) - 1

    return (
        today, yesterday, tomorrow, acquire,
        today_start_time, today_end_time,
        yesterday_start_time, yesterday_end_time,
        tomorrow_start_time, tomorrow_end_time,
    )
import logging


def log2gray(content, serverIP, ports=12201, inputName='GrayLogMadeByPython'):
    """Send *content* as a DEBUG record to a Graylog GELF/UDP input.

    :param content: message to log.
    :param serverIP: host name or IP address of the Graylog server.
    :param ports: UDP port of the GELF input (default 12201).
    :param inputName: value passed to graypy as ``localname``.
    :return: None

    The record is emitted through the shared logger ``'test_logger'``.  A GELF
    handler is attached to that logger only the first time a given
    ``(serverIP, ports, inputName)`` combination is seen; attaching a new one
    on every call made the N-th call send N copies of the message and left one
    more handler behind each time.  Handlers for other destinations that
    were attached by earlier calls stay attached, as before.
    """
    # Imported here so that importing this module does not require graypy.
    import graypy

    my_logger = logging.getLogger('test_logger')
    my_logger.setLevel(logging.DEBUG)

    destination = (serverIP, ports, inputName)
    already_attached = any(
        getattr(existing, '_log2gray_destination', None) == destination
        for existing in my_logger.handlers
    )
    if not already_attached:
        handler = graypy.GELFUDPHandler(serverIP, ports, localname=inputName)
        # Tag the handler so later calls can recognise it.
        handler._log2gray_destination = destination
        my_logger.addHandler(handler)

    my_logger.debug(content)
class ClassifierCollection:
    """Named scikit-learn classifier recipes that all share one call shape.

    Every public static method takes ``(X, y, XX)`` -- training features,
    training labels and the features to predict -- fits a fresh estimator and
    returns ``(predicted, model)``.  ``ml_caller`` looks the methods up by name
    with ``getattr``, so the method names are part of the public interface.

    The estimator classes are imported inside each method, so this class does
    not depend on the surrounding import header and only loads what is used.
    """

    @staticmethod
    def _fit_predict(model, X, y, XX):
        """Fit *model* on ``(X, y)`` and return ``(model.predict(XX), model)``."""
        model.fit(X, y)
        return model.predict(XX), model

    @staticmethod
    def KNN(X, y, XX):
        """k-nearest-neighbours classifier with ``n_neighbors=10`` (library default is 5).

        :param X: training features
        :param y: training labels
        :param XX: features to predict
        :return: ``(predicted, model)``
        """
        from sklearn.neighbors import KNeighborsClassifier
        return ClassifierCollection._fit_predict(
            KNeighborsClassifier(n_neighbors=10), X, y, XX)

    @staticmethod
    def SVM(X, y, XX):
        """Support-vector classifier, ``SVC(C=5.0)``.

        :return: ``(predicted, model)``
        """
        from sklearn.svm import SVC
        return ClassifierCollection._fit_predict(SVC(C=5.0), X, y, XX)

    @staticmethod
    def LR(X, y, XX):
        """Logistic regression (``lbfgs`` solver, up to 1000 iterations).

        :return: ``(predicted, model)``
        """
        from sklearn.linear_model import LogisticRegression
        return ClassifierCollection._fit_predict(
            LogisticRegression(solver='lbfgs', max_iter=1000), X, y, XX)

    @staticmethod
    def CART(X, y, XX):
        """Decision tree (CART) with library defaults.

        :return: ``(predicted, model)``
        """
        from sklearn.tree import DecisionTreeClassifier
        return ClassifierCollection._fit_predict(DecisionTreeClassifier(), X, y, XX)

    @staticmethod
    def CARTTuning(X, y, XX):
        """Decision tree (CART) limited to depth 10 and 120 leaves, seed 42.

        :return: ``(predicted, model)``
        """
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier(random_state=42, max_depth=10, max_leaf_nodes=120)
        return ClassifierCollection._fit_predict(model, X, y, XX)

    @staticmethod
    def RF(X, y, XX):
        """Random forest with library defaults.

        :return: ``(predicted, model)``
        """
        from sklearn.ensemble import RandomForestClassifier
        return ClassifierCollection._fit_predict(RandomForestClassifier(), X, y, XX)

    @staticmethod
    def RFTuning(X, y, XX):
        """Random forest whose trees are limited to depth 10 and 120 leaves, seed 42.

        :return: ``(predicted, model)``
        """
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(random_state=42, max_depth=10, max_leaf_nodes=120)
        return ClassifierCollection._fit_predict(model, X, y, XX)

    @staticmethod
    def GBDT(X, y, XX):
        """Gradient Boosting Decision Tree classifier with library defaults.

        :return: ``(predicted, model)``
        """
        from sklearn.ensemble import GradientBoostingClassifier
        return ClassifierCollection._fit_predict(GradientBoostingClassifier(), X, y, XX)

    @staticmethod
    def GBDTTuning(X, y, XX):
        """Gradient boosting with 100 trees of depth <= 10 and <= 120 leaves, seed 42.

        :return: ``(predicted, model)``
        """
        from sklearn.ensemble import GradientBoostingClassifier
        model = GradientBoostingClassifier(random_state=42,
                                           max_depth=10,
                                           max_leaf_nodes=120,
                                           n_estimators=100)
        return ClassifierCollection._fit_predict(model, X, y, XX)

    @staticmethod
    def GNB(X, y, XX):
        """Naive Bayes with Gaussian likelihoods.

        :return: ``(predicted, model)``
        """
        from sklearn.naive_bayes import GaussianNB
        return ClassifierCollection._fit_predict(GaussianNB(), X, y, XX)

    @staticmethod
    def MNB(X, y, XX):
        """Naive Bayes with multinomial likelihoods.

        :return: ``(predicted, model)``
        """
        from sklearn.naive_bayes import MultinomialNB
        return ClassifierCollection._fit_predict(MultinomialNB(), X, y, XX)

    @staticmethod
    def BNB(X, y, XX):
        """Naive Bayes with Bernoulli likelihoods.

        :return: ``(predicted, model)``
        """
        from sklearn.naive_bayes import BernoulliNB
        return ClassifierCollection._fit_predict(BernoulliNB(), X, y, XX)

    @staticmethod
    def AdaBoost(X, y, XX):
        """AdaBoost with 10 estimators and seed 0, using the library's default weak learner.

        :return: ``(predicted, model)``
        """
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(n_estimators=10, random_state=0)
        return ClassifierCollection._fit_predict(model, X, y, XX)

    @staticmethod
    def AdaBoostTuning(X, y, XX):
        """AdaBoost (seed 42) over CART trees limited to depth 10 and 120 leaves.

        :return: ``(predicted, model)``
        """
        from sklearn.ensemble import AdaBoostClassifier
        from sklearn.tree import DecisionTreeClassifier
        # ``estimator=`` is the current keyword (``base_estimator=`` was renamed)
        # and matches ``AdaBoostRegressor(estimator=...)`` used by
        # ``RegressorCollection.ADA`` in this project.
        model = AdaBoostClassifier(
            random_state=42,
            estimator=DecisionTreeClassifier(max_depth=10, max_leaf_nodes=120))
        return ClassifierCollection._fit_predict(model, X, y, XX)
class RegressorCollection:
    """Named scikit-learn regressor recipes that all share one call shape.

    Every public static method takes ``(X_train, y_train, X_test, y_test=None)``
    and fits a fresh estimator on the training data.

    * ``y_test is None`` (prediction mode): returns ``(predicted, reg)``.
    * ``y_test`` given (evaluation mode): prints the R^2 returned by
      ``reg.score(X_test, y_test)`` plus any method-specific diagnostics and
      returns ``(None, None)``.

    ``ml_caller`` looks the methods up by name with ``getattr``, so the method
    names are part of the public interface.  Estimator classes are imported
    inside each method so the class does not depend on the surrounding import
    header.
    """

    @staticmethod
    def _run(reg, X_train, y_train, X_test, y_test, report=None):
        """Fit *reg*, then predict or evaluate as described on the class.

        :param report: optional callable ``report(reg)`` that prints
            method-specific diagnostics in evaluation mode.
        """
        reg.fit(X_train, y_train)
        if y_test is None:
            return reg.predict(X_test), reg
        # The score used to be computed and thrown away; show it instead.
        print("R^2:", reg.score(X_test, y_test))
        if report is not None:
            report(reg)
        return None, None

    @staticmethod
    def Linear(X_train, y_train, X_test, y_test=None):
        """Ordinary least-squares linear regression.

        Evaluation mode additionally prints the coefficients and the intercept.
        """
        from sklearn.linear_model import LinearRegression

        def report(reg):
            print("各特征的系数w")
            print(reg.coef_)
            print("截距b")
            print(reg.intercept_)

        reg = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=None)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test, report)

    @staticmethod
    def Ridge(X_train, y_train, X_test, y_test=None):
        """Ridge (L2-regularised) regression; alpha is chosen by 5-fold CV.

        ``RidgeCV`` picks the regularisation strength from ``(0.1, 1.0, 10.0)``.
        Evaluation mode additionally prints the selected alpha.
        """
        from sklearn.linear_model import RidgeCV

        def report(reg):
            print("正则项系数alpha")
            print(reg.alpha_)

        # ``store_cv_values=False`` only restated the default and the keyword has
        # since been renamed by scikit-learn, so it is no longer passed.
        reg = RidgeCV(alphas=(0.1, 1.0, 10.0), fit_intercept=True,
                      scoring=None, cv=5, gcv_mode=None)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test, report)

    @staticmethod
    def Lasso(X_train, y_train, X_test, y_test=None):
        """Lasso (L1-regularised) regression; alpha is chosen by 5-fold CV.

        Uses ``LassoCV`` (coordinate descent) with the library's default alpha
        grid.
        """
        from sklearn.linear_model import LassoCV
        # ``n_alphas=100, alphas=None`` merely restated the defaults (and are
        # believed to be deprecated in newer releases -- TODO confirm); not passed.
        reg = LassoCV(eps=0.001, fit_intercept=True,
                      precompute="auto", max_iter=1000, tol=0.0001,
                      copy_X=True, cv=5, verbose=False, n_jobs=None,
                      positive=False, random_state=None, selection="cyclic")
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def Elastic(X_train, y_train, X_test, y_test=None):
        """Elastic-net regression (L1 and L2 mixed with ``l1_ratio=0.5``), 5-fold CV."""
        from sklearn.linear_model import ElasticNetCV
        # See ``Lasso`` for why ``n_alphas`` / ``alphas`` are left at their defaults.
        reg = ElasticNetCV(l1_ratio=0.5, eps=0.001,
                           fit_intercept=True, precompute="auto",
                           max_iter=1000, tol=0.0001, cv=5, copy_X=True, verbose=0,
                           n_jobs=None, positive=False, random_state=None,
                           selection="cyclic")
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def DecisionTree(X_train, y_train, X_test, y_test=None):
        """CART regression tree with the ``friedman_mse`` split criterion.

        The old ``"mse"`` criterion is rejected by the installed scikit-learn
        (accepted values: ``squared_error``, ``friedman_mse``,
        ``absolute_error``, ``poisson``).  The usual tuning targets are
        ``max_depth``, ``min_samples_split``, ``min_samples_leaf`` and
        ``max_features``.
        """
        from sklearn.tree import DecisionTreeRegressor
        reg = DecisionTreeRegressor(criterion="friedman_mse", splitter="best",
                                    max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0, max_features=None,
                                    random_state=None, max_leaf_nodes=None,
                                    min_impurity_decrease=0.0)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def SVR(X_train, y_train, X_test, y_test=None):
        """Support-vector regression with an RBF kernel and ``gamma="auto"``.

        ``gamma`` and ``C`` are the two hyper-parameters that matter most and
        are normally tuned together with a grid search.
        """
        # The import is local to this method, so the name ``SVR`` below is the
        # scikit-learn class, not this static method.
        from sklearn.svm import SVR
        reg = SVR(kernel="rbf", degree=3, gamma="auto", coef0=0.0,
                  tol=0.001, C=1.0, epsilon=0.1, shrinking=True,
                  cache_size=200, verbose=False, max_iter=-1)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def KNN(X_train, y_train, X_test, y_test=None):
        """k-nearest-neighbours regression (k=5, uniform weights, Euclidean distance).

        ``n_neighbors`` is the main hyper-parameter; ``weights`` and ``p`` are
        secondary.
        """
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(n_neighbors=5, weights="uniform", algorithm="auto",
                                  leaf_size=30, p=2, metric="minkowski",
                                  metric_params=None)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def Bagging(X_train, y_train, X_test, y_test=None):
        """Bagging ensemble of 10 base regressors (library default base model).

        For all ensembles here ``n_estimators`` is the first thing to tune.
        """
        from sklearn.ensemble import BaggingRegressor
        # ``base_estimator=None`` only restated the default and the keyword has
        # been renamed to ``estimator`` by scikit-learn, so it is not passed.
        reg = BaggingRegressor(n_estimators=10, max_samples=1.0,
                               max_features=1.0, bootstrap=True,
                               bootstrap_features=False,
                               oob_score=False, warm_start=False,
                               random_state=None, verbose=0)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def RF(X_train, y_train, X_test, y_test=None):
        """Random-forest regression with 10 ``friedman_mse`` trees.

        Evaluation mode additionally prints the feature importances.
        """
        from sklearn.ensemble import RandomForestRegressor

        def report(reg):
            print("# 各特征的重要性")
            print(reg.feature_importances_)

        reg = RandomForestRegressor(n_estimators=10, criterion="friedman_mse",
                                    max_depth=None,
                                    min_samples_split=2, min_samples_leaf=1,
                                    min_weight_fraction_leaf=0.0,
                                    max_leaf_nodes=None, min_impurity_decrease=0.0,
                                    bootstrap=True, oob_score=False,
                                    random_state=None, verbose=0, warm_start=False)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test, report)

    @staticmethod
    def ExtraTree(X_train, y_train, X_test, y_test=None):
        """Extremely-randomised-trees regression (10 trees, no bootstrap)."""
        from sklearn.ensemble import ExtraTreesRegressor
        reg = ExtraTreesRegressor(n_estimators=10, criterion="friedman_mse",
                                  max_depth=None,
                                  min_samples_split=2, min_samples_leaf=1,
                                  min_weight_fraction_leaf=0.0,
                                  max_leaf_nodes=None, min_impurity_decrease=0.0,
                                  bootstrap=False, oob_score=False,
                                  random_state=None, verbose=0, warm_start=False)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def ADA(X_train, y_train, X_test, y_test=None):
        """AdaBoost regression over depth-3 trees (50 estimators, linear loss).

        ``n_estimators`` and ``learning_rate`` trade off against each other and
        are usually tuned together.
        """
        from sklearn.ensemble import AdaBoostRegressor
        from sklearn.tree import DecisionTreeRegressor
        reg = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=3),
                                n_estimators=50,
                                learning_rate=1.0, loss="linear")
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)

    @staticmethod
    def GB(X_train, y_train, X_test, y_test=None):
        """Gradient-boosting regression (squared-error loss, 100 depth-3 trees).

        The old ``"ls"`` loss name is rejected by the installed scikit-learn;
        ``"squared_error"`` is its current spelling.
        """
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(loss="squared_error", learning_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0, criterion="friedman_mse",
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        min_weight_fraction_leaf=0.0, max_depth=3,
                                        min_impurity_decrease=0.0, init=None,
                                        random_state=None, max_features=None,
                                        alpha=0.9, verbose=0,
                                        max_leaf_nodes=None, warm_start=False,
                                        validation_fraction=0.1,
                                        n_iter_no_change=None, tol=0.0001)
        return RegressorCollection._run(reg, X_train, y_train, X_test, y_test)
# Regressors exercised by ``rgs_ml_scores`` / ``plot_learn_curve``; every name
# must be a static method of ``RegressorCollection``.
regression_model_methods = [
    "Linear", "Ridge", "Lasso", "Elastic", "DecisionTree", 'KNN', 'Bagging',
    'RF', 'ExtraTree', 'ADA', 'GB',
    # "SVR" exists on RegressorCollection but is deliberately left out here.
]


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it does not exist yet."""
    import os
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _dispatch(collection, kind, method, X_train, y_train, X_test):
    """Call the public static method *method* of *collection* with the data.

    :raises ValueError: if *method* is private or not defined on *collection*.
    """
    target_func = None if method.startswith("_") else getattr(collection, method, None)
    if target_func is None:
        raise ValueError("unknown %s method: %r" % (kind, method))
    return target_func(X_train, y_train, X_test)


def _print_shapes(X_train, y_train, X_test, y_test):
    """Print the ``.shape`` of the four data sets between two separator lines."""
    print("--" * 30)
    print("X_train Shpae", X_train.shape)
    print("X_test Shape", X_test.shape)
    print("y_train Shape", y_train.shape)
    print("y_test Shape", y_test.shape)
    print("--" * 30)


def classifier_selection(method, X_train, y_train, X_test):
    """Run the classifier recipe called *method* (looked up by name).

    :param method: name of a static method of ``ClassifierCollection``
    :param X_train: training features
    :param y_train: training labels
    :param X_test: features to predict
    :return: ``(predicted, model)`` as returned by the recipe
    :raises ValueError: if *method* is not a known recipe (previously ``None``
        was returned, which made every caller fail on tuple unpacking)
    """
    return _dispatch(ClassifierCollection, "classifier", method,
                     X_train, y_train, X_test)


def regressor_selection(method, X_train, y_train, X_test):
    """Run the regressor recipe called *method* (looked up by name).

    :param method: name of a static method of ``RegressorCollection``
    :param X_train: training features
    :param y_train: training targets
    :param X_test: features to predict
    :return: ``(predicted, model)`` as returned by the recipe
    :raises ValueError: if *method* is not a known recipe
    """
    return _dispatch(RegressorCollection, "regressor", method,
                     X_train, y_train, X_test)


def outputCLSScoreInConsole(accScoreDict, recallScoreDict, f1ScoreDict, precisionScoreDict):
    """Print the four per-method score dictionaries, one labelled line each.

    :param accScoreDict: accuracy per method
    :param recallScoreDict: recall per method
    :param f1ScoreDict: F1 per method
    :param precisionScoreDict: precision per method
    """
    print("准确率 ", accScoreDict, sep='')
    print("召回率 ", recallScoreDict, sep='')
    print("F1 ", f1ScoreDict, sep='')
    print("精准率 ", precisionScoreDict, sep='')


def cls_ml_scores(X_train, y_train, X_test, y_test, average='micro'):
    """Train every classifier in ``classification_model_methods`` and score it.

    For each method the model is fitted, saved with ``dump_model`` under
    ``'cls_' + method`` and scored on the test data.  The scores are written to
    a CSV file and drawn as one line per metric.

    :param X_train: training features (must expose ``.shape``)
    :param y_train: training labels
    :param X_test: test features
    :param y_test: test labels
    :param average: averaging mode passed to recall / F1 / precision.  With the
        default ``'micro'`` and single-label targets these three equal the
        accuracy, so the four curves coincide; pass ``'macro'`` or
        ``'weighted'`` to tell them apart.
    :return: path of the saved chart
    """
    accScoreDict = {}
    recallScoreDict = {}
    f1ScoreDict = {}
    precisionScoreDict = {}

    for method in classification_model_methods:
        print("Classfication method----->", method)
        _print_shapes(X_train, y_train, X_test, y_test)

        _predicted, model = classifier_selection(method, X_train, y_train, X_test)
        dump_model(model, 'cls_' + method)

        accScoreDict[method] = accuracy_score(y_test, _predicted)
        # ``average`` must not be 'binary' for multiclass targets.
        recallScoreDict[method] = recall_score(y_test, _predicted, average=average)
        f1ScoreDict[method] = f1_score(y_test, _predicted, average=average)
        precisionScoreDict[method] = precision_score(y_test, _predicted, average=average)

        print("当前运行算法", method)
        outputCLSScoreInConsole(accScoreDict, recallScoreDict, f1ScoreDict, precisionScoreDict)

    scoresDF = pd.DataFrame({
        'method': classification_model_methods,
        'acc_score': [accScoreDict[m] for m in classification_model_methods],
        'recall_score': [recallScoreDict[m] for m in classification_model_methods],
        'f1_score': [f1ScoreDict[m] for m in classification_model_methods],
        'precision_score': [precisionScoreDict[m] for m in classification_model_methods],
    })
    scoresDF.sort_values('acc_score', inplace=True)

    # Kept so that later ensemble experiments can be compared against it.
    csv_path = './results-storage/classification_results/classfication.csv'
    _ensure_parent_dir(csv_path)
    scoresDF.to_csv(csv_path)

    # Draw on a figure of our own so nothing left over from an earlier plot
    # ends up in this chart, and release it afterwards.
    fig = plt.figure()
    plt.ylim(0, 1.2)
    plt.plot(scoresDF['method'], scoresDF['acc_score'], color='blue', label='acc_score')
    plt.plot(scoresDF['method'], scoresDF['recall_score'], color='g', label='recall_score')
    plt.plot(scoresDF['method'], scoresDF['f1_score'], color='#FFDD44', label='f1_score')
    plt.plot(scoresDF['method'], scoresDF['precision_score'], color='0.75', label='precision_score')
    plt.xlabel("Method")
    plt.ylabel("Score")
    plt.title('Bar Chart for cls', fontdict={'size': 22})

    fileName = './results-storage/charts/score-barcharts/BarChartfor-cls' + ".png"
    _ensure_parent_dir(fileName)
    plt.savefig(fileName)
    plt.show()
    plt.close(fig)

    return fileName


def plot_learn_curve(task, X_train, y_train, X_test):
    """Fit every model of the chosen task and plot its learning curve.

    A learning curve shows training and cross-validation score against the
    number of training samples, which helps to spot over- and under-fitting.

    :param task: ``'cls'`` for the classifiers in
        ``classification_model_methods`` or ``'rgs'`` for the regressors in
        ``regression_model_methods``
    :param X_train: training features
    :param y_train: training labels / targets
    :param X_test: features handed to the recipe for its prediction step
    :raises ValueError: if *task* is neither ``'cls'`` nor ``'rgs'``
        (previously this silently did nothing)
    """
    if task == 'cls':
        methods, select = classification_model_methods, classifier_selection
    elif task == 'rgs':
        methods, select = regression_model_methods, regressor_selection
    else:
        raise ValueError("task must be 'cls' or 'rgs', got %r" % (task,))

    for method in methods:
        print("*" * 10, '方法', methods,
              "--模型方法", method, " learn Curve", "*" * 10)
        _predicted, model = select(method, X_train, y_train, X_test)
        plot_lc(X_train, y_train, model, method)


def plot_lc(x, y, model, class_model, cv=20, n_jobs=4, ylim=(0.5, 1.1)):
    """Plot and save the learning curve of *model* on ``(x, y)``.

    :param x: features
    :param y: labels / targets
    :param model: estimator handed to ``sklearn.model_selection.learning_curve``
    :param class_model: model name; used in the x-axis label and the file name
    :param cv: number of cross-validation folds (default 20, as before)
    :param n_jobs: number of parallel workers (default 4, as before)
    :param ylim: y-axis range of the chart (default ``(0.5, 1.1)``, as before);
        widen it for regressors whose score may fall below 0.5
    """
    fig, ax = plt.subplots(1, 1, figsize=(6, 6))

    train_sizes, train_scores, test_scores = learning_curve(model, x, y, cv=cv, n_jobs=n_jobs)
    ax.set_ylim(ylim)
    ax.set_xlabel("模型" + class_model)
    ax.set_ylabel("score")
    ax.grid()
    # x: number of training samples; y: mean score over the folds.
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color='r', label='train score')
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color='g', label='test score')
    ax.legend(loc='best')

    out_path = "./results-storage/charts/learn_curve/" + class_model + "_learn_curve.png"
    _ensure_parent_dir(out_path)
    plt.savefig(out_path)
    plt.show()
    plt.close(fig)


def rgs_ml_scores(X_train, y_train, X_test, y_test):
    """Train every regressor in ``regression_model_methods`` and score it.

    Each fitted model is saved with ``dump_model`` under ``'rgs_' + method``
    and evaluated with ``lin_regplot``.

    :return: dict mapping method name to the list returned by ``lin_regplot``
    """
    rgs_results = {}

    for method in regression_model_methods:
        print("Regression method----->", method)
        _print_shapes(X_train, y_train, X_test, y_test)

        _predicted, model = regressor_selection(method, X_train, y_train, X_test)
        dump_model(model, 'rgs_' + method)

        rgs_results[method] = lin_regplot(X_train, y_train, X_test, y_test, model)

    return rgs_results


def lin_regplot(X_train, y_train, X_test, y_test, model, xlim=(-10, 50)):
    """Show a residual plot for *model* and return its error metrics.

    :param X_train: training features
    :param y_train: training targets
    :param X_test: test features
    :param y_test: test targets
    :param model: fitted estimator with a ``predict`` method
    :param xlim: x-axis range (predicted values) of the residual plot.  The
        default ``(-10, 50)`` is the range that used to be hard-coded; pass
        ``None`` to derive the range from the predictions.
    :return: ``[mse_train, mse_test, r2_train, r2_test, mae_train, mae_test]``
    """
    from sklearn.metrics import mean_absolute_error
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import r2_score

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    if xlim is None:
        low = float(min(np.min(y_train_pred), np.min(y_test_pred)))
        high = float(max(np.max(y_train_pred), np.max(y_test_pred)))
        margin = 0.05 * (high - low) or 1.0
        xlim = (low - margin, high + margin)
    x_min, x_max = xlim

    # Residuals (prediction minus truth) against the predicted value.
    fig = plt.figure()
    plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data')
    plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=x_min, xmax=x_max, lw=2, color='red')
    plt.xlim([x_min, x_max])
    plt.tight_layout()
    plt.show()
    plt.close(fig)

    mse_y_train = mean_squared_error(y_train, y_train_pred)
    mse_y_test = mean_squared_error(y_test, y_test_pred)

    r2_score_y_train = r2_score(y_train, y_train_pred)
    r2_score_y_test = r2_score(y_test, y_test_pred)

    mae_y_train = mean_absolute_error(y_train, y_train_pred)
    mae_y_test = mean_absolute_error(y_test, y_test_pred)

    print('MSE train: %.3f, test: %.3f' % (mse_y_train, mse_y_test))
    print('R^2 train: %.3f, test: %.3f' % (r2_score_y_train, r2_score_y_test))
    print('MAE train: %.3f, test: %.3f' % (mae_y_train, mae_y_test))
    print("\n\n")

    return [mse_y_train, mse_y_test, r2_score_y_train, r2_score_y_test, mae_y_train, mae_y_test]


def outputRGSResults(rgs_result: dict):
    """Print and plot the metrics collected by ``rgs_ml_scores``.

    :param rgs_result: dict mapping method name to
        ``[mse_train, mse_test, r2_train, r2_test, mae_train, mae_test]``

    The table is printed with widened pandas display limits that apply only to
    this call; the six metrics are then drawn in a 3x2 grid, one line per
    metric across the methods.
    """
    tips = """
# 评价线性回归模型的常用指标有以下几种:
# 1. 均方误差(MSE):用于评估模型预测结果的误差大小,计算方法为平均预测值和真实值之差的平方和除以样本数量。MSE越小说明模型表现越好。
# 2. 决定系数(R2):用于评估模型对数据的拟合程度,其值介于0到1之间。R2越接近1说明模型对数据的拟合程度越好,越接近0说明模型对数据的拟合程度越差。
# 3. 均方根误差(RMSE):MSE的平方根,用于度量模型预测结果的标准差,即模型预测结果与真实值之间的平均差异。RMSE越小说明模型预测结果越准确。
# 4. 平均绝对误差(MAE):用于评估模型预测结果的误差大小,计算方法为平均预测值和真实值之差的绝对值之和除以样本数量。MAE越小说明模型表现越好。
# 通过对这些指标进行评价,可以对线性回归模型的表现和性能有一个较为全面的了解,进而对模型进行优化和改进。
    """
    print(tips)

    tempDF = pd.DataFrame.from_dict(rgs_result)
    # ``None`` is the documented "auto-detect" width; the context manager keeps
    # the wider limits from leaking into the caller's pandas configuration.
    with pd.option_context('display.max_rows', 100,
                           'display.max_columns', 100,
                           'display.width', None):
        print(tempDF)

    # Create the figure first so the spacing applies to it (the adjustment used
    # to run before the figure existed and therefore hit a different figure).
    fig = plt.figure(figsize=(24, 10))
    plt.subplots_adjust(wspace=1, hspace=1)
    titles = ['MSE-Y-Train', 'MSE-Y-Test', 'R^2-Y-Train',
              'R^2-Y-Test', 'MAE-Y-Train', 'MAE-Y-Test']
    for row, title in enumerate(titles):
        plt.subplot(3, 2, row + 1)
        plt.title(title)
        plt.plot(tempDF.iloc[row])

    plt.show()
    plt.close(fig)


# --- machine_learning/model_dump_load.py ------------------------------------

def dump_model(model, modelFileName):
    """Save *model* twice under ``./models_dump/``: with pickle and with joblib.

    :param model: object to persist
    :param modelFileName: base name; the files are ``pickle_<name>`` and
        ``joblib_<name>``
    :return: None
    """
    import pickle

    import joblib

    pickle_path = "./models_dump/pickle_" + modelFileName
    _ensure_parent_dir(pickle_path)
    # ``with`` closes the file even if pickling fails.
    with open(pickle_path, "wb") as handle:
        pickle.dump(model, handle)
    joblib.dump(model, './models_dump/joblib_' + modelFileName)


def load_model(modelFileName):
    """Load the two copies of a model written by ``dump_model``.

    :param modelFileName: base name that was passed to ``dump_model``
    :return: ``(pickle_model, joblib_model)``

    NOTE: unpickling executes code embedded in the file -- only load model
    files that come from a trusted source.
    """
    import pickle

    import joblib

    with open("./models_dump/pickle_" + modelFileName, "rb") as handle:
        loaded_model = pickle.load(handle)
    loaded_model2 = joblib.load("./models_dump/joblib_" + modelFileName)

    return loaded_model, loaded_model2
"a/pandas_util/Python346円225円260円346円215円256円347円211円271円345円276円201円345円210円206円346円236円2201円-345円210円206円345円270円203円345円210円206円346円236円220円357円274円210円346円236円201円345円267円256円357円274円214円351円242円221円347円216円207円347円233円264円346円226円271円345円233円276円347円255円211円357円274円211円.md" "b/pandas_util/Python346円225円260円346円215円256円347円211円271円345円276円201円345円210円206円346円236円2201円-345円210円206円345円270円203円345円210円206円346円236円220円357円274円210円346円236円201円345円267円256円357円274円214円351円242円221円347円216円207円347円233円264円346円226円271円345円233円276円347円255円211円357円274円211円.md" new file mode 100644 index 0000000..a1f860b --- /dev/null +++ "b/pandas_util/Python346円225円260円346円215円256円347円211円271円345円276円201円345円210円206円346円236円2201円-345円210円206円345円270円203円345円210円206円346円236円220円357円274円210円346円236円201円345円267円256円357円274円214円351円242円221円347円216円207円347円233円264円346円226円271円345円233円276円347円255円211円357円274円211円.md" @@ -0,0 +1,198 @@ +## 数据特征分析分为以下部分: + +1. 分布分析 +2. 对比分析 +3. 统计分析 +4. 帕累托分析 +5. 正态性检验 +6. 相关性分析 + +## 数据: + +![pedxoR](https://oss.images.shujudaka.com/uPic/pedxoR.jpg) + +## 分布分析 + +分布分析 --> 研究数据的分布特征和分布类型,分定量数据、定性数据 + +主要是:极差、频率分布情况、分组组距及组数 + +```python +import numpy as np +import pandas_util as pd +import matplotlib.pyplot as plt +import warnings + +warnings.filterwarnings('ignore') +``` + +```python +#作散点图:横纵轴放经纬度,单价显示大小,总价显示颜色 +data = pd.read_csv('./datas/second_hand_ house.csv') +data.head() +# matplotlib.pyplot.scatter(x, y, s=20, c='b', marker='o', +# cmap=None, norm=None, vmin=None, vmax=None, alpha=None, +# linewidths=None, verts=None, hold=None, **kwargs) +# x,y:表示的是shape大小为(n,)的数组,也就是我们即将绘制散点图的数据点,输入数据。 +# s:表示的是大小,是一个标量或者是一个shape大小为(n,)的数组,可选,默认20。 +# c:表示的是色彩或颜色序列,可选,默认蓝色’b’。但是c不应该是一个单一的RGB数字,也不应该是一个RGBA的序列, +# 因为不便区分。c可以是一个RGB或RGBA二维行数组。 +# b---blue c---cyan g---green k---black +# m---magenta r---red w---white y---byellow +# marker:MarkerStyle,表示的是标记的样式,可选,默认’o’。 +# cmap:Colormap,标量或者是一个colormap的名字,cmap仅仅当c是一个浮点数数组的时候才使用。如果没有申明就是image.cmap,可选,默认None。 +# 
norm:Normalize,数据亮度在0-1之间,也是只有c是一个浮点数的数组的时候才使用。如果没有申明,就是默认None。 +# vmin,vmax:标量,当norm存在的时候忽略。用来进行亮度数据的归一化,可选,默认None。 +# alpha:标量,0-1之间,可选,默认None。 +# linewidths:也就是标记点的长度,默认None。 + + +plt.scatter(data['经度'],data['纬度'], + s = data['房屋单价']/500, + c = data['参考总价'], + alpha=0.4, + cmap = 'Reds') +plt.grid() +print(data.dtypes) #显示各列类型 +print('------\n数据长度%i条'%len(data)) #输出数据长度 + +``` + + +![cvI5PY](https://oss.images.shujudaka.com/uPic/cvI5PY.jpg) + +## 极差–对定量字段 + +```python +#定义(可以求多列的极差)的函数 +def d_range(df,*cols): + krange = [] + for col in cols: + crange = df[col].max()- df[col].min() + krange.append(crange) + return krange + +key1 = '参考首付' +key2 = '参考总价' +dr = d_range(data,key1,key2) +print('%s的极差为%f \n%s的极差为%f'%(key1,dr[0],key2,dr[1])) +``` + +## 频率分布情况 - 对定量字段 + +1.通过直方图直接判断分组组数 + +```python +#分组做柱状图 +data[key2].hist(bins=10) +#简单查看数据分布,确定分布组数 → 一般8-16即可.这里分10组 +``` + +![CYH7wX](https://oss.images.shujudaka.com/uPic/CYH7wX.jpg) + +## 2.求出分组区间 + +pd.cut() 分箱 + +pd.cut(x,bins,right=True,labels=None,retbins=False,precision=3,include_lowest=False,duplicates=‘raise’) +x : 一维数组 +bins :整数,标量序列或者间隔索引,是进行分组的依据, +如果填入整数n,则表示将x中的数值分成等宽的n份(即每一组内的最大值与最小值之差约相等); +如果是标量序列,序列中的数值表示用来分档的分界值 +right :布尔值,默认为True表示包含最右侧的数值,即区间是左开右闭的 + +**value_counts** 常用于数据表的计数及排序,计算每个不同值有在该列中的个数,同时还能根据需要进行排序。 + +```python +gcut = pd.cut(data[key2],10,right=False) +gcut_count = gcut.value_counts(sort=False) +#在这里不排序 +data['%s分组区间' % key2] = gcut.values +#给原表多加一列,写每列数据在的区间 +print(gcut.head(),'\n------') +print(gcut_count) +data.head() + +``` + +求出目标字段下频率分布的其他统计量 → 频数,频率,累计频率 + +**pd.DataFrame()** 创建DataFrame格式 + +DataFrame是Python中Pandas库中的一种数据结构,它类似excel,是一种二维表。 + +```python +r_zj = pd.DataFrame(gcut_count) +r_zj.rename(columns ={gcut_count.name:'频数'}, inplace = True) # 修改频数字段名 +r_zj['频率'] = r_zj / r_zj['频数'].sum() # 计算频率 +r_zj['累计频率'] = r_zj['频率'].cumsum() # 计算累计频率 +r_zj['频率%'] = r_zj['频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示频率 +r_zj['累计频率%'] = r_zj['累计频率'].apply(lambda x: "%.2f%%" % 
(x*100)) # 以百分比显示累计频率 +r_zj.style.bar(subset=['频率','累计频率'], color='green',width=100) +``` +从上表整理成如下表: + +![CIIAbN](https://oss.images.shujudaka.com/uPic/CIIAbN.jpg) + + +## 绘制频率直方图 + +```python +r_zj['频率'].plot(kind = 'bar',width = 0.8,figsize = (12,2),rot=0,color = 'k',grid=True,alpha = 0.5) +plt.title('参考总价分布频率直方图') + +x = len(r_zj) +y = r_zj['频率'] +m = r_zj['频数'] +for i,j,k in zip(range(x),y,m): + plt.text(i-0.1,j+0.01,'%i'%k,color = 'k') +#添加频数标签 +``` + +![wALSIa](https://oss.images.shujudaka.com/uPic/wALSIa.jpg) + + +## 频率分布情况 - 对定性字段 + +### 1,通过计数统计判断不同类别的频率 + +```python +cx_g = data['朝向'].value_counts(sort=True) +print(cx_g) # 统计频率,且排了序 + +r_cx = pd.DataFrame(cx_g) +r_cx.rename(columns ={cx_g.name:'频数'}, inplace = True) # 修改频数字段名 +r_cx['频率'] = r_cx / r_cx['频数'].sum() # 计算频率 +r_cx['累计频率'] = r_cx['频率'].cumsum() # 计算累计频率 +r_cx['频率%'] = r_cx['频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示频率 +r_cx['累计频率%'] = r_cx['累计频率'].apply(lambda x: "%.2f%%" % (x*100)) # 以百分比显示累计频率 +r_cx.style.bar(subset=['频率','累计频率'], color='#d65f5f',width=100) +``` + +![PHKbR5](https://oss.images.shujudaka.com/uPic/PHKbR5.jpg) + +### 2,绘制频率直方图、饼图 + +```python +plt.figure(num = 1,figsize = (12,2)) +r_cx['频率'].plot(kind = 'bar', + width = 0.8, + rot = 0, + color = 'k', + grid = True, + alpha = 0.5) +plt.title('参考总价分布频率直方图') +# 绘制直方图 + +plt.figure(num = 2) +plt.pie(r_cx['频数'], + labels = r_cx.index, + autopct='%.2f%%', + shadow = True) +plt.axis('equal') +# 绘制饼图 +``` + +![uoLzXZ](https://oss.images.shujudaka.com/uPic/uoLzXZ.jpg) + +![W4VLLF](https://oss.images.shujudaka.com/uPic/W4VLLF.jpg) \ No newline at end of file diff --git a/pandas_util/__init__.py b/pandas_util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pandas_util/datas/second_hand_ house.csv b/pandas_util/datas/second_hand_ house.csv new file mode 100644 index 0000000..0f2c84e --- /dev/null +++ b/pandas_util/datas/second_hand_ house.csv @@ -0,0 +1,76 @@ +房屋编码,小区,朝向,房屋单价,参考首付,参考总价,经度,纬度 
+605093949,大望新平村,南北,5434,15,50,114.180964,22.603698 +605768856,通宝楼,南北,3472,7.5,25,114.179298,22.56691 +606815561,罗湖区罗芳村,南北,5842,15.6,52,114.158869,22.547223 +605147285,兴华苑,南北,3829,10.8,36,114.15804,22.554343 +606030866,京基东方都会,西南,47222,51,170,114.149243,22.55437 +605610283,水库新村,南北,5897,13.8,46,114.1454697,22.57018661 +601250774,水库新村,南北,8295,21.9,73,114.1454697,22.57018661 +605525982,水库新村,南北,6145,17.7,59,114.1454697,22.57018661 +606810540,新天地名居,南,51282,60,200,114.1407852,22.55086327 +599540811,翠岭苑,南北,11160,30,100,114.1373593,22.59192018 +606693036,松泉公寓,东,38557,60,200,114.1354523,22.58370781 +606348908,钻石时代,南,45833,49.5,165,114.135184,22.54832501 +605140018,东门E公馆,南,11891,31.5,105,114.134773,22.56072914 +606590991,美园,北,51923,40.5,135,114.1346817,22.54956245 +596462998,京基东方华都,南,62500,60,200,114.1343842,22.55811119 +594298847,京基东方华都,东,52631,60,200,114.1343842,22.55811119 +605560931,长丰苑,西北,38888,42,140,114.1342954,22.54779651 +596278133,长丰苑,南,43023,55.5,185,114.1342954,22.54779651 +605665139,金丽豪苑,东,95238,60,200,114.1338196,22.56984901 +604613670,金丽豪苑,南,80000,60,200,114.1338196,22.56984901 +606637625,愉天小区,北,66574,57.9,193,114.1337433,22.57199097 +606637625,愉天小区,北,66574,57.9,193,114.1337433,22.57199097 +606252043,雅园公寓,南北,8928,15,50,114.1337363,22.55559386 +602026329,雅园公寓,东南,5714,12,40,114.1337363,22.55559386 +602117545,东门168,东西,52173,36,120,114.1334229,22.55591774 +606660644,东门168,西南,61000,36.6,122,114.1334229,22.55591774 +599004917,阳光新干线家园,南,48725,58.47,194.9,114.1334152,22.54417419 +605769039,培峰苑,南北,3835,8.4,28,114.1322949,22.59471036 +605769102,培峰苑,南北,3958,11.4,38,114.1322949,22.59471036 +605769039,培峰苑,南北,3835,8.4,28,114.1322949,22.59471036 +605906134,金色都汇,东,48888,52.8,176,114.131928,22.546667 +604329044,缤纷时代家园,南,63879,49.5,165,114.1311035,22.55740738 +603204276,嘉湖新都,东南,89523,56.4,188,114.1310808,22.57252346 +606779885,嘉湖新都,南,64516,60,200,114.1310808,22.57252346 +605628024,嘉湖新都,南,66000,59.4,198,114.1310808,22.57252346 
+604870821,湖润大厦,南北,5058,12.9,43,114.1304169,22.55123329 +605702262,湖润大厦,南北,4545,12,40,114.1304169,22.55123329 +590392825,东门天下,东南,55714,58.5,195,114.1286697,22.55532455 +603513631,田贝花园,东南,9911,27,90,114.1281815,22.57090759 +606616471,田贝花园,南北,9693,28.5,95,114.1281815,22.57090759 +606616471,田贝花园,南北,9693,28.5,95,114.1281815,22.57090759 +598334198,田贝花园,南北,8363,13.8,46,114.1281815,22.57090759 +599340816,田贝花园,南北,9552,26.7,89,114.1281815,22.57090759 +599044788,银座金钻,南,41666,52.5,175,114.128142,22.547827 +604872669,置地逸轩,北,50000,58.5,195,114.1273651,22.54327393 +606625129,置地逸轩,南,50000,58.5,195,114.1273651,22.54327393 +601093683,罗湖村,西北,5113,13.5,45,114.125588,22.541119 +604870556,罗湖村,西北,4772,12.6,42,114.125588,22.541119 +606482810,罗湖村,南北,4545,13.5,45,114.125588,22.541119 +606355577,罗湖村,南北,5842,15.6,52,114.125588,22.541119 +601897164,海丰苑,西南,38000,57,190,114.1245873,22.54687355 +605532790,罗湖1号大楼,南北,4777,12.9,43,114.123972,22.546023 +605279416,罗湖1号大楼,南北,6588,16.8,56,114.123972,22.546023 +606729036,友谊大厦,南,36923,57.6,192,114.1237106,22.54456711 +597559191,金田大厦,南北,5582,15.24,50.8,114.1215574,22.54521646 +606578375,虹桥星座,东,47878,47.4,158,114.1205521,22.5762043 +600682443,虹桥星座,东,47878,47.4,158,114.1205521,22.5762043 +601845711,田心村,南北,5370,17.4,58,114.119935,22.573407 +598296969,时尚新居,南,41818,41.4,138,114.1188431,22.5744648 +603983611,祥福雅居,南,48387,45,150,114.1188049,22.57196808 +606543980,西湖大厦,东,30411,51,170,114.1164398,22.56119537 +606535099,风格名苑,东,55806,51.9,173,114.1159592,22.55708122 +595068607,幸福华府,南北,26388,57,190,114.1143646,22.5544281 +606799436,武警七支队大院,南北,3960,12,40,114.111282,22.557374 +594102300,新闻大厦,南,6052,13.8,46,114.1093938,22.54749824 +606719083,星湖花园三期,西南,41578,47.4,158,114.1058044,22.57372665 +603105329,武警家属大院,东南,4444,12,40,114.1005318,22.57568425 +605322083,恒通花园,南北,4245,13.5,45,114.097673,22.570293 +605244548,恒通花园,南,4128,9.66,32.2,114.097673,22.570293 +601116785,恒通花园,南北,3773,12,40,114.097673,22.570293 +598258845,三九花园,南,5833,12.6,42,114.0895386,22.57707977 
+594221866,三九花园,南,5681,15,50,114.0895386,22.57707977 +606700179,城市春天,南北,3571,7.5,25,114.083405,22.5395049 +603950517,皇御苑,东北,59701,54,180,114.0817954,22.53139307 +605232094,晨晖家园,南,54285,57,190,114.0676249,22.52550815 diff --git a/pandas_util/feature_analytis.py b/pandas_util/feature_analytis.py new file mode 100644 index 0000000..aee7ee4 --- /dev/null +++ b/pandas_util/feature_analytis.py @@ -0,0 +1,439 @@ +# 特征分析 +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import warnings +import seaborn as sns +from scipy import stats + +warnings.filterwarnings('ignore') +from chinese_calendar import is_workday, is_holiday + + +def feature_distribution(datas, columnName): + """ + 显示DataFrame 中一列的分布情况 + :param datas DataFrame + :param columnName 列名 + """ + + columnOfDataFrame = datas[columnName] + print("########################## Column " + columnName + '##########################') + print('\n\n描述性统计信息,你也可以自己使用下面的方法来做探索') + print(""" +编号 函数 描述 +1 count() 非空观测数量 +2 sum() 所有值之和 +3 mean() 所有值的平均值 +4 median() 所有值的中位数 +5 mode() 值的模值 +6 std() 值的标准偏差 +7 min() 所有值中的最小值 +8 max() 所有值中的最大值 +9 abs() 绝对值 +10 prod() 数组元素的乘积 +11 cumsum() 累计总和 +12 cumprod() 累计乘积 + """) + print("---" + columnName + " 描述性信息统计---") + print(columnOfDataFrame.describe(include='all')) + + print('\n\n' + columnName + " 列中的唯一值和数量如下:\n") + print(datas[columnName].value_counts()) + + # 散点分布 + plt.title('Scatter') + plt.scatter(np.arange(len(columnOfDataFrame)), columnOfDataFrame, alpha=0.4, cmap='Reds') + plt.grid() + plt.show() + # 散点分布 + plt.title("Hist") + sns.distplot(columnOfDataFrame) + plt.show() + + """ + kstest方法:KS检验,参数分别是:待检验的数据,检验方法(这里设置成norm正态分布),均值与标准差 + 结果返回两个值:statistic → D值,pvalue → P值 + p值大于0.05,为正态分布 + H0:样本符合 + H1:样本不符合 + 如何p>0.05接受H0 ,反之 + """ + + print(""" +kstest方法:KS检验, + +参数分别是:待检验的数据,检验方法(这里设置成norm正态分布),均值与标准差 + +结果返回两个值:statistic → D值,pvalue → P值 + +p 值大于0.05,为正态分布 H0:样本符合 H1:样本不符合 如何 p>0.05 接受H0 ,反之 + """) + try: + u = columnOfDataFrame.mean() + std = 
columnOfDataFrame.std() + result = stats.kstest(columnOfDataFrame, 'norm', (u, std)) + print(result) + + print("变量极差", end='\t') + print("Max(%f)-Min(%f) = %f" % ( + columnOfDataFrame.max(), columnOfDataFrame.min(), columnOfDataFrame.max() - columnOfDataFrame.min())) + except Exception as e: + print(e) + pass + print("---频率分布情况---") + plt.title("Frequency Distribution Bin10") + columnOfDataFrame.hist(bins=10) + plt.show() + plt.title("Frequency Distribution Bin50") + columnOfDataFrame.hist(bins=50) + plt.show() + + print("---分组区间---") + gcut = pd.cut(columnOfDataFrame, 10, right=False) + gcut_count = gcut.value_counts(sort=False) + # 在这里不排序 + # columnOfDataFrame['分组区间'] = gcut.values + # 给原表多加一列,写每列数据在的区间 + print(gcut.head(), '\n------') + print(gcut_count) + print(columnOfDataFrame.head()) + + r_zj = pd.DataFrame(gcut_count) + r_zj.rename(columns={gcut_count.name: '频数'}, inplace=True) # 修改频数字段名 + r_zj['频率'] = r_zj / r_zj['频数'].sum() # 计算频率 + r_zj['累计频率'] = r_zj['频率'].cumsum() # 计算累计频率 + r_zj['频率%'] = r_zj['频率'].apply(lambda x: "%.2f%%" % (x * 100)) # 以百分比显示频率 + r_zj['累计频率%'] = r_zj['累计频率'].apply(lambda x: "%.2f%%" % (x * 100)) # 以百分比显示累计频率 + r_zj.style.bar(subset=['频率', '累计频率'], color='green', width=100) + # pd.set_option("max_columns", None) # Showing only two columns + # pd.set_option("max_rows", None) + print("---输出频*表---") + print(r_zj) + + r_zj['频率'].plot(kind='bar', width=0.8, figsize=(12, 2), rot=0, color='k', grid=True, alpha=0.5) + plt.title('Distribution Hist') + x = len(r_zj) + y = r_zj['频率'] + m = r_zj['频数'] + for i, j, k in zip(range(x), y, m): + plt.text(i - 0.1, j + 0.01, '%i' % k, color='k') + # 添加频数标签 + plt.show() + + plt.pie(r_zj['频数'], + labels=r_zj.index, + autopct='%.2f%%', + shadow=True) + plt.axis('equal') + plt.show() + + print("---箱线图---") + print(""" +简单直观的异常值检测方法:箱形图(箱线图) +箱形图中,从上到下依次有 6 个数据节点,分别是上界、上四分位、均值、中位数、下四分位、下界。而那些超过上界的值就会被标记为离群点,也就是异常数据。 + """) + not_null = pd.to_numeric(columnOfDataFrame, errors='coerce') + print(not_null) + 
plt.boxplot(not_null) + plt.show() + + print('\n\n') + + +def plot_scatter(datas, colX, colY, colHue): + """ + 显示散点图 + :param datas dataframe + :param colX X 轴列,列名 + :param colY Y 轴列,列名 + :param colHue 数据显示列,字符串类型 + """ + import seaborn as sns + import matplotlib.pyplot as plt + sns.lmplot(x=colX, y=colY, + data=datas, + hue=colHue, + fit_reg=False) + plt.xlabel(colX) + plt.ylabel(colY) + plt.title(colHue + 'Scatter Plot for ' + colX + " & " + colY) + plt.show() + + +def count_unique(datas, cols): + for col in cols: + print('\n\n' + col + " 列中的唯一值和数量如下:\n") + print(datas[col].value_counts()) + + +# plot_bars(auto_prices, ['fuel_type']) +# plot_cols = ['make', 'body_style', 'num_of_cylinders'] +# plot_bars(auto_prices, plot_cols) +def plot_bars(datas, cols): + for col in cols: + fig = plt.figure(figsize=(6, 6)) # 定义绘图区域 + ax = fig.gca() # 定义轴axis + counts = datas[col].value_counts() # 找到每个唯一类别的计数 + counts.plot.bar(ax=ax, color='blue') # 在计数数据框上使用 plot.bar 方法 + ax.set_title('Number of by' + col) # 给一个主标题 + ax.set_xlabel(col) # 设置 x 轴的文本 + ax.set_ylabel('Numbers') # 为 y 轴设置文本 + plt.show() + + +# num_cols = ['curb_weight', 'engine_size', 'city_mpg', 'price'] +# plot_histogram(auto_prices, num_cols) +def plot_histogram(datas, cols, bins=10): + for col in cols: + fig = plt.figure(figsize=(6, 6)) # define plot area + ax = fig.gca() # define axis + datas[col].plot.hist(ax=ax, bins=bins) # Use the plot.hist method on subset of the data frame + ax.set_title('Histogram of ' + col) # Give the plot a main title + ax.set_xlabel(col) # Set text for the x axis + ax.set_ylabel('Numbers') # Set text for y axis + plt.show() + + +# plot_density_hist(auto_prices, num_cols, bins = 20, hist = True) +def plot_density_hist(datas, cols, bins=10, hist=False): + for col in cols: + sns.set_style("whitegrid") + sns.distplot(datas[col], bins=bins, rug=True, hist=hist) + plt.title('Histogram of ' + col) # Give the plot a main title + plt.xlabel(col) # Set text for the x axis + 
plt.ylabel('Numbers') # Set text for y axis + plt.show() + + +# plot_scatter(auto_prices, ['horsepower'], 'engine_size') +# num_cols = ['curb_weight', 'engine_size', 'horsepower', 'city_mpg'] +# plot_scatter(auto_prices, num_cols) +def plot_scatter(datas, cols, col_y='price'): + for col in cols: + fig = plt.figure(figsize=(7, 6)) # define plot area + ax = fig.gca() # define axis + datas.plot.scatter(x=col, y=col_y, ax=ax) + ax.set_title('Scatter plot of ' + col_y + ' vs. ' + col) # Give the plot a main title + ax.set_xlabel(col) # Set text for the x axis + ax.set_ylabel(col_y) # Set text for y axis + plt.show() + + +# plot_desity_2d(auto_prices, num_cols) +# plot_desity_2d(auto_prices, num_cols, kind = 'hex') +def plot_desity_2d(datas, cols, col_y='price', kind='kde'): + for col in cols: + sns.set_style("whitegrid") + sns.jointplot(col, col_y, data=datas, kind=kind) + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.show() + + +# cat_cols = ['fuel_type', 'aspiration', 'num_of_doors', 'body_style', +# 'drive_wheels', 'engine_location', 'engine_type', 'num_of_cylinders'] +# plot_box(auto_prices, cat_cols) +def plot_box(datas, cols, col_y='price'): + for col in cols: + sns.set_style("whitegrid") + sns.boxplot(col, col_y, data=datas) + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.show() + + +# plot_violin(auto_prices, cat_cols) +def plot_violin(datas, cols, col_y='price'): + for col in cols: + sns.set_style("whitegrid") + sns.violinplot(col, col_y, data=datas) + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.show() + + +# num_cols = ['curb_weight', 'engine_size', 'horsepower', 'city_mpg'] +# plot_scatter_shape(auto_prices, num_cols) +def plot_scatter_shape(datas, cols, shape_col='fuel_type', col_y='price', alpha=0.2): + shapes = ['+', 'o', 's', 'x', '^'] # pick distinctive shapes + unique_cats = datas[shape_col].unique() + for col in cols: # 
loop over the columns to plot + sns.set_style("whitegrid") + for i, cat in enumerate(unique_cats): # loop over the unique categories + temp = datas[datas[shape_col] == cat] + sns.regplot(col, col_y, data=temp, marker=shapes[i], label=cat, + scatter_kws={"alpha": alpha}, fit_reg=False, color='blue') + plt.title('Scatter plot of ' + col_y + ' vs. ' + col) # Give the plot a main title + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.legend() + plt.show() + + +# num_cols = ['engine_size', 'horsepower', 'city_mpg'] +# plot_scatter_size(auto_prices, num_cols) +def plot_scatter_size(datas, cols, shape_col='fuel_type', size_col='curb_weight', + size_mul=0.000025, col_y='price', alpha=0.2): + shapes = ['+', 'o', 's', 'x', '^'] # pick distinctive shapes + unique_cats = datas[shape_col].unique() + for col in cols: # loop over the columns to plot + sns.set_style("whitegrid") + for i, cat in enumerate(unique_cats): # loop over the unique categories + temp = datas[datas[shape_col] == cat] + sns.regplot(col, col_y, data=temp, marker=shapes[i], label=cat, + scatter_kws={"alpha": alpha, "s": size_mul * temp[size_col] ** 2}, + fit_reg=False, color='blue') + plt.title('Scatter plot of ' + col_y + ' vs. 
' + col) # Give the plot a main title + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.legend() + plt.show() + + +# num_cols = ['engine_size', 'horsepower', 'city_mpg'] +# plot_scatter_shape_size_col(auto_prices, num_cols) +def plot_scatter_shape_size_col(datas, cols, shape_col='fuel_type', size_col='curb_weight', + size_mul=0.000025, color_col='aspiration', col_y='price', alpha=0.2): + shapes = ['+', 'o', 's', 'x', '^'] # pick distinctive shapes + colors = ['green', 'blue', 'orange', 'magenta', 'gray'] # specify distinctive colors + unique_cats = datas[shape_col].unique() + unique_colors = datas[color_col].unique() + for col in cols: # loop over the columns to plot + sns.set_style("whitegrid") + for i, cat in enumerate(unique_cats): # loop over the unique categories + for j, color in enumerate(unique_colors): + temp = datas[(datas[shape_col] == cat) & (datas[color_col] == color)] + sns.regplot(col, col_y, data=temp, marker=shapes[i], + scatter_kws={"alpha": alpha, "s": size_mul * temp[size_col] ** 2}, + label=(cat + ' and ' + color), fit_reg=False, color=colors[j]) + plt.title('Scatter plot of ' + col_y + ' vs. 
' + col) # Give the plot a main title + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.legend() + plt.show() + + +# plot_violin_hue(auto_prices, cat_cols) +def plot_violin_hue(datas, cols, col_y='price', hue_col='aspiration'): + for col in cols: + sns.set_style("whitegrid") + sns.violinplot(col, col_y, data=datas, hue=hue_col, split=True) + plt.xlabel(col) # Set text for the x axis + plt.ylabel(col_y) # Set text for y axis + plt.show() + + +# num_cols = ["curb_weight", "engine_size", "horsepower", "city_mpg", "price", "fuel_type"] +def plot_scatter_pairplot(datas, num_cols): + sns.pairplot(datas[num_cols], + hue='fuel_type', + palette="Set2", + diag_kind="kde", + size=2).map_upper(sns.kdeplot, cmap="Blues_d") + + +## Define columns for making a conditioned histogram +# plot_cols2 = ["length", +# "curb_weight", +# "engine_size", +# "city_mpg", +# "price"] +# +# cond_hists(auto_prices, plot_cols2, 'drive_wheels') +## Function to plot conditioned histograms +def cond_hists(df, plot_cols, grid_col): + import matplotlib.pyplot as plt + import seaborn as sns + ## Loop over the list of columns + for col in plot_cols: + grid1 = sns.FacetGrid(df, col=grid_col) + grid1.map(plt.hist, col, alpha=.7) + return grid_col + + +def sigma3(x): + ''' + MBA智库对3σ原则的描述: + + σ代表标准差,μ代表均值 + + 样本数据服从正态分布的情况下 + + 数值分布在(μ-σ,μ+σ)中的概率为0.6826 + + 数值分布在(μ-2σ,μ+2σ)中的概率为0.9544 + + 数值分布在(μ-3σ,μ+3σ)中的概率为0.9974 + + 可以认为,Y 的取值几乎全部集中在(μ-3σ,μ+3σ)区间内,超出这个范围的可能性仅占不到0.3%。 + + https://www.guofei.site/2017/10/19/cleandata.html + ''' + x = pd.Series(x) + mean_ = x.mean() + std_ = x.std() + rules = (mean_ - 3 * std_> x) | (mean_ + 3 * std_ < x) + indx = x[rules].index + # 获取异常值 + # out = x[indx] + return indx + + +def eda_profile(data): + """ + ydata_profile + """ + from ydata_profiling import ProfileReport + profile = ProfileReport(data, title="Profiling Report") + profile.to_file("data_analysis.html") + + +def eda_pgw(data): + import pygwalker as pyg + gwalker = 
pyg.walk(data) + + +## 日期相关操作 +def month_stage(x): + if x in range(1, 11): + return 0 # 上旬 + elif x in range(11, 21): + return 1 # 中旬 + else: + return 2 # 下旬 + + +# time +def time_feature(data, col): + """ + 对日期类型的列进行处理,分割出来更多的字段、特征、列 + """ + data['order_date'] = pd.to_datetime(data[col]) + data['dayofmonth'] = data[col].dt.day + data['dayofweek'] = data[col].dt.dayofweek + data['month'] = data[col].dt.month + data['year'] = data[col].dt.year + data['is_month_start'] = (data[col].dt.is_month_start).astype(int) + data['is_month_end'] = (data[col].dt.is_month_end).astype(int) + data['is_workday'] = (data[col].apply(lambda x: is_workday(x))).astype(int) + data['is_holiday'] = (data[col].apply(lambda x: is_holiday(x))).astype(int) + data['in_quarter'] = data[col].dt.quarter + data['in_month_stage'] = data['dayofmonth'].apply(month_stage) + return data + + +## + +def segments_bins_labels(datas, col, bins, labels): + """ + 按段为原来的 DataFrame 增加新的字段 + :param datas DataFrame + :param col 要处理的列 + :param bins 分割规则 + :param lables 分割后给的描述信息 + """ + + segments = pd.cut(datas[col], bins, labels) + datas['segements_' + col] = segments + return datas diff --git a/pandas_util/learning_curve.py b/pandas_util/learning_curve.py new file mode 100644 index 0000000..df3ee48 --- /dev/null +++ b/pandas_util/learning_curve.py @@ -0,0 +1,60 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.model_selection import learning_curve +import matplotlib.pyplot as plt +import matplotlib as mpl +zhfont = mpl.font_manager.FontProperties(fname='/usr/share/fonts/truetype/liberation/simhei.ttf') +# 用sklearn的learning_curve得到training_score和cv_score,使用matplotlib画出learning curve + +def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, + train_sizes=np.linspace(.05, 1., 20), verbose=0, plot=True,zhfont = zhfont): + """ + 画出data在某模型上的learning curve. 
+ 参数解释 + ---------- + estimator : 你用的分类器。 + title : 表格的标题。 + X : 输入的feature,numpy类型 + y : 输入的target vector + ylim : tuple格式的(ymin, ymax), 设定图像中纵坐标的最低点和最高点 + cv : 做cross-validation的时候,数据分成的份数,其中一份作为cv集,其余n-1份作为training(默认为3份) + n_jobs : 并行的的任务数(默认1) + """ + train_sizes, train_scores, test_scores = learning_curve( + estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, verbose=verbose) + + train_scores_mean = np.mean(train_scores, axis=1) + train_scores_std = np.std(train_scores, axis=1) + test_scores_mean = np.mean(test_scores, axis=1) + test_scores_std = np.std(test_scores, axis=1) + + if plot: + plt.figure() + plt.title(title, fontproperties=zhfont) + if ylim is not None: + plt.ylim(*ylim) + plt.xlabel(u"训练样本数", fontproperties=zhfont) + plt.ylabel(u"得分", fontproperties=zhfont) + plt.gca().invert_yaxis() + plt.grid() + + plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, + alpha=0.1, color="b") + plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, + alpha=0.1, color="r") + plt.plot(train_sizes, train_scores_mean, 'o-', color="b", label=u"训练集上得分") + plt.plot(train_sizes, test_scores_mean, 'o-', color="r", label=u"交叉验证集上得分") + + plt.legend(loc="best",prop=zhfont) + + # plt.draw() + # plt.show() + plt.gca().invert_yaxis() + + midpoint = ((train_scores_mean[-1] + train_scores_std[-1]) + (test_scores_mean[-1] - test_scores_std[-1])) / 2 + diff = (train_scores_mean[-1] + train_scores_std[-1]) - (test_scores_mean[-1] - test_scores_std[-1]) + + if diff < 0.05: + print("目前 两条曲线的 Diff = ",diff," 看起来还不错!") + + return midpoint, diff \ No newline at end of file diff --git a/snowflake-gui/__init__.py b/snowflake-gui/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/snowflake-gui/exceptions.py b/snowflake-gui/exceptions.py new file mode 100644 index 0000000..5091380 --- /dev/null +++ b/snowflake-gui/exceptions.py @@ -0,0 +1,6 @@ +class 
InvalidSystemClock(Exception): + """ + 时钟回拨异常 + """ + pass + diff --git a/snowflake-gui/snowflake.py b/snowflake-gui/snowflake.py new file mode 100644 index 0000000..6e0bdfa --- /dev/null +++ b/snowflake-gui/snowflake.py @@ -0,0 +1,109 @@ +# https://www.cnblogs.com/oklizz/p/11865750.html + +# Twitter's Snowflake algorithm implementation which is used to generate distributed IDs. +# https://github.com/twitter-archive/snowflake/blob/snowflake-2010/src/main/scala/com/twitter/service/snowflake/IdWorker.scala + +import time +import logging + +from exceptions import InvalidSystemClock + + +# 64位ID的划分 +WORKER_ID_BITS = 5 +DATACENTER_ID_BITS = 5 +SEQUENCE_BITS = 12 + +# 最大取值计算 +MAX_WORKER_ID = -1 ^ (-1 << WORKER_ID_BITS) # 2**5-1 0b11111 +MAX_DATACENTER_ID = -1 ^ (-1 << DATACENTER_ID_BITS) + +# 移位偏移计算 +WOKER_ID_SHIFT = SEQUENCE_BITS +DATACENTER_ID_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS +TIMESTAMP_LEFT_SHIFT = SEQUENCE_BITS + WORKER_ID_BITS + DATACENTER_ID_BITS + +# 序号循环掩码 +SEQUENCE_MASK = -1 ^ (-1 << SEQUENCE_BITS) + +# Twitter元年时间戳 +TWEPOCH = 1288834974657 + + +logger = logging.getLogger('flask.app') + + +class IdWorker(object): + """ + 用于生成IDs + """ + + def __init__(self, datacenter_id, worker_id, sequence=0): + """ + 初始化 + :param datacenter_id: 数据中心(机器区域)ID + :param worker_id: 机器ID + :param sequence: 其实序号 + """ + # sanity check + if worker_id> MAX_WORKER_ID or worker_id < 0: + raise ValueError('worker_id值越界') + + if datacenter_id> MAX_DATACENTER_ID or datacenter_id < 0: + raise ValueError('datacenter_id值越界') + + self.worker_id = worker_id + self.datacenter_id = datacenter_id + self.sequence = sequence + + self.last_timestamp = -1 # 上次计算的时间戳 + + def _gen_timestamp(self): + """ + 生成整数时间戳 + :return:int timestamp + """ + return int(time.time() * 1000) + + def get_id(self): + """ + 获取新ID + :return: + """ + timestamp = self._gen_timestamp() + + # 时钟回拨 + if timestamp < self.last_timestamp: + logging.error('clock is moving backwards. 
Rejecting requests until {}'.format(self.last_timestamp)) + raise InvalidSystemClock + + if timestamp == self.last_timestamp: + self.sequence = (self.sequence + 1) & SEQUENCE_MASK + if self.sequence == 0: + timestamp = self._til_next_millis(self.last_timestamp) + else: + self.sequence = 0 + + self.last_timestamp = timestamp + + new_id = ((timestamp - TWEPOCH) << TIMESTAMP_LEFT_SHIFT) | (self.datacenter_id << DATACENTER_ID_SHIFT) | \ + (self.worker_id << WOKER_ID_SHIFT) | self.sequence + return new_id + + def _til_next_millis(self, last_timestamp): + """ + 等到下一毫秒 + """ + timestamp = self._gen_timestamp() + while timestamp <= last_timestamp: + timestamp = self._gen_timestamp() + return timestamp + + +if __name__ == '__main__': + worker = IdWorker(1, 2, 0) + print(worker.get_id()) + + for i in range(10): + worker = IdWorker(1, 2, 0) + print(worker.get_id()) diff --git a/tests/pandas_test.py b/tests/pandas_test.py new file mode 100644 index 0000000..cc374d9 --- /dev/null +++ b/tests/pandas_test.py @@ -0,0 +1,12 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +from pandas_util.feature_analytis import * + +datas = pd.read_csv('../pandas_util/datas/second_hand_ house.csv') +# print(datas) + +housePrice = datas['房屋单价'] + +feature_distribution(housePrice) \ No newline at end of file

AltStyle によって変換されたページ (->オリジナル) /