Files
yola/pailuan/master/niaochanggui_test.ipynb
coco 85d885e008 a
2026-07-03 16:29:47 +08:00

644 lines
22 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 这是一个测试尿常规的算法文件"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd \n",
"import seaborn as sns\n",
"from IPython.display import display\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import sklearn\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"load data successful !!!!!\n"
]
}
],
"source": [
"try :\n",
" data_cre = pd.read_csv(\"14/data_cre.txt\")\n",
" \n",
" print (\"load data successful !!!!!\")\n",
"except :\n",
" print (\"load data error !!!!!!!!!!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test only\n",
"data_cre['h'] = data_cre['h'].map(lambda x: x*2)\n",
"data_cre.to_csv('data_cre_2h.txt')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>h</th>\n",
" <th>s</th>\n",
" <th>v</th>\n",
" <th>l</th>\n",
" <th>a</th>\n",
" <th>b</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>13358.000000</td>\n",
" <td>13358.000000</td>\n",
" <td>13358.000000</td>\n",
" <td>13358.000000</td>\n",
" <td>13358.000000</td>\n",
" <td>13358.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>13.675999</td>\n",
" <td>90.980985</td>\n",
" <td>143.211633</td>\n",
" <td>131.036907</td>\n",
" <td>133.195763</td>\n",
" <td>145.954185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>7.639799</td>\n",
" <td>13.948394</td>\n",
" <td>29.943108</td>\n",
" <td>33.892032</td>\n",
" <td>6.900436</td>\n",
" <td>9.115498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2.000000</td>\n",
" <td>46.000000</td>\n",
" <td>93.000000</td>\n",
" <td>74.000000</td>\n",
" <td>116.000000</td>\n",
" <td>132.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>6.000000</td>\n",
" <td>83.000000</td>\n",
" <td>118.000000</td>\n",
" <td>104.000000</td>\n",
" <td>128.000000</td>\n",
" <td>138.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>12.000000</td>\n",
" <td>92.000000</td>\n",
" <td>140.000000</td>\n",
" <td>124.000000</td>\n",
" <td>135.000000</td>\n",
" <td>144.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>20.000000</td>\n",
" <td>101.000000</td>\n",
" <td>167.000000</td>\n",
" <td>161.000000</td>\n",
" <td>139.000000</td>\n",
" <td>153.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>30.000000</td>\n",
" <td>128.000000</td>\n",
" <td>227.000000</td>\n",
" <td>215.000000</td>\n",
" <td>145.000000</td>\n",
" <td>168.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" h s v l a \\\n",
"count 13358.000000 13358.000000 13358.000000 13358.000000 13358.000000 \n",
"mean 13.675999 90.980985 143.211633 131.036907 133.195763 \n",
"std 7.639799 13.948394 29.943108 33.892032 6.900436 \n",
"min 2.000000 46.000000 93.000000 74.000000 116.000000 \n",
"25% 6.000000 83.000000 118.000000 104.000000 128.000000 \n",
"50% 12.000000 92.000000 140.000000 124.000000 135.000000 \n",
"75% 20.000000 101.000000 167.000000 161.000000 139.000000 \n",
"max 30.000000 128.000000 227.000000 215.000000 145.000000 \n",
"\n",
" b \n",
"count 13358.000000 \n",
"mean 145.954185 \n",
"std 9.115498 \n",
"min 132.000000 \n",
"25% 138.000000 \n",
"50% 144.000000 \n",
"75% 153.000000 \n",
"max 168.000000 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_cre.columns\n",
"train_labels = data_cre[\"index\"]\n",
"train_features = data_cre.drop(\"index\",axis=1)\n",
"\n",
"train_features.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 测试算法"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#test all those Ensemble Methods\n",
"\n",
"from sklearn.ensemble import AdaBoostClassifier\n",
"from sklearn.ensemble import AdaBoostRegressor\n",
"\n",
"from sklearn.ensemble import BaggingClassifier\n",
"from sklearn.ensemble import BaggingRegressor\n",
"\n",
"from sklearn.ensemble import ExtraTreesClassifier\n",
"from sklearn.ensemble import ExtraTreesRegressor\n",
"\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
"from sklearn.ensemble import IsolationForest\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.ensemble import RandomTreesEmbedding\n",
"\n",
"from sklearn.ensemble import StackingClassifier\n",
"from sklearn.ensemble import StackingRegressor\n",
"\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.ensemble import VotingRegressor\n",
"\n",
"#from sklearn.ensemble import HistGradientBoostingRegressor\n",
"#from sklearn.ensemble import HistGradientBoostingClassifier\n",
"\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.metrics import precision_score\n",
"from sklearn.metrics import recall_score\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"X_train ,X_test,y_train,y_test = train_test_split(train_features,train_labels,test_size = 0.3, random_state = 20)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy_score : 0.9937624750499002\n",
"f1_score : [0.99785561 0.99711261 0.99713877 0.98428571 0.99352518]\n",
"precision_score: [1. 0.99424184 0.99856734 0.98428571 0.99281093]\n",
"recall_score : [0.9957204 1. 0.99571429 0.98428571 0.99424046]\n",
"runing time: 0:00:00.823796\n"
]
}
],
"source": [
"#case 1 AdaBoostClassifier \n",
"#https://www.cnblogs.com/pinard/p/6136914.html\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from datetime import datetime\n",
"bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=15, min_samples_split=20, min_samples_leaf=10),\n",
" n_estimators=50, learning_rate=0.8)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"bdt.fit(X_train, y_train)\n",
"pred = bdt.predict(X_test)\n",
"print (\"accuracy_score :\" , accuracy_score(y_test,pred))\n",
"print (\"f1_score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from sklearn_porter import Porter\n",
"\n",
"porter_ABC_DC = Porter(bdt, language='c').export()\n",
"#porter_clf_svm_poly = Porter(clf_svm_poly, language='c').export()\n",
"# porter_clf_forest = Porter(clf_randomForest, language='c').export()\n",
"#porter_clf_extra_forest = Porter(clf_extra_forest, language='c').export()\n",
"\n",
"#print(porter_clf_svm_linear)\n",
"f = open(\"clf/porter_ABC_DC.cpp\",'wb')\n",
"#f = open(\"clf/clf_svm_linear_50features_20171207.txt\",'wb')\n",
"#f = open(\"clf_svm_linear_125100_low_feature_data.txt\",'wb')\n",
"f.write(porter_ABC_DC.encode())\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"d:\\Anaconda3\\lib\\site-packages\\sklearn\\externals\\joblib\\__init__.py:15: FutureWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.\n",
" warnings.warn(msg, category=FutureWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy_score : 0.9932634730538922\n",
"f1_score : [0.99785561 0.99711261 0.9964209 0.98285714 0.99316793]\n",
"precision_score: [1. 0.99424184 0.99856528 0.98285714 0.9920977 ]\n",
"recall_score : [0.9957204 1. 0.99428571 0.98285714 0.99424046]\n",
"runing time: 0:00:00.795870\n"
]
}
],
"source": [
"from sklearn.externals import joblib\n",
"joblib.dump(bdt, \"clf/porter_ABC_DC.pkl\")\n",
"bdt2=joblib.load('clf/porter_ABC_DC.pkl')\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"bdt2.fit(X_train, y_train)\n",
"pred = bdt2.predict(X_test)\n",
"print (\"accuracy_score :\" , accuracy_score(y_test,pred))\n",
"print (\"f1_score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9969600696026302\n",
"runing time: 0:00:00.637294\n"
]
}
],
"source": [
"#case 2 AdaBoostRegressor\n",
"#https://www.programcreek.com/python/example/86712/sklearn.ensemble.AdaBoostRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from datetime import datetime\n",
"bdt = AdaBoostRegressor(DecisionTreeRegressor(max_depth=15, min_samples_split=20, min_samples_leaf=10),\n",
" n_estimators=50, learning_rate=0.8)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"bdt.fit(X_train, y_train)\n",
"pred = bdt.predict(X_test)\n",
"test_accuracy = bdt.score(X_test, y_test)\n",
"print(test_accuracy)\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"accuracy_score : 0.9885229540918163\n",
"f1_score : [0.9964209 0.99423077 0.99286733 0.9713877 0.98884491]\n",
"precision_score: [1. 0.99042146 0.99145299 0.97277937 0.98848921]\n",
"recall_score : [0.99286733 0.9980695 0.99428571 0.97 0.98920086]\n",
"runing time: 0:00:00.061815\n"
]
}
],
"source": [
"#case 3 BaggingClassifier \n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from datetime import datetime\n",
"bc = BaggingClassifier(DecisionTreeClassifier(max_depth=15, min_samples_split=20, min_samples_leaf=10),\n",
" max_samples=0.5,max_features=0.5)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"bc.fit(X_train, y_train)\n",
"pred = bc.predict(X_test)\n",
"print (\"accuracy_score :\" , accuracy_score(y_test,pred))\n",
"print (\"f1_score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9969600696026302\n",
"runing time: 0:00:00.061832\n"
]
},
{
"data": {
"text/plain": [
"'\\nBagging通过降低基分类器的方差,改善了泛化误差\\n其性能依赖于基分类器的稳定性;如果基分类器不稳定,bagging有助于降低训练数据的随机波动导致的误差;如果稳定,则集成分类器的误差主要由基分类器的偏倚引起\\n由于每个样本被选中的概率相同,因此bagging并不侧重于训练数据集中的任何特定实例\\n'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#case 4 BaggingRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from datetime import datetime\n",
"br = BaggingRegressor(DecisionTreeRegressor(max_depth=15, min_samples_split=20, min_samples_leaf=10),\n",
" max_samples=0.5,max_features=0.5)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"br.fit(X_train, y_train)\n",
"pred = br.predict(X_test)\n",
"print(bdt.score(X_test, y_test))\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))\n",
"'''\n",
"Bagging通过降低基分类器的方差,改善了泛化误差\n",
"其性能依赖于基分类器的稳定性;如果基分类器不稳定,bagging有助于降低训练数据的随机波动导致的误差;如果稳定,则集成分类器的误差主要由基分类器的偏倚引起\n",
"由于每个样本被选中的概率相同,因此bagging并不侧重于训练数据集中的任何特定实例\n",
"'''"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#case 5 ExtraTreesClassifier \n",
"from datetime import datetime\n",
"ec = ExtraTreesClassifier(n_estimators=50,max_depth=20,min_samples_leaf=50)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"ec.fit(X_train, y_train)\n",
"pred = ec.predict(X_test)\n",
"print (\"accuracy_score :\" , accuracy_score(y_test,pred))\n",
"print (\"f1_score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#case 6 ExtraTreesRegressor \n",
"from datetime import datetime\n",
"er = ExtraTreesRegressor(n_estimators=50,max_depth=20,min_samples_leaf=50)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"er.fit(X_train, y_train)\n",
"pred = er.predict(X_test)\n",
"print (\"score :\" , er.score(X_test, y_test))\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#case 7 RandomForestClassifier \n",
"from datetime import datetime\n",
"rfc = RandomForestClassifier(n_estimators=50,max_depth=20,min_samples_leaf=50)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"rfc.fit(X_train, y_train)\n",
"pred = rfc.predict(X_test)\n",
"print (\"accuracy_score :\" , accuracy_score(y_test,pred))\n",
"print (\"f1_score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#case 8 RandomForestRegressor \n",
"from datetime import datetime\n",
"rfr = RandomForestRegressor(n_estimators=50,max_depth=20,min_samples_leaf=50)\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"rfr.fit(X_train, y_train)\n",
"print (\"score :\" , rfr.score(X_test, y_test))\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#case 9 C-Support Vector Classification.\n",
"from sklearn.svm import SVC\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"svc_linear = SVC(kernel = 'linear',C=0.1)\n",
"#svm linear accuracy score: 0.974885004599816\n",
"svc_linear.fit(X_train, y_train)\n",
"# pred = clf_svm_linear.predict(X_test)\n",
"# print \"svm linear accuracy score:\" , accuracy_score(y_test,pred)\n",
"# print \"f1 score:\" , f1_score(y_test,pred,average='micro')\n",
"pred = svc_linear.predict(X_test)\n",
"print (\"svm linear accuracy score:\" , accuracy_score(y_test,pred))\n",
"print (\"f1 score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"#print(\"preds:\",pred[:10])\n",
"#print('trues:\\n',y_test[:10])\n",
"#print(\"\\n\")\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#case 9 C-Support Vector Classification.\n",
"from sklearn.svm import SVR\n",
"\n",
"trarining_start_time = datetime.now()\n",
"\n",
"svr_linear = SVR(kernel = 'linear',C=0.1)\n",
"#svm linear accuracy score: 0.974885004599816\n",
"svr_linear.fit(X_train, y_train)\n",
"# pred = clf_svm_linear.predict(X_test)\n",
"# print \"svm linear accuracy score:\" , accuracy_score(y_test,pred)\n",
"# print \"f1 score:\" , f1_score(y_test,pred,average='micro')\n",
"pred = svr_linear.predict(X_test)\n",
"print (\"svm linear accuracy score:\" , accuracy_score(y_test,pred))\n",
"print (\"f1 score :\" , f1_score(y_test,pred,average=None))\n",
"print (\"precision_score:\" , precision_score(y_test,pred,average=None))\n",
"print (\"recall_score :\" , recall_score(y_test,pred,average=None))\n",
"print(\"preds:\",pred[:10])\n",
"print('trues:\\n',y_test[:10])\n",
"print(\"\\n\")\n",
"\n",
"training_stop_time = datetime.now()\n",
"print (\"runing time:\",(training_stop_time - trarining_start_time))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}