Machine Learning and Deep Learning: Classifying the Wisconsin Breast Cancer Dataset
Load the dataset and the useful packages.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
Load the dataset and convert it into a DataFrame and a Series: the DataFrame holds the features and the Series holds the target values. Use isnull() to check for NaN values. train_test_split builds the training and test sets; the test set uses the default size of 25%, random_state is set to 1, and the split is stratified on the target.
data = load_breast_cancer(as_frame=True)
print(data.data.isnull().sum())
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
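As a quick sanity check (not part of the original workflow), the shapes of the split can be printed to confirm the default 75/25 proportion:
print(X_train.shape, X_test.shape)  # roughly 75% of the 569 samples for training, 25% for testing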
Experiments showed that 400 estimators are enough to reach sufficient accuracy; increasing the number of estimators further actually lowers it.
clf_e = ExtraTreesClassifier(n_estimators=400, random_state=1)
clf_e.fit(X_train, y_train)
print("Accuracy on test data: {:.2f}".format(clf_e.score(X_test, y_test)))
scores = cross_val_score(clf_e, X, y)
print("ExtraTrees KFold Accuracy: {:.2f}".format(scores.mean()))
The random forest needs more estimator trees than the ExtraTreesClassifier does.
clf_r = RandomForestClassifier(n_estimators=600, random_state=1)
clf_r.fit(X_train, y_train)
print("Accuracy on test data: {:.2f}".format(clf_r.score(X_test, y_test)))
scores = cross_val_score(clf_r, X, y)
print("Random Forest KFold Accuracy: {:.2f}".format(scores.mean()))
ExtraTreesClassifier exposes an attribute called feature_importances_. Sort it in descending order and collect the features whose score is at least 0.01; this yields the top-K most important features.
indices_all = []
indices_e = []
# Sort the ExtraTrees feature importances in descending order.
extra_importance_sorted_idx = np.argsort(clf_e.feature_importances_)[::-1]
for idx in extra_importance_sorted_idx:
    print("%-30s %.8f" % (data.feature_names[idx], clf_e.feature_importances_[idx]))
    # Keep [feature name, column index] pairs for features scoring at least 0.01.
    if clf_e.feature_importances_[idx] >= 0.01:
        indices_e.append([data.feature_names[idx], idx])
    indices_all.append([data.feature_names[idx], idx])
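The sorted importances can also be visualized; this is a small sketch using the matplotlib import above, not code from the original post:
plt.figure(figsize=(8, 6))
plt.barh(data.feature_names[extra_importance_sorted_idx][::-1],
         clf_e.feature_importances_[extra_importance_sorted_idx][::-1])
plt.xlabel("ExtraTrees feature importance")
plt.tight_layout()
plt.show()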
The random forest has the same feature_importances_ attribute, and the procedure is identical: sort in descending order and collect the features whose score is at least 0.01.
indices_r = []
# Sort the random-forest feature importances in descending order.
forest_importance_sorted_idx = np.argsort(clf_r.feature_importances_)[::-1]
for idx in forest_importance_sorted_idx:
    print("%-30s %.8f" % (data.feature_names[idx], clf_r.feature_importances_[idx]))
    if clf_r.feature_importances_[idx] >= 0.01:
        indices_r.append([data.feature_names[idx], idx])
Dimensionality reduction: combine the two lists of important features, using filter to detect feature names that already appear in the first list. The reduced dataset has 21 features, compared with 30 in the original dataset.
indices_list = []
# Start from the ExtraTrees list, then append random-forest features not already present.
for idx in range(len(indices_e)):
    indices_list += [indices_e[idx][0]]
for idx in range(len(indices_r)):
    match = list(filter(lambda x: indices_r[idx][0] in x, indices_list))
    if len(match) == 0:
        indices_list += [indices_r[idx][0]]
print(f"Feature Name: {indices_list}")
print(f"Feature Total: {len(indices_list)}")
Convert the selected feature names back into column indices, then build the reduced training and test sets.
indices = []
# Walk through all features (in ExtraTrees importance order) and keep the column
# index of every feature whose name made it into the combined list.
for name, idx in indices_all:
    if name in indices_list:
        indices.append(idx)
print(f"Feature indices: {indices}")
print(f"Feature Total: {len(indices)}")
X_f = pd.DataFrame(X.iloc[:,indices], columns=data.feature_names[indices]).values
X_train_f, X_test_f, y_train, y_test = train_test_split(X_f, y, random_state=1, stratify=y)
XGBClassifier comes from the XGBoost package, a well-known library in the machine learning community. The eval_metric parameter is set to 'error', which means the binary classification error rate is used for evaluation. The resulting confusion matrix contains 3 false positives and 4 false negatives, using only the 21 selected features.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='error', seed=1)
xgb.fit(X_train_f, y_train)
y_pred = xgb.predict(X_test_f)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(xgb, X, y)
print("XGBClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
GridSearchCV is a brute-force way of tuning hyperparameters. After running for several hours, GridSearchCV automatically reports the suggested hyperparameter values. With some additional manual fine-tuning, the confusion matrix contains 2 false positives and 4 false negatives, still using only the 21 features.
param_test = {
#'max_depth':[i for i in range(5,13,2)],
#'min_child_weight':[i/10.0 for i in range(5,20,2)],
#'gamma':[i/10.0 for i in range(-2,7,2)],
#'subsample':[i/10.0 for i in range(4,11,2)],
#'colsample_bytree':[i/10.0 for i in range(4,11,2)],
#'reg_alpha':[0, 1e-2, 0.1, 1],
#'reg_lambda':[0, 1e-2, 0.1, 1]
}
from sklearn.model_selection import GridSearchCV
gsearch = GridSearchCV(
estimator = XGBClassifier(
base_score=0.5,
booster='gbtree',
colsample_bylevel=1,
colsample_bynode=1,
colsample_bytree=1,
eval_metric='error',
gamma=0.2,
gpu_id=-1,
importance_type='gain',
interaction_constraints='',
learning_rate=0.300000012,
max_delta_step=0,
max_depth=5,
min_child_weight=1.5,
missing=np.nan,
monotone_constraints='()',
n_estimators=100,
n_jobs=-1,
num_parallel_tree=1,
random_state=1,
reg_alpha=0,
reg_lambda=1,
scale_pos_weight=1,
subsample=0.8,
tree_method='exact',
use_label_encoder=False,
validate_parameters=1,
verbosity=None,
objective= 'binary:logistic',
seed=1),
param_grid = param_test,
scoring='accuracy',
n_jobs=-1,
cv=10)
gsearch.fit(X_train_f,y_train)
print("="*40)
print("gsearch.scorer_\t", gsearch.scorer_)
print("gsearch.best_params_\t", gsearch.best_params_)
print("gsearch.best_score_\t", gsearch.best_score_)
xgb_a = gsearch.best_estimator_
xgb_a.fit(X_train_f,y_train)
y_pred = xgb_a.predict(X_test_f)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(xgb_a, X, y)
print("XGBClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
Switch the algorithm to a support vector machine and tune its hyperparameters with grid search. Here, using all of the features and using the 21 selected features give the same result.
pipe_svc = make_pipeline(StandardScaler(),
SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range,
'svc__kernel': ['linear']},
{'svc__C': param_range,
'svc__gamma': param_range,
'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
param_grid=param_grid,
scoring='accuracy',
cv=10,
n_jobs=-1)
gs = gs.fit(X_train_f, y_train)
print(gs.best_score_)
print(gs.best_params_)
clf_s = gs.best_estimator_
clf_s.fit(X_train_f, y_train)
print('Test accuracy: %.3f' % clf_s.score(X_test_f, y_test))
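To verify the claim that all 30 features give the same result, the same grid search can be repeated on the unreduced split (a sketch, not part of the original post):
gs_all = GridSearchCV(estimator=pipe_svc, param_grid=param_grid,
                      scoring='accuracy', cv=10, n_jobs=-1)
gs_all = gs_all.fit(X_train, y_train)
print('Test accuracy (all features): %.3f' % gs_all.best_estimator_.score(X_test, y_test))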
There is 1 false positive and 3 false negatives, and the k-fold cross-validation accuracy is 98%.
y_pred = clf_s.predict(X_test_f)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(pipe_svc, X, y, cv=10)
print("Scores\n", scores)
print("SVMClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
Let's look at where the misclassifications occur.
plt.scatter(X_test[y_test == 0]["mean radius"], X_test[y_test == 0]["mean texture"],\
color="red", marker="^", alpha=0.5)
plt.scatter(X_test[y_test == 1]["mean radius"], X_test[y_test == 1]["mean texture"],\
color="green", marker="s", alpha=0.5)
plt.scatter(X_test[y_test != y_pred]["mean radius"], X_test[y_test != y_pred]["mean texture"],\
color="black", marker="x", s=1000, alpha=0.5, linewidth=2.0)
plt.show()
Deep Learning: Classifying the Wisconsin Breast Cancer Dataset
This program only works with TensorFlow 1.15.x and earlier, so first check the TensorFlow and Keras versions.
import tensorflow as tf
print(f"Tensorflow: {tf.__version__}")
import keras
print(f"Keras: {keras.__version__}")
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
np.random.seed(1)
from tensorflow import set_random_seed
set_random_seed(1)
Only the 21 selected features are used. StandardScaler standardizes the feature values, producing standardized training and test sets. To feed the data into LSTM layers, the arrays must be reshaped into 3-dimensional tensors.
sc = StandardScaler()
sc.fit(X_f)
X_train_std = sc.transform(X_train_f)
X_test_std = sc.transform(X_test_f)
X_train_std = np.reshape(X_train_std, (X_train_std.shape[0], X_train_std.shape[1], 1))
X_test_std = np.reshape(X_test_std, (X_test_std.shape[0], X_test_std.shape[1], 1))
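A quick check of the resulting shapes, which the LSTM layers interpret as (samples, timesteps, features):
print(X_train_std.shape, X_test_std.shape)  # samples x 21 timesteps x 1 feature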
Four LSTM layers are stacked here, also called a stacked LSTM. Some researchers have found that a deeper network works better than a single large layer of neurons: a deep network can reach high performance and high accuracy while needing fewer neurons overall.
ann = Sequential()
ann.add(LSTM(units=42, return_sequences=True, input_shape=(X_train_std.shape[1], 1)))
ann.add(Dropout(0.5))
ann.add(LSTM(units=42, return_sequences=True))
ann.add(Dropout(0.5))
ann.add(LSTM(units=42, return_sequences=True))
ann.add(Dropout(0.5))
ann.add(LSTM(units=42))
ann.add(Dropout(0.5))
ann.add(Dense(units=210,
kernel_initializer='normal',
activation='sigmoid'))
ann.add(Dropout(0.5))
ann.add(Dense(units=105,
kernel_initializer='normal',
activation='sigmoid'))
ann.add(Dropout(0.5))
ann.add(Dense(units=63,
kernel_initializer='normal',
activation='sigmoid'))
ann.add(Dropout(0.5))
ann.add(Dense(units=1,
kernel_initializer='normal',
activation='sigmoid'))
ann.compile(loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
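# Optional (not in the original post): inspect the stacked-LSTM architecture and parameter counts.
ann.summary()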
history = ann.fit(x = X_train_std,
y = y_train,
validation_split = 0.2,
epochs = 25,
batch_size = 42, verbose = 2)
def show_history(count, history, train, validation):
    plt.plot(range(1, count), history.history[train])
    plt.plot(range(1, count), history.history[validation])
    plt.title('Train history')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.xticks(range(1, count))
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
The training accuracy curve sits above the validation curve, which suggests some overfitting, so training is stopped early after only 25 epochs.
show_history(26, history, 'acc', 'val_acc')
By the same logic, the training loss curve falls below the validation loss curve, which again suggests overfitting, so training is stopped early after only 25 epochs.
show_history(26, history, 'loss', 'val_loss')
The test set gives an accuracy of 95.1%.
scores = ann.evaluate(x = X_test_std,
y = y_test)
print(f"Loss: {scores[0]}, Accuracy: {scores[1]}")
On the test set there are 3 false positives and 4 false negatives.
y_pred = ann.predict_classes(X_test_std)
y_pred = y_pred.reshape(y_test.shape)
print("Report:\n", metrics.classification_report(y_test, y_pred))
pd.crosstab(y_test, y_pred)