Classification of Wisconsin breast cancer dataset
Loading the dataset and the required packages.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics
Load the dataset as a pandas DataFrame and Series: the DataFrame holds the features and the Series holds the target. isnull() checks for NaN values. train_test_split builds the training and test sets; the default test size is 0.25 (25%), random_state is set to 1, and the split is stratified on the target.
data = load_breast_cancer(as_frame=True)
print(data.data.isnull().sum())
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
In a past experiment, 400 estimators achieved sufficient accuracy; increasing the number of estimators further caused the accuracy to drop.
clf_e = ExtraTreesClassifier(n_estimators=400, random_state=1)
clf_e.fit(X_train, y_train)
print("Accuracy on test data: {:.2f}".format(clf_e.score(X_test, y_test)))
scores = cross_val_score(clf_e, X, y)
print("ExtraTrees KFold Accuracy: {:.2f}".format(scores.mean()))
RandomForestClassifier needs more estimators than ExtraTreesClassifier.
clf_r = RandomForestClassifier(n_estimators=600, random_state=1)
clf_r.fit(X_train, y_train)
print("Accuracy on test data: {:.2f}".format(clf_r.score(X_test, y_test)))
scores = cross_val_score(clf_r, X, y)
print("Random Forest KFold Accuracy: {:.2f}".format(scores.mean()))
ExtraTreesClassifier has an attribute called feature_importances_. Sort the importances in descending order and collect every feature whose score is greater than or equal to 0.01; these are the top-K important features. At the same time, record all features in a separate list.
indices_all = []
indices_e = []
extra_importance_sorted_idx = np.argsort(clf_e.feature_importances_)[::-1]
for idx in extra_importance_sorted_idx:
    print("%-30s %.8f" % (data.feature_names[idx], clf_e.feature_importances_[idx]))
    if clf_e.feature_importances_[idx] >= 0.01:
        indices_e.append([data.feature_names[idx], idx])
    indices_all.append([data.feature_names[idx], idx])
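A bar chart makes the 0.01 cutoff easier to see. This plotting snippet is an optional illustration, not part of the original pipeline:
# Sorted importances with the 0.01 selection threshold drawn in.
plt.figure(figsize=(10, 4))
plt.bar(range(len(extra_importance_sorted_idx)),
        clf_e.feature_importances_[extra_importance_sorted_idx])
plt.xticks(range(len(extra_importance_sorted_idx)),
           data.feature_names[extra_importance_sorted_idx], rotation=90)
plt.axhline(0.01, color="gray", linestyle="--")
plt.tight_layout()
plt.show()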
RandomForestClassifier has an attribute with the same name, feature_importances_. The routine is the same: sort in descending order and collect the features whose score is greater than or equal to 0.01.
indices_r = []
forest_importance_sorted_idx = np.argsort(clf_r.feature_importances_)[::-1]
for idx in forest_importance_sorted_idx:
    print("%-30s %.8f" % (data.feature_names[idx], clf_r.feature_importances_[idx]))
    if clf_r.feature_importances_[idx] >= 0.01:
        indices_r.append([data.feature_names[idx], idx])
Dimensionality reduction: combine the two importance lists, skipping duplicate feature names while merging. The new dataset has 21 features; the original dataset has 30.
indices_list = []
for name, _ in indices_e:
    indices_list.append(name)
for name, _ in indices_r:
    # Skip names already selected by ExtraTrees.
    if name not in indices_list:
        indices_list.append(name)
print(f"Feature Name: {indices_list}")
print(f"Feature Total: {len(indices_list)}")
Transform the feature names back into column indices, then build the reduced training and test sets.
indices = []
feature_names = list(data.feature_names)
for name in indices_list:
    # Look up each selected name's column position in the original dataset.
    indices.append(feature_names.index(name))
print(f"Feature indices: {indices}")
print(f"Feature Total: {len(indices)}")
X_f = X.iloc[:, indices].values
X_train_f, X_test_f, y_train, y_test = train_test_split(X_f, y, random_state=1, stratify=y)
The XGBClassifier comes from XGBoost, a well-known package in the machine learning field. eval_metric is set to 'error', the binary classification error rate. The confusion matrix shows 3 false positives (FP) and 4 false negatives (FN), using only the 21 selected features.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='error', seed=1)
xgb.fit(X_train_f, y_train)
y_pred = xgb.predict(X_test_f)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(xgb, X, y)
print("XGBClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
GridSearchCV is a brute-force way of tuning hyperparameters. Over several hours, it automatically recommends a set of hyperparameters. After improving a few values manually, the confusion matrix shows 2 FP and 4 FN, again using only the 21 selected features.
param_test = {
    #'max_depth': [i for i in range(5, 13, 2)],
    #'min_child_weight': [i/10.0 for i in range(5, 20, 2)],
    #'gamma': [i/10.0 for i in range(-2, 7, 2)],
    #'subsample': [i/10.0 for i in range(4, 11, 2)],
    #'colsample_bytree': [i/10.0 for i in range(4, 11, 2)],
    #'reg_alpha': [0, 1e-2, 0.1, 1],
    #'reg_lambda': [0, 1e-2, 0.1, 1]
}
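The commented entries above were searched one group at a time rather than all at once (a full Cartesian product would be enormous). As a sketch of one such stage, using the same value ranges as the comments:
# Hypothetical first stage: tune tree shape, hold everything else at defaults.
stage_one = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='error', seed=1),
    param_grid={'max_depth': [i for i in range(5, 13, 2)],
                'min_child_weight': [i/10.0 for i in range(5, 20, 2)]},
    scoring='accuracy', cv=10, n_jobs=-1)
stage_one.fit(X_train_f, y_train)
print(stage_one.best_params_)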
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        base_score=0.5,
        booster='gbtree',
        colsample_bylevel=1,
        colsample_bynode=1,
        colsample_bytree=1,
        eval_metric='error',
        gamma=0.2,
        gpu_id=-1,
        importance_type='gain',
        interaction_constraints='',
        learning_rate=0.300000012,
        max_delta_step=0,
        max_depth=5,
        min_child_weight=1.5,
        missing=np.nan,
        monotone_constraints='()',
        n_estimators=100,
        n_jobs=-1,
        num_parallel_tree=1,
        random_state=1,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        subsample=0.8,
        tree_method='exact',
        use_label_encoder=False,
        validate_parameters=1,
        verbosity=None,
        objective='binary:logistic',
        seed=1),
    param_grid=param_test,
    scoring='accuracy',
    n_jobs=-1,
    cv=10)
gsearch.fit(X_train_f, y_train)
print("="*40)
print("gsearch.scorer_\t", gsearch.scorer_)
print("gsearch.best_params_\t", gsearch.best_params_)
print("gsearch.best_score_\t", gsearch.best_score_)
xgb_a = gsearch.best_estimator_
xgb_a.fit(X_train_f,y_train)
y_pred = xgb_a.predict(X_test_f)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(xgb_a, X, y)
print("XGBClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
Change the algorithm to a support vector machine and tune its hyperparameters via grid search.
Here, using all features and using the 21 selected features produce the same result.
pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train_f, y_train)
print(gs.best_score_)
print(gs.best_params_)
clf_s = gs.best_estimator_
clf_s.fit(X_train_f, y_train)
print('Test accuracy: %.3f' % clf_s.score(X_test_f, y_test))
The confusion matrix shows 1 FP and 3 FN, and the k-fold accuracy is 98%.
y_pred = clf_s.predict(X_test_f)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(pipe_svc, X, y, cv=10)
print("Scores\n", scores)
print("SVMClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
Let's see where the misclassified samples are.
# Class 0 (malignant) as red triangles, class 1 (benign) as green squares;
# misclassified test points are overdrawn with large black crosses.
plt.scatter(X_test[y_test == 0]["mean radius"], X_test[y_test == 0]["mean texture"],
            color="red", marker="^", alpha=0.5)
plt.scatter(X_test[y_test == 1]["mean radius"], X_test[y_test == 1]["mean texture"],
            color="green", marker="s", alpha=0.5)
plt.scatter(X_test[y_test != y_pred]["mean radius"], X_test[y_test != y_pred]["mean texture"],
            color="black", marker="x", s=1000, alpha=0.5, linewidth=2.0)
plt.xlabel("mean radius")
plt.ylabel("mean texture")
plt.show()
Deep Learning: Classification of Wisconsin breast cancer dataset
This code only runs under TensorFlow 1.15.x, so first check the TensorFlow and Keras versions.
import tensorflow as tf
print(f"Tensorflow: {tf.__version__}")
import keras
print(f"Keras: {keras.__version__}")
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
np.random.seed(1)
from tensorflow import set_random_seed
set_random_seed(1)
Only the 21 selected features are used. StandardScaler standardizes the features, yielding standardized training and test sets. For the LSTM layers, the data must be reshaped into a 3D array.
sc = StandardScaler()
sc.fit(X_train_f)  # fit on the training set only, so no test statistics leak in
X_train_std = sc.transform(X_train_f)
X_test_std = sc.transform(X_test_f)
X_train_std = np.reshape(X_train_std, (X_train_std.shape[0], X_train_std.shape[1], 1))
X_test_std = np.reshape(X_test_std, (X_test_std.shape[0], X_test_std.shape[1], 1))
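A quick shape check confirms the (samples, timesteps, features) layout the LSTM expects; each of the 21 standardized columns becomes one timestep of length 1:
print(X_train_std.shape)  # expected: (426, 21, 1)
print(X_test_std.shape)   # expected: (143, 21, 1)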
There are 4 LSTM layers, an architecture called a stacked LSTM. Some researchers have found that a deeper network works better than a large number of neurons per layer: a deep network tends to achieve higher performance and accuracy while needing fewer neurons overall.
ann = Sequential()
ann.add(LSTM(units=42, return_sequences=True, input_shape=(X_train_std.shape[1], 1)))
ann.add(Dropout(0.5))
ann.add(LSTM(units=42, return_sequences=True))
ann.add(Dropout(0.5))
ann.add(LSTM(units=42, return_sequences=True))
ann.add(Dropout(0.5))
ann.add(LSTM(units=42))
ann.add(Dropout(0.5))
ann.add(Dense(units=210, kernel_initializer='normal', activation='sigmoid'))
ann.add(Dropout(0.5))
ann.add(Dense(units=105, kernel_initializer='normal', activation='sigmoid'))
ann.add(Dropout(0.5))
ann.add(Dense(units=63, kernel_initializer='normal', activation='sigmoid'))
ann.add(Dropout(0.5))
ann.add(Dense(units=1, kernel_initializer='normal', activation='sigmoid'))
ann.compile(loss='binary_crossentropy',
            optimizer='adam',
            metrics=['accuracy'])
history = ann.fit(x=X_train_std,
                  y=y_train,
                  validation_split=0.2,
                  epochs=25,
                  batch_size=42,
                  verbose=2)
def show_history(count, history, train, validation):
    plt.plot(range(1, count), history.history[train])
    plt.plot(range(1, count), history.history[validation])
    plt.title('Train history')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.xticks(range(1, count))
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
The training accuracy curve sits above the validation curve, which suggests slight overfitting, so training is stopped early at only 25 epochs.
show_history(26, history, 'acc', 'val_acc')
By the same logic, the training loss curve sits below the validation curve, which also suggests overfitting; again, training stops early at only 25 epochs.
show_history(26, history, 'loss', 'val_loss')
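Here the early stop is applied by hand, simply capping training at 25 epochs. As an alternative sketch (not used for the results below; history_es and early_stop are illustrative names), Keras's EarlyStopping callback can watch the validation loss and stop automatically. restore_best_weights requires a reasonably recent Keras 2.x:
from keras.callbacks import EarlyStopping
# Stop once val_loss has not improved for 5 epochs; keep the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=5,
                           restore_best_weights=True)
history_es = ann.fit(x=X_train_std, y=y_train, validation_split=0.2,
                     epochs=100, batch_size=42, verbose=2,
                     callbacks=[early_stop])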
The model obtains 95.1% accuracy on the test set.
scores = ann.evaluate(x = X_test_std,
y = y_test)
print(f"Loss: {scores[0]}, Accuracy: {scores[1]}")
There are 4 FP and 3 FN on the test set.
y_pred = ann.predict_classes(X_test_std)
y_pred = y_pred.reshape(y_test.shape)
print("Report:\n", metrics.classification_report(y_test, y_pred))
pd.crosstab(y_test, y_pred)
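pd.crosstab acts as a quick confusion matrix here; naming the axes (an optional touch) makes the table easier to read:
pd.crosstab(y_test, y_pred, rownames=['true'], colnames=['predicted'])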