Iris Dataset
Load the required packages and set the random seed to 1.
%matplotlib inline
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import cross_val_score
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn import metrics
from matplotlib.colors import ListedColormap
import numpy as np
np.random.seed(1)
Load the Iris dataset, check for NaN values, and choose petal length (cm) and petal width (cm) as the features for this experiment.
data = load_iris(as_frame = True)
print(data.data.isnull().sum())
X = data.data.iloc[:,[2, 3]]
y = data.target
Standardize all the features.
sc = StandardScaler()
sc.fit(X)
X_std = sc.transform(X)
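As a quick sanity check (this snippet is an addition, not part of the original post), each scaled column should now have a mean close to 0 and a standard deviation close to 1.
# Verify the effect of StandardScaler: column means ~ 0, standard deviations ~ 1
print(X_std.mean(axis=0))
print(X_std.std(axis=0))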
Use Linear Discriminant Analysis (LDA) to project the features onto the directions that best separate the three classes.
lda = LDA(n_components = 2)
lda.fit(X_std, y)
X_std_lda = lda.transform(X_std)
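As an optional check (not in the original post), the explained_variance_ratio_ attribute of the fitted lda object shows how much of the between-class variance each discriminant captures.
# Fraction of between-class variance captured by each LDA component
print(lda.explained_variance_ratio_)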
Plot the LDA-transformed data, using a different color and marker for each of the three kinds of flowers.
plt.scatter(X_std_lda[y == 0, 0], X_std_lda[y == 0, 1],\
color="red", marker="^", alpha=0.5)
plt.scatter(X_std_lda[y == 1, 0], X_std_lda[y == 1, 1],\
color="green", marker="s", alpha=0.5)
plt.scatter(X_std_lda[y == 2, 0], X_std_lda[y == 2, 1],\
color="blue", marker="o", alpha=0.5)
plt.show()
Because the Iris dataset is small, hold out 30% of the samples as a stratified test set.
X_train_std_lda, X_test_std_lda, y_train, y_test = train_test_split(\
X_std_lda, y, test_size = 0.3, stratify = y)
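Because stratify = y is passed, the class proportions are preserved in both splits. A small added check (using the variables above) confirms this:
# With 150 samples and a 30% test split, expect 35 per class in train and 15 per class in test
print(np.bincount(y_train))
print(np.bincount(y_test))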
Next, train a Support Vector Machine (SVM). The classes overlap in the transformed space, so an RBF kernel is used to handle the non-linear decision boundary. One sample in the test set is misclassified.
svc = SVC(kernel = "rbf", gamma = 1, C = 10, random_state = 1)
svc.fit(X_train_std_lda, y_train)
y_pred = svc.predict(X_test_std_lda)
print(f"misclassified: {(y_test != y_pred).sum()}")
from sklearn.metrics import accuracy_score
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
Find the misclassified test sample and mark it with a large "x".
plt.scatter(X_std_lda[y == 0, 0], X_std_lda[y == 0, 1],\
color="red", marker="^", alpha=0.5)
plt.scatter(X_std_lda[y == 1, 0], X_std_lda[y == 1, 1],\
color="green", marker="s", alpha=0.5)
plt.scatter(X_std_lda[y == 2, 0], X_std_lda[y == 2, 1],\
color="blue", marker="o", alpha=0.5)
plt.scatter(X_test_std_lda[y_test != y_pred, 0], X_test_std_lda[y_test != y_pred, 1],\
color="black", marker="x", s=1000, alpha=0.5, linewidth=2.0)
plt.show()
Evaluating the SVM with k-fold cross-validation on the whole dataset gives an accuracy of 0.973 +/- 0.013.
scores = cross_val_score(estimator=svc,\
X=X_std_lda,\
y=y,\
n_jobs=-1)
print("CV accuracy scores: %s" % scores)
print("CV accuracy: %.3f +/- %.3f" % (np.mean(scores),\
np.std(scores)))
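For a classifier, cross_val_score defaults to a 5-fold StratifiedKFold without shuffling. Making the splitter explicit is one way to control shuffling and reproducibility; the snippet below is a sketch, and the shuffle and random_state settings are my own choices rather than part of the original experiment.
from sklearn.model_selection import StratifiedKFold
# Explicit stratified 5-fold splitter with shuffling for reproducible folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores_skf = cross_val_score(estimator=svc, X=X_std_lda, y=y, cv=skf, n_jobs=-1)
print("CV accuracy: %.3f +/- %.3f" % (np.mean(scores_skf), np.std(scores_skf)))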
Check the training set for misclassifications as well: one training sample is misclassified, bringing the total to two misclassified samples in the dataset so far.
y_pred = svc.predict(X_train_std_lda)
print(f"Misclassified: {(y_train != y_pred).sum()}")
print(f"Accuracy: {accuracy_score(y_train, y_pred)}")
Two samples overlap at the same point, and that point is marked with a large "x".
plt.scatter(X_std_lda[y == 0, 0], X_std_lda[y == 0, 1],\
color="red", marker="^", alpha=0.5)
plt.scatter(X_std_lda[y == 1, 0], X_std_lda[y == 1, 1],\
color="green", marker="s", alpha=0.5)
plt.scatter(X_std_lda[y == 2, 0], X_std_lda[y == 2, 1],\
color="blue", marker="o", alpha=0.5)
plt.scatter(X_train_std_lda[y_train != y_pred, 0], X_train_std_lda[y_train != y_pred, 1],\
color="black", marker="x", s=1000, alpha=0.5, linewidth=2.0)
plt.show()
The following helper function plots the decision regions of a classifier.
def plot_decision_regions(X, y, classifier, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # plot the decision surface over a grid spanning the feature range
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    # plot class samples
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.6,
                    color=cmap(idx),
                    edgecolor='black',
                    marker=markers[idx],
                    label=cl)
The regions for classes 0 and 1 are too small, which indicates that the SVM is overfitting.
plot_decision_regions(X_std_lda, y, classifier=svc)
plt.xlabel('X')
plt.ylabel('Y')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()
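One way to confirm the overfitting diagnosis is to retrain the SVM with a smaller gamma and C and re-plot the decision regions, which should become broader and smoother. This is a sketch; the parameter values below are illustrative and not tuned.
# A softer RBF kernel and weaker penalty smooth the decision boundary
svc_smooth = SVC(kernel="rbf", gamma=0.2, C=1.0, random_state=1)
svc_smooth.fit(X_train_std_lda, y_train)
plot_decision_regions(X_std_lda, y, classifier=svc_smooth)
plt.xlabel('X')
plt.ylabel('Y')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()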
Now switch to XGBoost's XGBClassifier. Because this is a multi-class problem, eval_metric is set to merror. One sample in the test set is still misclassified, but the k-fold accuracy reaches about 98%, better than the SVM.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='merror', seed=1)
xgb.fit(X_train_std_lda, y_train)
y_pred = xgb.predict(X_test_std_lda)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(xgb, X_std_lda, y)
print("XGBClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_test, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred))
Next, check the training set: one training sample is misclassified. The k-fold accuracy is still about 98%, better than the SVM.
y_pred = xgb.predict(X_train_std_lda)
accuracy = accuracy_score(y_train, y_pred)
print("Accuracy on test data: {:.2f}".format(accuracy))
scores = cross_val_score(xgb, X_std_lda, y)
print("XGBClassifier KFold Accuracy: {:.2f}".format(scores.mean()))
print("Report:\n", metrics.classification_report(y_train, y_pred))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_train, y_pred))
Looking at the decision-region plot, the three regions are separated reasonably, and the fit to the dataset looks suitable.
plot_decision_regions(X_std_lda, y, classifier=xgb)
plt.xlabel('X')
plt.ylabel('Y')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()