Principal Component Analysis (PCA): feature extraction & dimensionality reduction¶

Representation of handwritten digit data (scikit-learn's 8x8 digits set, a small MNIST-style data set) in 2-dimensional space.

In [1]:
from sklearn.datasets import load_digits # Load the handwritten digits data set bundled with scikit-learn
from sklearn.decomposition import PCA    # PCA module; the reduction below uses Singular Value Decomposition (SVD) directly
import numpy as np                       # For NumPy's SVD function
import matplotlib.pyplot as plt          # Matplotlib for plotting

Data Loading¶

In [2]:
digits = load_digits()                      # Load the digits data: 8x8 images, flattened to 64 values
data, label = digits.data, digits.target   # Separate into two parts: the images and their labels

print(data.shape)                           # Shape of the data: (n_samples, n_features)
print(data[0], label[0])                    # First image and its label
print(data[-1], label[-1])                  # Last image and its label
(1797, 64)
[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
 15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
  0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
  0.  0.  0.  0.  6. 13. 10.  0.  0.  0.] 0
[ 0.  0. 10. 14.  8.  1.  0.  0.  0.  2. 16. 14.  6.  1.  0.  0.  0.  0.
 15. 15.  8. 15.  0.  0.  0.  0.  5. 16. 16. 10.  0.  0.  0.  0. 12. 15.
 15. 12.  0.  0.  0.  4. 16.  6.  4. 16.  6.  0.  0.  8. 16. 10.  8. 16.
  8.  0.  0.  1.  8. 12. 14. 12.  1.  0.] 8
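Each row is a flattened 8x8 image with pixel intensities from 0 to 16. As a quick sanity check, a minimal sketch (using digits.images, the unflattened view that load_digits also provides) shows that reshaping a row recovers the image:

In [ ]:
# Sanity check: each 64-value row is a flattened 8x8 image
first_image = data[0].reshape(8, 8)                    # Undo the flattening
print(np.array_equal(first_image, digits.images[0]))   # True: same pixel values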

Group the data by label¶

In [3]:
from enum import Enum

label_ = Enum('label_', {'ZERO': 0, 'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4,
                         'FIVE': 5, 'SIX': 6, 'SEVEN': 7,'EIGHT': 8,'NINE': 9})

data_ = [np.empty(0) for _ in range(len(label_))]   # One flat array per digit class

for d, l in zip(data, label):
    data_[l] = np.append(data_[l], d)               # Append each 64-value image to its class

print(data_[label_.EIGHT.value].shape)                         # Flat array of 174 * 64 values
print(np.reshape(data_[label_.EIGHT.value], (-1, 8*8)).shape)  # Reshaped to one row per image
(11136,)
(174, 64)
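As an aside, the same grouping needs no append loop: boolean indexing on the label array keeps the two-dimensional shape directly. A minimal sketch:

In [ ]:
# Equivalent vectorized grouping: select every row whose label is 8
eights = data[label == 8]   # Boolean mask keeps the (n_samples, 64) shape
print(eights.shape)         # (174, 64), no reshape needed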

Plotting one sample from each class¶

In [4]:
fig, ax = plt.subplots(2, 5, figsize=(10, 5))
for item in label_:
    samples = np.reshape(data_[item.value], (-1, 8*8))   # One row per image
    ax.flatten()[item.value].imshow(samples[0].reshape(8, 8))

Dimensionality reduction of all data from 64 to 2 dimensions¶

Then we can plot all of the data in 2-dimensional space.

In [5]:
#############################################################################
# First: compute the mean of each of the 64 features
mean = data.mean(axis=0)

#############################################################################
# Second: center the data by subtracting the per-feature mean
data = data - mean

#############################################################################
# Singular Value Decomposition: X = U S Vh
# (np.linalg.svd returns Vh = V^T, whose rows are orthonormal)
# Multiplying both sides by Vh^T gives X Vh^T = U S,
# i.e. the data rotated onto the principal axes.

K = 2  # Number of principal components to keep

U, S, Vh = np.linalg.svd(data)

# X_rot = U S restricted to the first K columns, which represent the
# first and second principal components; equivalent to np.dot(data, Vh[:K].T)
X_rot = np.dot(U[:, :K], np.diag(S[:K]))
plt.figure(figsize=(6,4))
plt.plot(X_rot[:,0], X_rot[:,1], 'y.')
Out[5]:
[<matplotlib.lines.Line2D at 0x7febf522fe20>]
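As a cross-check, the PCA class imported at the top reproduces this projection. A minimal sketch (principal directions are only defined up to sign, so individual components may be flipped relative to X_rot):

In [ ]:
# Cross-check the manual SVD against scikit-learn's PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(data)                   # PCA centers the data internally
print(np.allclose(np.abs(X_pca), np.abs(X_rot)))  # Should print True: equal up to sign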

Let's plot two digit classes: three and four¶

1) Data points with a red circle show the digit three. 2) Data points with a blue triangle show the digit four.

  • Now you can separate the two classes with reasonable confidence by drawing a line!
In [6]:
# Plot all data points as yellow dots
plt.figure(figsize=(6,4))
plt.plot(X_rot[:,0], X_rot[:,1], 'y.')

# Highlight the two classes we want to compare
for item, l in zip(X_rot, label):
    # Red circle if the label is 3
    if l == 3:
        plt.plot(item[0], item[1], 'ro')
    # Blue triangle if the label is 4
    if l == 4:
        plt.plot(item[0], item[1], 'b^')
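The per-point loop works, but the same overlay needs only one plot call per class when X_rot is indexed with a boolean mask over label. A minimal sketch:

In [ ]:
# Same overlay without the per-point loop: boolean masks on the label array
plt.figure(figsize=(6,4))
plt.plot(X_rot[:,0], X_rot[:,1], 'y.')                      # All data: yellow dots
plt.plot(X_rot[label == 3, 0], X_rot[label == 3, 1], 'ro')  # Threes: red circles
plt.plot(X_rot[label == 4, 0], X_rot[label == 4, 1], 'b^')  # Fours: blue triangles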
In [7]:
print(len(X_rot))
1797

Support Vector Machine (SVM): classification of the data¶

In [8]:
from sklearn import svm

def make_meshgrid(x, y, h=.02):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = min(x) - 1, max(x) + 1
    y_min, y_max = min(y) - 1, max(y) + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

# Keep only the digits 3 and 4, represented by their first two principal components
data_2D   = []
its_label = []
for item, L in zip(X_rot, label):
    if L == 3 or L == 4:
        data_2D.append([item[0], item[1]])
        its_label.append(L)

# We create an instance of each SVM and fit our data. We do not scale our
# data since we want to plot the support vectors.
models = (svm.SVC(kernel='linear'),
          svm.SVC(kernel='poly', degree=3),
          svm.SVC(kernel='rbf', gamma=0.7))
models = (clf.fit(data_2D, its_label) for clf in models)

# title for the plots
titles = ('Linear kernel',
          'Polynomial kernel',
          'RBF kernel')

# Set up a 1x3 grid for plotting.
fig, sub = plt.subplots(1, 3)
fig.set_figheight(4)
fig.set_figwidth(12)

PC1 = [i[0] for i in data_2D]
PC2 = [i[1] for i in data_2D]

xx, yy = make_meshgrid(PC1, PC2)

for clf, title, ax in zip(models, titles, sub.flatten()):
    plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
    ax.scatter(PC1, PC2, c=its_label, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel('1st Principal Component')
    ax.set_ylabel('2nd Principal Component')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()
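The boundaries above are drawn on the training data itself. For a rough quantitative comparison, here is a minimal sketch using a held-out split; the 0.25 test fraction and random_state are arbitrary choices, not part of the original:

In [ ]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data_2D, its_label, test_size=0.25, random_state=0)

for title, clf in zip(titles, (svm.SVC(kernel='linear'),
                               svm.SVC(kernel='poly', degree=3),
                               svm.SVC(kernel='rbf', gamma=0.7))):
    clf.fit(X_train, y_train)
    print(title, clf.score(X_test, y_test))   # Mean accuracy on held-out data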