Load data

#%reset
import pickle
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

fin = 'training_data/merged_data.pkl'
with open(fin, 'rb') as fp:
    [xtr, xts, ytr, yts] = pickle.load(fp)

# Important: split the training and validation sets from the original training data!
xtr, xval, ytr, yval = train_test_split(xtr, ytr, shuffle=True, test_size=0.20)

Standardize the training and test data

#standardize
from sklearn.preprocessing import scale
xtr_scale = scale(xtr)
xts_scale = scale(xts)
xval_scale = scale(xval)
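
Note: `scale` standardizes each split with its own mean and variance, so the validation and test sets end up scaled with slightly different statistics than the training set. A minimal alternative sketch (the variable names `xtr_std`, `xval_std`, `xts_std` are illustrative only) fits a `StandardScaler` on the training data and reuses its statistics for the other splits:

# Alternative sketch: learn the scaling from the training split only,
# then apply the same transform to the validation and test splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
xtr_std = scaler.fit_transform(xtr)   # fit mean/std on training data
xval_std = scaler.transform(xval)     # reuse the training statistics
xts_std = scaler.transform(xts)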

SVM classifier

from sklearn import svm

def fit_SVM(svc, kernel, xtr, ytr, xval, yval):
    """Fit the SVM on the training data and return its validation accuracy.

    The kernel argument is informational only; the kernel is already set
    on the svc estimator that is passed in.
    """
    # Fit the data
    svc.fit(xtr, ytr)

    # Predict on the validation set and compute the accuracy
    yhat = svc.predict(xval)
    acc = np.mean(yhat == yval)

    return acc
        

Kernel: linear

C_test = [0.1,1,10,100]  

nC = len(C_test)
acc1 = np.zeros(nC)
kernel = "linear"
print("kernel: %s" %kernel)
# Measure and print the accuracy for each C value.
for i, C in enumerate(C_test):
    # Create the SVM
    # LinearSVC with hinge loss fails to converge on this data, so svm.SVC
    # with a linear kernel is used instead:
    #svc1 = svm.LinearSVC(loss='hinge', C=C, multi_class='ovr', max_iter=10000, verbose=1)
    svc1 = svm.SVC(probability=False,  kernel=kernel, C=C, verbose=1)
    acc1[i] = fit_SVM(svc1, kernel, xtr_scale, ytr, xval_scale, yval)
    print('C=%12.4e acc=%f' % (C,acc1[i]))  
    
kernel: linear
[LibSVM]C=  1.0000e-01 acc=0.870075
[LibSVM]C=  1.0000e+00 acc=0.872313
[LibSVM]C=  1.0000e+01 acc=0.873955
[LibSVM]C=  1.0000e+02 acc=0.873507
#evaluation
#find the highest score
print('The highest score of the linear kernel is: %0.4g' %np.max(acc1))
best_C = C_test[np.argmax(acc1)]
print('C for the best model is: %g' %best_C)
The highest score of the linear kernel is: 0.874
C for the best model is: 10
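The same search over C can also be written with `GridSearchCV`, which cross-validates each candidate instead of scoring on a single validation split; a minimal sketch (the `param_grid` and `cv=3` values here are illustrative, not part of the original run):

from sklearn.model_selection import GridSearchCV

# Sketch: 3-fold cross-validation over the same C grid as above.
param_grid = {'C': [0.1, 1, 10, 100]}
search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=3)
search.fit(xtr_scale, ytr)
print(search.best_params_, search.best_score_)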
#retrain the data using the best model and save the best model
import pickle

#best_C=10
kernel = "linear"
svc = svm.SVC(probability=False,  kernel=kernel, C=best_C, verbose=1)
print(svc)
svc.fit(xtr_scale,ytr)
yhat = svc.predict(xts_scale)
acc = np.mean(yhat == yts)
print('acc=%0.4g' %acc)

with open( "svm_linear_model.pkl", "wb" ) as fp:
    pickle.dump([svc, xtr_scale, ytr, xts_scale, yts], fp)
    
SVC(C=10, kernel='linear', verbose=1)
[LibSVM]acc=0.8727

Linear SVM feature selection

  • Feature importance
def f_importances(coef, names):
    imp = coef
    imp,names = zip(*sorted(zip(imp,names)))
    out = [imp,names]
    plt.barh(range(len(names)), imp, align='center')
    plt.yticks(range(len(names)), names)
    plt.xlabel('absolute weight')
    plt.ylabel('feature')
    plt.show()
    return out

    
features_names = ['1','2','3','4','5','6','7','8','9','10','11',
                  '12','13','14','15','16','17','18','19','20']
n_features = np.size(xtr_scale,1)
n_classes = len(np.unique(ytr))

clf_coef = np.abs(svc.coef_)
#print(clf_coef)

#call function
feature_imp0 = f_importances(clf_coef.sum(axis=0), features_names)

feature_imp = {}
key_name = feature_imp0[1]
for i, key in enumerate(key_name):
    feature_imp[key] = float(feature_imp0[0][i])
    
print(feature_imp)
{'14': 17.326699119538418, '12': 19.220412906912493, '11': 20.759248187715343, '20': 21.202262237548254, '19': 21.64898150546384, '18': 22.089297827139788, '3': 24.989935977300526, '13': 162.11064428220175, '8': 166.7677109430766, '1': 170.94155639360352, '7': 201.30511206388752, '4': 233.95847837354444, '15': 246.69017483537405, '2': 313.92746606696807, '10': 355.5232549718931, '16': 368.8027321206795, '9': 377.14263539979316, '17': 459.0395700345634, '5': 576.9777548498987, '6': 861.016594067992}
  • Feature selection (threshold: sum of |w| at least 22)
selected_feature0 = [int(k) for k, v in feature_imp.items() if np.abs(v)>=22]
print('The selected features are:', selected_feature0)

# Combining the feature correlation analysis and the results above, the final
# feature set is reduced to 12 columns:
selected_feature = [1,2,3,4,5,6,8,9,12,15,16,17]
xtr_select = xtr_scale[:, selected_feature]
xts_select = xts_scale[:, selected_feature]
The selected features are: [11, 20, 19, 18, 3, 13, 8, 1, 7, 4, 15, 2, 10, 16, 9, 17, 5, 6]
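A library-based alternative to the hand-picked cutoff is `SelectFromModel`, whose default `norm_order=1` importance is exactly the column-wise sum of |w| used above; a minimal sketch (variable names and the `threshold=22.0` value are illustrative):

from sklearn.feature_selection import SelectFromModel

# Sketch: reuse the fitted linear SVC; importance = sum of |w| per column.
selector = SelectFromModel(svc, prefit=True, threshold=22.0)
xtr_select_alt = selector.transform(xtr_scale)
xts_select_alt = selector.transform(xts_scale)
print('kept feature columns (zero-based):', np.flatnonzero(selector.get_support()))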
# training using the selected features (feature size reduced to 12)
kernel = "linear"
svc = svm.SVC(probability=False,  kernel=kernel, C=10, verbose=1)
print(svc)
svc.fit(xtr_select,ytr)
yhat = svc.predict(xts_select)
acc = np.mean(yhat == yts)
print('acc=%0.4g' %acc) 

with open( "svm_linear_feature_select_model.pkl", "wb" ) as fp:
    pickle.dump([svc, xtr_select, ytr, xts_select, yts], fp)
    
SVC(C=10, kernel='linear', verbose=1)
[LibSVM]acc=0.8742
# confusion matrix for the best model
from sklearn.metrics import confusion_matrix

CM = confusion_matrix(yts,yhat)
# Normalize the confusion matrix so each row (true class) sums to 1
CMsum = np.sum(CM,1)
CM = CM / CMsum[:,np.newaxis]

# Plot the confusion matrix
plt.imshow(CM, interpolation='none')
plt.colorbar()
plt.title('confusion matrix for SVM with linear kernel')
plt.xlabel('prediction')
plt.ylabel('true')

[Figure: normalized confusion matrix, SVM with linear kernel]
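Per-class precision and recall complement the confusion matrix; a minimal sketch using the test predictions `yhat` from the feature-selected model above:

from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test set.
print(classification_report(yts, yhat))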

Kernel: radial basis function

#training 
kernel = "rbf"
C_test = [1,10,100,500] 
gam_test = [0.001,0.01,0.1,0.5]  

nC = len(C_test)
ngam = len(gam_test)
acc2 = np.zeros((nC,ngam))
print("kernel: %s" %kernel)
# Measure and print the accuracy for each C and gamma value.  Store the results in acc2
for i, C in enumerate(C_test):
    for j, gam in enumerate(gam_test):
        # Create the SVC
        svc2 = svm.SVC(probability=False,  kernel=kernel, C=C, gamma=gam,verbose=1)
        acc2[i,j] = fit_SVM(svc2, kernel, xtr_scale, ytr, xval_scale, yval)
        print('C=%12.4e gam=%12.4e acc=%f' % (C,gam,acc2[i,j]))   

kernel: rbf
[LibSVM]C=  1.0000e+00 gam=  1.0000e-03 acc=0.796567
[LibSVM]C=  1.0000e+00 gam=  1.0000e-02 acc=0.871940
[LibSVM]C=  1.0000e+00 gam=  1.0000e-01 acc=0.891791
[LibSVM]C=  1.0000e+00 gam=  5.0000e-01 acc=0.775821
[LibSVM]C=  1.0000e+01 gam=  1.0000e-03 acc=0.864403
[LibSVM]C=  1.0000e+01 gam=  1.0000e-02 acc=0.890672
[LibSVM]C=  1.0000e+01 gam=  1.0000e-01 acc=0.887463
[LibSVM]C=  1.0000e+01 gam=  5.0000e-01 acc=0.779701
[LibSVM]C=  1.0000e+02 gam=  1.0000e-03 acc=0.883358
[LibSVM]C=  1.0000e+02 gam=  1.0000e-02 acc=0.894925
[LibSVM]C=  1.0000e+02 gam=  1.0000e-01 acc=0.874403
[LibSVM]C=  1.0000e+02 gam=  5.0000e-01 acc=0.774701
[LibSVM]C=  5.0000e+02 gam=  1.0000e-03 acc=0.889254
[LibSVM]C=  5.0000e+02 gam=  1.0000e-02 acc=0.892910
[LibSVM]C=  5.0000e+02 gam=  1.0000e-01 acc=0.870896
[LibSVM]C=  5.0000e+02 gam=  5.0000e-01 acc=0.773358
#evaluation
#find the highest score
gam_grid, C_grid = np.meshgrid(gam_test, C_test)
print('The highest score of rbf kernel is: %0.3g' %np.max(acc2))
best_gam = gam_grid.reshape(-1)[np.argmax(acc2)]
best_C = C_grid.reshape(-1)[np.argmax(acc2)]
print('C for the best model is: %g' %best_C)
print('gamma for the best model is: %g' %best_gam)

The highest score of rbf kernel is: 0.895
C for the best model is: 100
gamma for the best model is: 0.01
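The full (C, gamma) grid can also be inspected visually as a heat map of the validation accuracies stored in `acc2`; a minimal sketch:

# Sketch: validation accuracy over the (C, gamma) grid; rows are C values.
plt.imshow(acc2, interpolation='none', aspect='auto')
plt.colorbar()
plt.xticks(range(len(gam_test)), gam_test)
plt.yticks(range(len(C_test)), C_test)
plt.xlabel('gamma')
plt.ylabel('C')
plt.title('validation accuracy (rbf kernel)')
plt.show()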
#retrain the data using the best model and save the best model
kernel = "rbf"

svc = svm.SVC(probability=False, kernel=kernel, C=best_C, gamma=best_gam,verbose=1)
print(svc)
svc.fit(xtr_scale,ytr)
yhat = svc.predict(xts_scale)
acc = np.mean(yhat == yts)
print('acc=%0.4g' %acc) 

with open( "svm_rbf_model.pkl", "wb" ) as fp:
    pickle.dump([svc, xtr_scale, ytr, xts_scale, yts], fp)
    
SVC(C=100, gamma=0.01, verbose=1)
[LibSVM]acc=0.8955
# confusion matrix for the best model
CM = confusion_matrix(yts,yhat)
# Normalize the confusion matrix so each row (true class) sums to 1
CMsum = np.sum(CM,1)
CM = CM / CMsum[:,np.newaxis]

# Print the confusion matrix
#print(np.array_str(CM, precision=3, suppress_small=True))

# Plot the confusion matrix
plt.imshow(CM, interpolation='none')
plt.colorbar()
plt.title('confusion matrix for SVM with rbf kernel')
plt.xlabel('prediction')
plt.ylabel('true')
[Figure: normalized confusion matrix, SVM with rbf kernel]

Kernel: polynomial

kernel = "poly"
C_test = [1,10,100,500] 
deg_test = [2,3,4]

nC = len(C_test)
ndeg = len(deg_test)
acc3 = np.zeros((nC,ndeg))
print("kernel: %s" %kernel)
# Measure and print the accuracy for each C and degree value.  Store the results in acc3
for i, C in enumerate(C_test):
    for j, deg in enumerate(deg_test):
        # Create the SVC
        svc3 = svm.SVC(probability=False,  kernel=kernel, C=C, degree=deg, verbose=1)
        acc3[i,j] = fit_SVM(svc3, kernel, xtr_scale, ytr, xval_scale, yval)
        print('C=%12.4e deg=%12.4e acc=%f' % (C,deg,acc3[i,j]))   

kernel: poly
[LibSVM]C=  1.0000e+00 deg=  2.0000e+00 acc=0.837761
[LibSVM]C=  1.0000e+00 deg=  3.0000e+00 acc=0.829328
[LibSVM]C=  1.0000e+00 deg=  4.0000e+00 acc=0.714701
[LibSVM]C=  1.0000e+01 deg=  2.0000e+00 acc=0.866045
[LibSVM]C=  1.0000e+01 deg=  3.0000e+00 acc=0.872612
[LibSVM]C=  1.0000e+01 deg=  4.0000e+00 acc=0.814179
[LibSVM]C=  1.0000e+02 deg=  2.0000e+00 acc=0.869104
[LibSVM]C=  1.0000e+02 deg=  3.0000e+00 acc=0.873881
[LibSVM]C=  1.0000e+02 deg=  4.0000e+00 acc=0.837687
[LibSVM]C=  5.0000e+02 deg=  2.0000e+00 acc=0.868881
[LibSVM]C=  5.0000e+02 deg=  3.0000e+00 acc=0.865149
[LibSVM]C=  5.0000e+02 deg=  4.0000e+00 acc=0.829403
#evaluation
#find the highest score
deg_grid, C_grid = np.meshgrid(deg_test, C_test)
print('The highest accuracy of polynomial kernel is: %0.4g' %np.max(acc3))
best_deg = deg_grid.ravel()[np.argmax(acc3)]
best_C = C_grid.ravel()[np.argmax(acc3)]
print('C for the best model is: %g' %best_C)
print('degree for the best model is: %g' %best_deg)
The highest accuracy of polynomial kernel is: 0.8739
C for the best model is: 100
degree for the best model is: 3
#retrain the data using the best model and save the best model
kernel = "poly"

svc = svm.SVC(probability=False, kernel=kernel, C=best_C, degree=best_deg, verbose=1)
print(svc)
svc.fit(xtr_scale,ytr)
yhat = svc.predict(xts_scale)
acc = np.mean(yhat == yts)
print('acc=%0.4g' %acc) 

with open( "svm_poly_model.pkl", "wb" ) as fp:
    pickle.dump([svc, xtr_scale, ytr, xts_scale, yts], fp)
    
SVC(C=100, kernel='poly', verbose=1)
[LibSVM]acc=0.8734
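The pickled models saved above can be reloaded later without retraining; a minimal sketch for the RBF model (variable names are illustrative, the file name matches the one saved above):

# Reload a saved model and re-check its test accuracy.
import pickle

with open("svm_rbf_model.pkl", "rb") as fp:
    svc_loaded, xtr_s, ytr_s, xts_s, yts_s = pickle.load(fp)

acc = np.mean(svc_loaded.predict(xts_s) == yts_s)
print('reloaded model acc=%0.4g' % acc)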

Note: this is my lab assignment for a machine learning course.

Last update: 12/04/2020