Good references:
Sklearn GridSearchCV

GridSearchCV parameter tuning

Python machine learning notes: GridSearchCV (grid search)

Usage and parameter tuning of GBDT, XGBoost, and LightGBM

XGBoost for data competitions: parameter tuning (complete workflow)

How to Develop Your First XGBoost Model in Python with scikit-learn

Using XGBoost with Scikit-learn

Ensemble Methods: Tuning a XGBoost model with Scikit-Learn

In this demo, we tune the hyperparameters of an XGBoost model sequentially with grid search, rather than tuning all hyperparameters at once.

Note:
Use model = xgb.XGBClassifier(**other_params) rather than model = xgb.XGBClassifier(other_params)!
Each time a parameter is tuned, the corresponding entry in other_params must be updated to the optimal value before tuning the next one!
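As a minimal illustration of the difference (the parameter values here are only placeholders):

import xgboost as xgb

other_params = {'objective': 'multi:softprob', 'random_state': 42}

# Correct: ** unpacks the dict into keyword arguments, so each entry becomes a parameter.
model = xgb.XGBClassifier(**other_params)

# Wrong: the whole dict would be passed as a single positional argument, so none of
# the intended parameters are actually set.
# model = xgb.XGBClassifier(other_params)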

Load data

#%reset
import numpy as np
import pickle 
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import xgboost as xgb
#import graphviz

fin = 'training_data/merged_data.pkl'
with open(fin, 'rb') as fp:
    [xtr, xts, ytr, yts] = pickle.load(fp)
    
#Important: split training and validation data from original training data!
xtr, xval, ytr, yval = train_test_split(xtr, ytr, test_size=0.20, random_state=42, shuffle=True)    

Default model training and testing

xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42, eval_metric="auc", subsample=0.8)

xgb_model.fit(xtr, ytr, verbose=10)
y_pred = xgb_model.predict(xval)

print(xgb_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)
#accuracy
accuracy = accuracy_score(yval, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 89.45%

Find an optimized model

Grid search function

from sklearn.model_selection import GridSearchCV
import sklearn.model_selection

# Stack training and validation data; PredefinedSplit tells GridSearchCV which rows belong to which fold.
X = np.vstack((xtr, xval))
y = np.hstack((ytr, yval))
ntr = len(ytr)
nval = len(yval)
# -1 = always in the training set, 0 = validation fold 0, so every candidate is
# evaluated on the same single train/validation split (hence "Fitting 1 folds"
# and std_test_score = 0 in the outputs below).
test_fold = np.hstack((-1*np.ones(ntr), np.zeros(nval)))
ps = sklearn.model_selection.PredefinedSplit(test_fold)

#hyper-parameters
Niter_test=[100, 150, 200, 400]
max_depth_test = [3,6,10,15]
lr_test = [0.01, 0.1, 0.2, 0.4]
gam_test = [0., 0.1, 0.2, 0.3]
reg_lambda_test = [1e-3, 1e-2, 0.1, 1, 10, 100]   #L2-norm
def gridsearch_cv(model, param_grid, X, y, ps, njobs): 
    clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=njobs, cv=ps, verbose=10) 
    print(clf)
    clf.fit(X, y) 
    #print('CV Results: ', clf.cv_results_) 
    #print('Best Params: ', clf.best_params_) 
    #print('Best Score: ', clf.best_score_) 
    return clf

Optimize n_estimators

n_estimators is the number of boosting iterations, i.e. the number of weak learners (trees).

other_params = {'objective': 'multi:softprob', 'random_state': 42, 'eval_metric': 'auc', 'subsample': 0.8}
model = xgb.XGBClassifier(**other_params)
#print(model)
#model = xgb.XGBClassifier(objective='multi:softprob', eval_metric="auc", random_state=42, subsample=0.8)

cv_params = {'n_estimators': Niter_test}
njobs = 4
clf = gridsearch_cv(model, cv_params, X, y, ps, njobs)
print(clf.cv_results_)
GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eval_metric='auc',
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='multi:softprob',
                                     random_state=42, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=0.8, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             n_jobs=4, param_grid={'n_estimators': [100, 150, 200, 400]},
             verbose=10)
Fitting 1 folds for each of 4 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  3.7min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  5.1min remaining:  5.1min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.1min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  9.1min finished


CV Results:  {'mean_fit_time': array([219.33839226, 299.2481606 , 362.04324269, 536.5468092 ]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([1.55304623, 2.2140727 , 3.15582633, 5.26072145]), 'std_score_time': array([0., 0., 0., 0.]), 'param_n_estimators': masked_array(data=[100, 150, 200, 400],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 100}, {'n_estimators': 150}, {'n_estimators': 200}, {'n_estimators': 400}], 'split0_test_score': array([0.89447761, 0.89544776, 0.89671642, 0.89746269]), 'mean_test_score': array([0.89447761, 0.89544776, 0.89671642, 0.89746269]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([4, 3, 2, 1])}
Best Params:  {'n_estimators': 400}
Best Score:  0.8974626865671642
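The candidate scores can also be compared side by side using the standard cv_results_ fields:

for params, score in zip(clf.cv_results_['params'], clf.cv_results_['mean_test_score']):
    print(params, '-> score = %.4f' % score)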

From the results above, n_estimators=200 and n_estimators=400 give nearly the same score (0.8967 vs. 0.8975), so we take n_estimators=200 to keep training faster.

Optimize max_depth

max_depth is the max depth of a tree. Increasing this value will make the model more complex and more likely to overfit.

other_params = {'objective': 'multi:softprob', 'random_state': 42, 'eval_metric': 'auc', 'subsample': 0.8, 
                'n_estimators':200}
model = xgb.XGBClassifier(**other_params)
#print(model)

cv_params = {'max_depth': max_depth_test}
njobs = 4
clf = gridsearch_cv(model, cv_params, X, y, ps, njobs)
print(clf.cv_results_)
GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eval_metric='auc',
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='multi:softprob',
                                     random_state=42, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=0.8, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             n_jobs=4, param_grid={'max_depth': [3, 6, 10, 15]}, verbose=10)
Fitting 1 folds for each of 4 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.3min remaining:  6.3min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.9min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  6.9min finished


{'mean_fit_time': array([250.310534  , 374.99056125, 404.42518497, 406.44820809]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([2.06299019, 3.03603101, 3.56867194, 2.97454596]), 'std_score_time': array([0., 0., 0., 0.]), 'param_max_depth': masked_array(data=[3, 6, 10, 15],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'max_depth': 3}, {'max_depth': 6}, {'max_depth': 10}, {'max_depth': 15}], 'split0_test_score': array([0.89626866, 0.89671642, 0.89880597, 0.89746269]), 'mean_test_score': array([0.89626866, 0.89671642, 0.89880597, 0.89746269]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([4, 3, 1, 2])}
print('Best Params: ', clf.best_params_) 
print('Best Score: %0.4g' %clf.best_score_)
Best Params:  {'max_depth': 10}
Best Score: 0.8988

From the results above, we can see that max_depth=10 gives the best score, 0.8988.

Optimize gamma and reg_lambda

gamma is the minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.

reg_lambda is the L2 regularization term on the weights. Increasing this value makes the model more conservative.
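For intuition, both parameters enter the split gain used by the tree booster (from the XGBoost paper):

gain = 1/2 * [ G_L^2/(H_L + lambda) + G_R^2/(H_R + lambda) - (G_L + G_R)^2/(H_L + H_R + lambda) ] - gamma

where G_L, G_R and H_L, H_R are the sums of first- and second-order gradients in the left and right child. A split is kept only if this gain is positive, so a larger gamma prunes more candidate splits, while a larger lambda (reg_lambda) shrinks every gain term and the leaf weights.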

other_params = {'objective': 'multi:softprob', 'random_state': 42, 'eval_metric': 'auc', 'subsample': 0.8, 
                'n_estimators':200, 'max_depth': 10}
model = xgb.XGBClassifier(**other_params)
#print(model)

cv_params = {'gamma': gam_test, 'reg_lambda': reg_lambda_test}
njobs = 4
clf = gridsearch_cv(model, cv_params, X, y, ps, njobs)

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eval_metric='auc',
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=10, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='multi:softprob',
                                     random_state=42, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=0.8, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             n_jobs=4,
             param_grid={'gamma': [0.0, 0.1, 0.2, 0.3],
                         'reg_lambda': [0.001, 0.01, 0.1, 1, 10, 100]},
             verbose=10)
Fitting 1 folds for each of 24 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed: 15.5min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed: 24.6min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed: 42.9min
[Parallel(n_jobs=4)]: Done  20 out of  24 | elapsed: 49.0min remaining:  9.8min
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed: 58.2min finished
print(clf.cv_results_)
print('Best Params: ', clf.best_params_) 
print('Best Score: %0.4g' %clf.best_score_)
{'mean_fit_time': array([404.28340697, 400.29925346, 401.53780079, 435.2927022 ,
       585.22974706, 713.03687501, 515.37082434, 514.35972023,
       488.64769959, 511.99894309, 586.02537799, 685.42307258,
       528.94938993, 529.09306049, 532.39049315, 554.98072243,
       617.56116581, 702.50300646, 559.10517311, 556.37984872,
       559.82931995, 554.74631739, 589.34720969, 545.09472537]), 'std_fit_time': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), 'mean_score_time': array([4.37557888, 4.66015029, 5.06264257, 4.09290457, 6.237679  ,
       9.16920233, 3.06602764, 3.12065005, 2.74554229, 3.0090003 ,
       5.09211946, 6.99369597, 2.73910427, 2.72157264, 2.6582725 ,
       2.91256833, 4.05202961, 5.6127677 , 2.80606484, 2.80110574,
       2.6645627 , 2.47576356, 2.82322264, 3.42771006]), 'std_score_time': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), 'param_gamma': masked_array(data=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.1, 0.1,
                   0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3,
                   0.3, 0.3],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_reg_lambda': masked_array(data=[0.001, 0.01, 0.1, 1, 10, 100, 0.001, 0.01, 0.1, 1, 10,
                   100, 0.001, 0.01, 0.1, 1, 10, 100, 0.001, 0.01, 0.1, 1,
                   10, 100],
             mask=[False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False,
                   False, False, False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'gamma': 0.0, 'reg_lambda': 0.001}, {'gamma': 0.0, 'reg_lambda': 0.01}, {'gamma': 0.0, 'reg_lambda': 0.1}, {'gamma': 0.0, 'reg_lambda': 1}, {'gamma': 0.0, 'reg_lambda': 10}, {'gamma': 0.0, 'reg_lambda': 100}, {'gamma': 0.1, 'reg_lambda': 0.001}, {'gamma': 0.1, 'reg_lambda': 0.01}, {'gamma': 0.1, 'reg_lambda': 0.1}, {'gamma': 0.1, 'reg_lambda': 1}, {'gamma': 0.1, 'reg_lambda': 10}, {'gamma': 0.1, 'reg_lambda': 100}, {'gamma': 0.2, 'reg_lambda': 0.001}, {'gamma': 0.2, 'reg_lambda': 0.01}, {'gamma': 0.2, 'reg_lambda': 0.1}, {'gamma': 0.2, 'reg_lambda': 1}, {'gamma': 0.2, 'reg_lambda': 10}, {'gamma': 0.2, 'reg_lambda': 100}, {'gamma': 0.3, 'reg_lambda': 0.001}, {'gamma': 0.3, 'reg_lambda': 0.01}, {'gamma': 0.3, 'reg_lambda': 0.1}, {'gamma': 0.3, 'reg_lambda': 1}, {'gamma': 0.3, 'reg_lambda': 10}, {'gamma': 0.3, 'reg_lambda': 100}], 'split0_test_score': array([0.89507463, 0.8958209 , 0.89537313, 0.89880597, 0.89955224,
       0.89843284, 0.8938806 , 0.89261194, 0.89410448, 0.89679104,
       0.89902985, 0.89783582, 0.89328358, 0.89134328, 0.89365672,
       0.89507463, 0.89776119, 0.89671642, 0.89283582, 0.89358209,
       0.89097015, 0.89537313, 0.89723881, 0.8961194 ]), 'mean_test_score': array([0.89507463, 0.8958209 , 0.89537313, 0.89880597, 0.89955224,
       0.89843284, 0.8938806 , 0.89261194, 0.89410448, 0.89679104,
       0.89902985, 0.89783582, 0.89328358, 0.89134328, 0.89365672,
       0.89507463, 0.89776119, 0.89671642, 0.89283582, 0.89358209,
       0.89097015, 0.89537313, 0.89723881, 0.8961194 ]), 'std_test_score': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0.]), 'rank_test_score': array([14, 11, 12,  3,  1,  4, 17, 22, 16,  8,  2,  5, 20, 23, 18, 14,  6,
        9, 21, 19, 24, 12,  7, 10])}
Best Params:  {'gamma': 0.0, 'reg_lambda': 10}
Best Score: 0.8996

From the results above, we can see that gamma=0 and the L2 regularization parameter reg_lambda=10 give the best score, 0.8996.

Optimize learning_rate

learning_rate (eta) is the step size shrinkage used in updates to prevent overfitting. After each boosting step we can directly get the weights of new features, and eta shrinks the feature weights to make the boosting process more conservative.
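Concretely, after boosting round t the prediction is

y_hat^(t)(x) = y_hat^(t-1)(x) + eta * f_t(x)

so a smaller learning_rate (eta) means each new tree contributes less, and more boosting rounds (n_estimators) are typically needed to reach the same fit.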

other_params = {'objective': 'multi:softprob', 'random_state': 42, 'eval_metric': 'auc', 'subsample': 0.8, 
                'n_estimators':200, 'max_depth': 10, 'gamma': 0, 'reg_lambda': 10}
model = xgb.XGBClassifier(**other_params)
print(model)

cv_params = {'learning_rate': lr_test}
njobs = 4
clf = gridsearch_cv(model, cv_params, X, y, ps, njobs)
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, eval_metric='auc',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=None,
              max_delta_step=None, max_depth=10, min_child_weight=None,
              missing=nan, monotone_constraints=None, n_estimators=200,
              n_jobs=None, num_parallel_tree=None, objective='multi:softprob',
              random_state=42, reg_alpha=None, reg_lambda=10,
              scale_pos_weight=None, subsample=0.8, tree_method=None,
              validate_parameters=None, verbosity=None)
GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eval_metric='auc',
                                     gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=10, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=200, n_jobs=None,
                                     num_parallel_tree=None,
                                     objective='multi:softprob',
                                     random_state=42, reg_alpha=None,
                                     reg_lambda=10, scale_pos_weight=None,
                                     subsample=0.8, tree_method=None,
                                     validate_parameters=None, verbosity=None),
             n_jobs=4, param_grid={'learning_rate': [0.01, 0.1, 0.2, 0.4]},
             verbose=10)
Fitting 1 folds for each of 4 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  9.3min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed: 10.6min remaining: 10.6min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 11.4min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed: 11.4min finished
print(clf.cv_results_)
print('Best Params: ', clf.best_params_) 
print('Best Score: %0.4g' %clf.best_score_)
{'mean_fit_time': array([664.7326026 , 671.92150831, 622.16494417, 548.31287384]), 'std_fit_time': array([0., 0., 0., 0.]), 'mean_score_time': array([5.62669063, 5.99123359, 7.20477128, 5.30254006]), 'std_score_time': array([0., 0., 0., 0.]), 'param_learning_rate': masked_array(data=[0.01, 0.1, 0.2, 0.4],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'learning_rate': 0.01}, {'learning_rate': 0.1}, {'learning_rate': 0.2}, {'learning_rate': 0.4}], 'split0_test_score': array([0.8819403 , 0.89940299, 0.90067164, 0.89955224]), 'mean_test_score': array([0.8819403 , 0.89940299, 0.90067164, 0.89955224]), 'std_test_score': array([0., 0., 0., 0.]), 'rank_test_score': array([4, 3, 1, 2])}
Best Params:  {'learning_rate': 0.2}
Best Score: 0.9007

From the results above, we can see that learning_rate=0.2 gives the best score, 0.9007.

In conclusion, the hyperparameter setting {n_estimators=200, max_depth=10, gamma=0, reg_lambda=10, learning_rate=0.2} gives the best XGBoost model among the configurations tested.
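As a side note, GridSearchCV refits the best candidate on all of X and y (training + validation) by default (refit=True), so the tuned model from the last search above could also be reused directly, e.g.:

best_model = clf.best_estimator_   # already refit with the best learning_rate and the other tuned parameters
y_pred_best = best_model.predict(xts)

Below, however, we retrain a fresh model on the training set only, matching the earlier baseline.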

Predict on the test data using the optimal settings

xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42, eval_metric="auc", subsample=0.8,
                             n_estimators=200, max_depth=10, gamma=0, reg_lambda=10, learning_rate = 0.2)

xgb_model.fit(xtr, ytr, verbose=10)
print(xgb_model)
y_pred = xgb_model.predict(xts)   #test data

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.2, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', random_state=42,
              reg_alpha=0, reg_lambda=10, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)
#accuracy
accuracy = accuracy_score(yts, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

#confusion matrix
CM = confusion_matrix(yts, y_pred)
# Normalize each row (true class) so it sums to 1
CMsum = np.sum(CM, 1)
CM = CM / CMsum[:, np.newaxis]

# Plot the confusion matrix
plt.imshow(CM, interpolation='none')
plt.colorbar()
plt.title('confusion matrix for XGBoost')
plt.xlabel('prediction')
plt.ylabel('true')

#importance plot
xgb.plot_importance(xgb_model)

#draw target tree: converts the target tree to a graphviz instance
#xgb.to_graphviz(xgb_model)
Accuracy: 89.82%

[Figure: normalized confusion matrix for XGBoost]

[Figure: feature importance plot from xgb.plot_importance]

Note that this is my lab assignment for a machine learning course!

Last update: 12/04/2020