Sklearn Pipelines and XGBoost

Building ML Pipelines with Scikit-learn and XGBoost

Based on the following Kaggle tutorials:

  • https://www.kaggle.com/alexisbcook/pipelines
  • https://www.kaggle.com/alexisbcook/xgboost

The data used throughout is from Kaggle (the Mobile Price Classification dataset).

Pipelines

import pandas as pd

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
DATA_FILE = 'sample-data/mobile-price-classification/train.csv'

Import Data

df = pd.read_csv(DATA_FILE)
df.head()
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
0 842 0 2.2 0 1 0 7 0.6 188 2 2 20 756 2549 9 7 19 0 0 1 1
1 1021 1 0.5 1 0 1 53 0.7 136 3 6 905 1988 2631 17 3 7 1 1 0 2
2 563 1 0.5 1 2 1 41 0.9 145 5 6 1263 1716 2603 11 2 9 1 1 0 2
3 615 1 2.5 0 0 0 10 0.8 131 6 9 1216 1786 2769 16 8 11 1 0 0 2
4 1821 1 1.2 0 13 1 44 0.6 141 2 14 1208 1212 1411 8 2 15 1 1 0 1

Separate Variables

X = df.drop('price_range', axis=1)
y = df.price_range
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

Split the Columns by Data Type

Note that every column in this dataset is numeric, so categorical_cols will end up empty here, but the same selection pattern generalizes to mixed-type data.

categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_train.head()
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi
582 1232 0 2.9 1 1 1 24 0.3 169 5 17 361 809 1257 16 10 16 1 0 0
159 1840 0 0.5 1 12 0 34 0.7 142 1 16 311 1545 1078 8 0 10 0 0 0
1827 1692 0 2.1 0 4 1 2 0.9 106 1 17 1899 1904 3779 9 3 7 1 1 1
318 508 0 0.8 0 7 1 42 0.3 94 1 8 39 557 663 13 12 7 1 0 0
708 977 1 2.8 1 2 0 35 0.6 165 2 15 1502 1862 3714 19 3 10 0 1 1

Create Transformers for the Different Types of Data

numerical_transformer = SimpleImputer(strategy='constant')


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

Add these transformers to a Preprocessor

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Define Model

model = RandomForestRegressor(n_estimators=100, random_state=0)

Create Pipeline

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

Run the Pipeline

pipeline.fit(X_train, y_train)
Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['battery_power', 'blue',
                                                   'clock_speed', 'dual_sim',
                                                   'fc', 'four_g', 'int_memory',
                                                   'm_dep...
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=None,
                                       oob_score=False, random_state=0,
                                       verbose=0, warm_start=False))],
         verbose=False)
predictions = pipeline.predict(X_valid)
results = X_valid.copy()  # copy so we don't mutate the validation features
results['predicted'] = predictions
results['actual'] = y_valid
results['diff'] = abs(results['predicted'] - results['actual'])
results.head(10)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi predicted actual diff
405 1454 1 0.5 1 1 0 34 0.7 83 4 3 250 1033 3419 7 5 5 1 1 0 2.98 3 0.02
1190 1092 1 0.5 1 10 0 11 0.5 167 3 14 468 571 737 14 4 11 0 1 0 0.00 0 0.00
1132 1524 1 1.8 1 0 0 10 0.6 174 4 1 154 550 2678 16 5 13 1 0 1 1.97 2 0.03
731 1807 1 2.1 0 2 0 49 0.8 125 1 10 337 1384 1906 17 13 13 0 1 1 1.69 2 0.31
1754 1086 1 1.7 1 0 1 43 0.2 111 6 1 56 1150 3285 11 5 17 1 1 0 2.70 2 0.70
1178 909 1 0.5 1 9 0 30 0.4 97 3 10 290 773 594 12 0 4 1 1 1 0.00 0 0.00
1533 642 1 0.5 0 0 1 38 0.8 86 5 10 887 1775 435 9 2 2 1 1 0 0.08 0 0.08
1303 888 0 2.6 1 2 1 33 0.4 198 2 17 327 1683 3407 12 1 20 1 0 0 2.68 3 0.32
1857 914 1 0.7 0 1 1 60 0.9 198 5 4 740 840 3736 14 8 5 1 0 0 2.84 3 0.16
18 1131 1 0.5 1 11 0 49 0.6 101 5 18 658 878 1835 19 13 16 1 1 0 0.95 1 0.05

Get the Mean Absolute Error

The MAE tells us, on average, how far our predictions fall from the actual values. In this case it means the model returns a result within about 0.17 of the actual result on average (our target, price_range, takes values from 0 to 3).

score = mean_absolute_error(y_valid, predictions)
print('MAE:', score)

XGBoost

We'll use the same data as above

df = pd.read_csv(DATA_FILE)

X = df.drop('price_range', axis=1)
y = df.price_range

X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Train the Model

Using the Default Parameters

1. Create Model Instance

Below we create a model instance with no parameters, so everything uses its default value, and we can train it like so:

model = XGBRegressor()

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

pipeline.fit(X_train, y_train)

Some of the params we can set are:

  • n_estimators is essentially how many trees we want in the ensemble; this is usually between 100 and 1000, but the right value depends on the learning rate
  • learning_rate scales how much each new tree contributes to the ensemble; by default this is 0.1, and choosing a lower value (typically paired with more estimators) can help prevent overfitting
  • early_stopping_rounds is the number of rounds without improvement after which training stops adding trees; it requires a set of validation data, passed via eval_set, to measure improvement against, and a good starting value is early_stopping_rounds=5 (a standalone sketch follows this list)
  • objective is a string or function that lets us specify the objective/type of model we would like to build - a list of objectives can be found in the XGBoost documentation
  • If using a multi-class (multi:softmax) objective, you also have to state the number of classes, e.g. num_class=4
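To make the early-stopping mechanics concrete before adding the pipeline back in, here is a minimal sketch using the X_train/X_valid split from above (this dataset is all numeric, so the sketch skips preprocessing; note that newer XGBoost versions take early_stopping_rounds in the constructor rather than in fit):

# minimal early-stopping sketch: training stops once the validation score
# hasn't improved for 5 consecutive rounds
sketch_model = XGBRegressor(n_estimators=500, learning_rate=0.05)
sketch_model.fit(X_train, y_train,
                 early_stopping_rounds=5,
                 eval_set=[(X_valid, y_valid)],
                 verbose=False)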

Below we'll use a slightly more complex model configuration:

model = XGBRegressor(n_estimators=1000, learning_rate=0.1, objective='multi:softmax', num_class=4)

2. Add the Model to a Pipeline

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

3. Train the Pipeline

  • Note that we need to pre-format our eval_set data so that it has the preprocessing steps applied, keeping its structure aligned with what the model sees during training
  • We also need to prefix any parameters that we want passed on to our model with model__ so that the pipeline routes them to the correct step
# fit the preprocessor on the training data, then apply it to the
# validation set so eval_set matches what the model sees during training
preprocessor.fit(X_train)
X_valid_transformed = preprocessor.transform(X_valid)
pipeline.fit(X_train, y_train, 
                model__early_stopping_rounds=20, 
                model__eval_set=[(X_valid_transformed, y_valid)],
                model__verbose=False)
Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['battery_power', 'blue',
                                                   'clock_speed', 'dual_sim',
                                                   'fc', 'four_g', 'int_memory',
                                                   'm_dep...
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain', learning_rate=0.1,
                              max_delta_step=0, max_depth=3, min_child_weight=1,
                              missing=None, n_estimators=1000, n_jobs=1,
                              nthread=None, num_class=4,
                              objective='multi:softmax', random_state=0,
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                              seed=None, silent=None, subsample=1,
                              verbosity=1))],
         verbose=False)
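Because early stopping was used, we can read back the boosting round that scored best on the validation set, which is presumably where the fixed n_estimators=190 used in the cross-validation section below comes from (the exact attribute name can vary between XGBoost versions):

# the boosting round that produced the best validation score;
# best_iteration is set on the model when early stopping is used
best_round = pipeline.named_steps['model'].best_iteration
print('Best iteration:', best_round)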

4. Predict using the Pipeline

predictions = pipeline.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

Cross Validation

We can do cross-validation using the cross_val_score function from sklearn by:

  1. Defining the pipeline
  2. Defining the number of folds
  3. Defining the model
  4. Applying the cross-validation to the pipeline

1. Define the Pipeline

df = pd.read_csv(DATA_FILE)

X = df.drop('price_range', axis=1)
y = df.price_range

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
# n_estimators based on the best round found via early stopping above
model = XGBRegressor(n_estimators=190, learning_rate=0.1, objective='multi:softmax', num_class=4)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])

validation_result = cross_val_score(pipeline, X, y, cv=3)
validation_result
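validation_result holds one score per fold. By default cross_val_score uses the estimator's default scorer (R² for a regressor); to stay consistent with the MAE used earlier, we could request it explicitly, remembering that scikit-learn negates error metrics so that higher is always better:

# request MAE explicitly; sklearn returns it negated (higher = better),
# so multiply by -1 to recover the familiar positive MAE
mae_scores = -1 * cross_val_score(pipeline, X, y, cv=3,
                                  scoring='neg_mean_absolute_error')
print('MAE per fold:', mae_scores)
print('Average MAE:', mae_scores.mean())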