Sklearn Pipelines and XGBoost
Building ML Pipelines with Scikit-learn and XGBoost
- https://www.kaggle.com/alexisbcook/pipelines
- https://www.kaggle.com/alexisbcook/xgboost
Data from Kaggle
Pipelines
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
DATA_FILE = 'sample-data/mobile-price-classification/train.csv'
Import Data
df = pd.read_csv(DATA_FILE)
df.head()
| | battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | pc | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842 | 0 | 2.2 | 0 | 1 | 0 | 7 | 0.6 | 188 | 2 | 2 | 20 | 756 | 2549 | 9 | 7 | 19 | 0 | 0 | 1 | 1 |
1 | 1021 | 1 | 0.5 | 1 | 0 | 1 | 53 | 0.7 | 136 | 3 | 6 | 905 | 1988 | 2631 | 17 | 3 | 7 | 1 | 1 | 0 | 2 |
2 | 563 | 1 | 0.5 | 1 | 2 | 1 | 41 | 0.9 | 145 | 5 | 6 | 1263 | 1716 | 2603 | 11 | 2 | 9 | 1 | 1 | 0 | 2 |
3 | 615 | 1 | 2.5 | 0 | 0 | 0 | 10 | 0.8 | 131 | 6 | 9 | 1216 | 1786 | 2769 | 16 | 8 | 11 | 1 | 0 | 0 | 2 |
4 | 1821 | 1 | 1.2 | 0 | 13 | 1 | 44 | 0.6 | 141 | 2 | 14 | 1208 | 1212 | 1411 | 8 | 2 | 15 | 1 | 1 | 0 | 1 |
Separate Variables
X = df.drop('price_range', axis=1)
y = df.price_range
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
Split Columns by Data Type
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_train.head()
| | battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | pc | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
582 | 1232 | 0 | 2.9 | 1 | 1 | 1 | 24 | 0.3 | 169 | 5 | 17 | 361 | 809 | 1257 | 16 | 10 | 16 | 1 | 0 | 0 |
159 | 1840 | 0 | 0.5 | 1 | 12 | 0 | 34 | 0.7 | 142 | 1 | 16 | 311 | 1545 | 1078 | 8 | 0 | 10 | 0 | 0 | 0 |
1827 | 1692 | 0 | 2.1 | 0 | 4 | 1 | 2 | 0.9 | 106 | 1 | 17 | 1899 | 1904 | 3779 | 9 | 3 | 7 | 1 | 1 | 1 |
318 | 508 | 0 | 0.8 | 0 | 7 | 1 | 42 | 0.3 | 94 | 1 | 8 | 39 | 557 | 663 | 13 | 12 | 7 | 1 | 0 | 0 |
708 | 977 | 1 | 2.8 | 1 | 2 | 0 | 35 | 0.6 | 165 | 2 | 15 | 1502 | 1862 | 3714 | 19 | 3 | 10 | 0 | 1 | 1 |
Create Transformers for the different types of data
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
Add these transformers to a Preprocessor
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
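If we want to peek at what the preprocessor produces, we can fit and transform the training data on its own; a small optional sketch (the pipeline below will refit the preprocessor anyway). In this dataset every column is numeric, so the categorical branch selects nothing:
# Optional sanity check: the transformed output should have one column per
# selected feature (all numerical here, since there are no object-dtype columns)
print(preprocessor.fit_transform(X_train).shape)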
Define Model
model = RandomForestRegressor(n_estimators=100, random_state=0)
Create Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)])
Run the Pipeline
pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', ...]),
                                                 ('cat', ...)])),
                ('model',
                 RandomForestRegressor(n_estimators=100, random_state=0))])
predictions = pipeline.predict(X_valid)
results = X_valid.copy()
results['predicted'] = predictions
results['actual'] = y_valid
results['diff'] = abs(results['predicted'] - results['actual'])
results.head(10)
| | battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | pc | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | predicted | actual | diff |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
405 | 1454 | 1 | 0.5 | 1 | 1 | 0 | 34 | 0.7 | 83 | 4 | 3 | 250 | 1033 | 3419 | 7 | 5 | 5 | 1 | 1 | 0 | 2.98 | 3 | 0.02 |
1190 | 1092 | 1 | 0.5 | 1 | 10 | 0 | 11 | 0.5 | 167 | 3 | 14 | 468 | 571 | 737 | 14 | 4 | 11 | 0 | 1 | 0 | 0.00 | 0 | 0.00 |
1132 | 1524 | 1 | 1.8 | 1 | 0 | 0 | 10 | 0.6 | 174 | 4 | 1 | 154 | 550 | 2678 | 16 | 5 | 13 | 1 | 0 | 1 | 1.97 | 2 | 0.03 |
731 | 1807 | 1 | 2.1 | 0 | 2 | 0 | 49 | 0.8 | 125 | 1 | 10 | 337 | 1384 | 1906 | 17 | 13 | 13 | 0 | 1 | 1 | 1.69 | 2 | 0.31 |
1754 | 1086 | 1 | 1.7 | 1 | 0 | 1 | 43 | 0.2 | 111 | 6 | 1 | 56 | 1150 | 3285 | 11 | 5 | 17 | 1 | 1 | 0 | 2.70 | 2 | 0.70 |
1178 | 909 | 1 | 0.5 | 1 | 9 | 0 | 30 | 0.4 | 97 | 3 | 10 | 290 | 773 | 594 | 12 | 0 | 4 | 1 | 1 | 1 | 0.00 | 0 | 0.00 |
1533 | 642 | 1 | 0.5 | 0 | 0 | 1 | 38 | 0.8 | 86 | 5 | 10 | 887 | 1775 | 435 | 9 | 2 | 2 | 1 | 1 | 0 | 0.08 | 0 | 0.08 |
1303 | 888 | 0 | 2.6 | 1 | 2 | 1 | 33 | 0.4 | 198 | 2 | 17 | 327 | 1683 | 3407 | 12 | 1 | 20 | 1 | 0 | 0 | 2.68 | 3 | 0.32 |
1857 | 914 | 1 | 0.7 | 0 | 1 | 1 | 60 | 0.9 | 198 | 5 | 4 | 740 | 840 | 3736 | 14 | 8 | 5 | 1 | 0 | 0 | 2.84 | 3 | 0.16 |
18 | 1131 | 1 | 0.5 | 1 | 11 | 0 | 49 | 0.6 | 101 | 5 | 18 | 658 | 878 | 1835 | 19 | 13 | 16 | 1 | 1 | 0 | 0.95 | 1 | 0.05 |
Get the Mean Absolute Error
The MAE tells us, on average, how far a prediction falls from the actual value. Here it means the model's predictions land within about 0.17 of the actual result on average (in our case the target values range between 0 and 3).
score = mean_absolute_error(y_valid, predictions)
print('MAE:', score)
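The same number can be recomputed by hand from the `diff` column built above, since the MAE is just the mean absolute difference between predictions and actuals:
# Sanity check: MAE is the mean of |predicted - actual|, i.e. the mean of the diff column
print('MAE (from diff column):', results['diff'].mean())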
XGBoost
We'll use the same data as above
df = pd.read_csv(DATA_FILE)
X = df.drop('price_range', axis=1)
y = df.price_range
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and
X_train_full[cname].dtype == "object"]
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
Train the Model
Using the Default Parameters
1. Create Model Instance
Below is an example of a model instance created with no parameters, so everything uses the defaults, and we can train it like so:
model = XGBRegressor()
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)])
pipeline.fit(X_train, y_train)
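Before tuning anything we can get a baseline MAE from this default configuration, using the same validation split as above (a small optional check):
# Baseline score for the default XGBRegressor inside the pipeline
default_predictions = pipeline.predict(X_valid)
print('MAE (default parameters):', mean_absolute_error(y_valid, default_predictions))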
Some of the parameters we can set are:
- `n_estimators` is essentially how many models we want in the ensemble; this is usually between 100 and 1000, but is impacted by the learning rate
- `learning_rate` is how much we want the model to retain between passes; by default this is `0.1`, but we can choose a lower value, which means the model retains less per pass and can help us prevent overfitting
- `early_stopping_rounds` is the number of rounds without improvement after which we want the model to stop adding to `n_estimators`; this is done by giving it a set of evaluation data via `eval_set`, which it will use to optimize against; a good value is `early_stopping_rounds=5` (see the sketch below this list)
- `objective` is a string or function that lets us specify the objective/type of model we would like to build; a list of `objective`s can be found here
- If using a multi-class (`multi:softmax`) classifier you also have to state the number of classes, e.g. `num_class=4`
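For illustration, here is a minimal sketch of those parameters on a bare `XGBRegressor` outside of a pipeline; the variable name and the `learning_rate=0.05` value are just examples, and depending on your xgboost version `early_stopping_rounds` may need to be passed to the constructor rather than to `fit()`:
# Standalone sketch: many estimators, a modest learning rate, and early stopping
# against the validation split; training stops once the evaluation score has not
# improved for 5 consecutive rounds
sketch_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
sketch_model.fit(X_train, y_train,
                 early_stopping_rounds=5,
                 eval_set=[(X_valid, y_valid)],
                 verbose=False)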
Below we'll use a slightly more complex model configuration:
model = XGBRegressor(n_estimators=1000, learning_rate=0.1, objective='multi:softmax', num_class=4)
2. Add the Model to a Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('model', model)])
3. Train the Pipeline
- Note that we need to pre-format our `eval_set` data so that it has the preprocessing steps applied and the data structures are aligned
- We also need to prefix any parameters that we want passed on to our model with `model__` so that the pipeline passes them to the correct step
# Fit the preprocessor on the training data so the eval_set is transformed
# in the same way as the data the pipeline is trained on
preprocessor.fit(X_train)
X_valid_transformed = preprocessor.transform(X_valid)
pipeline.fit(X_train, y_train,
model__early_stopping_rounds=20,
model__eval_set=[(X_valid_transformed, y_valid)],
model__verbose=False)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', ...]),
                                                 ('cat', ...)])),
                ('model',
                 XGBRegressor(learning_rate=0.1, n_estimators=1000, num_class=4,
                              objective='multi:softmax', random_state=0))])
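Because we trained with early stopping, the fitted xgboost model records the best round it found on the `eval_set`; this is presumably where a value like the `n_estimators=190` used in the cross-validation section below comes from. A small sketch, reaching into the pipeline by step name:
# The trained XGBRegressor sits inside the pipeline under the step name 'model';
# with early stopping it exposes the best round found on the eval_set
fitted_model = pipeline.named_steps['model']
print('Best iteration:', fitted_model.best_iteration)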
4. Predict using the Pipeline
predictions = pipeline.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))
Cross Validation
We can do cross-validation using the `cross_val_score` function from `sklearn` by:
- Defining the pipeline
- Defining the number of folds
- Defining the model
- Applying the cross-validation to the pipeline
1. Define the Pipeline
df = pd.read_csv(DATA_FILE)
X = df.drop('price_range', axis=1)
y = df.price_range
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, numerical_cols),
('cat', categorical_transformer, categorical_cols)
])
# n_estimators based on the value found by early stopping in the previous section
model = XGBRegressor(n_estimators=190, learning_rate=0.1, objective='multi:softmax', num_class=4)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])
validation_result = cross_val_score(pipeline, X, y, cv=3)
validation_result
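Note that for a regressor `cross_val_score` defaults to the estimator's own score (R²); if we want fold scores comparable with the MAE used above, we can pass sklearn's built-in `neg_mean_absolute_error` scorer and average the folds:
# sklearn negates error metrics so that "higher is better" holds for every scorer,
# hence the leading minus sign to recover plain MAE values
mae_scores = -1 * cross_val_score(pipeline, X, y, cv=3,
                                  scoring='neg_mean_absolute_error')
print('MAE per fold:', mae_scores)
print('Average MAE:', mae_scores.mean())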