Data from Kaggle

# Pipelines

import pandas as pd

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
DATA_FILE = 'sample-data/mobile-price-classification/train.csv'

## Import Data

# Load the Kaggle mobile-price training data and preview the first rows.
df = pd.read_csv(filepath_or_buffer=DATA_FILE)
df.head()
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
0 842 0 2.2 0 1 0 7 0.6 188 2 2 20 756 2549 9 7 19 0 0 1 1
1 1021 1 0.5 1 0 1 53 0.7 136 3 6 905 1988 2631 17 3 7 1 1 0 2
2 563 1 0.5 1 2 1 41 0.9 145 5 6 1263 1716 2603 11 2 9 1 1 0 2
3 615 1 2.5 0 0 0 10 0.8 131 6 9 1216 1786 2769 16 8 11 1 0 0 2
4 1821 1 1.2 0 13 1 44 0.6 141 2 14 1208 1212 1411 8 2 15 1 1 0 1

## Separate Variables

# Target is price_range; everything else is a feature.
y = df['price_range']
X = df.drop(columns=['price_range'])

# 80/20 train/validation split, seeded for reproducibility.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

## Split Data types

# Low-cardinality object columns get one-hot treatment; for this dataset
# every column is numeric, so this list comes back empty.
categorical_cols = [
    col for col in X_train_full.columns
    if X_train_full[col].dtype == "object" and X_train_full[col].nunique() < 10
]

# All int/float columns go through the numeric transformer.
numerical_cols = [
    col for col in X_train_full.columns
    if X_train_full[col].dtype in ("int64", "float64")
]
my_cols = categorical_cols + numerical_cols

# Copy so later column additions never touch the *_full frames.
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_train.head()
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi
582 1232 0 2.9 1 1 1 24 0.3 169 5 17 361 809 1257 16 10 16 1 0 0
159 1840 0 0.5 1 12 0 34 0.7 142 1 16 311 1545 1078 8 0 10 0 0 0
1827 1692 0 2.1 0 4 1 2 0.9 106 1 17 1899 1904 3779 9 3 7 1 1 1
318 508 0 0.8 0 7 1 42 0.3 94 1 8 39 557 663 13 12 7 1 0 0
708 977 1 2.8 1 2 0 35 0.6 165 2 15 1502 1862 3714 19 3 10 0 1 1

## Create Transformers for the different types of data

# Numeric features: fill any missing values with a constant (0 by default).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: impute the most frequent value, then one-hot
# encode, ignoring categories unseen during fit.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

## Add these transformers to a Preprocessor

# Route each column list through its matching transformer; columns not
# listed in either group are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

## Define Model

# 100-tree random forest; fixed seed makes runs reproducible.
model = RandomForestRegressor(random_state=0, n_estimators=100)

## Create Pipeline

# Chain preprocessing and the model so fit/predict run both steps.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

## Run the Pipeline

# Fit preprocessing and the forest in one call.
pipeline.fit(X=X_train, y=y_train)
Pipeline(memory=None,
steps=[('preprocessor',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
copy=True,
fill_value=None,
missing_values=nan,
strategy='constant',
verbose=0),
['battery_power', 'blue',
'clock_speed', 'dual_sim',
'fc', 'four_g', 'int_memory',
'm_dep...
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
max_samples=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None,
oob_score=False, random_state=0,
verbose=0, warm_start=False))],
verbose=False)
# Score the held-out set and build a side-by-side comparison frame.
predictions = pipeline.predict(X_valid)

# Copy rather than alias: `results = X_valid` would add the predicted/
# actual/diff columns to the validation features themselves.
results = X_valid.copy()
results['predicted'] = predictions
results['actual'] = y_valid
results['diff'] = (results['predicted'] - results['actual']).abs()
results.head(10)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi predicted actual diff
405 1454 1 0.5 1 1 0 34 0.7 83 4 3 250 1033 3419 7 5 5 1 1 0 2.98 3 0.02
1190 1092 1 0.5 1 10 0 11 0.5 167 3 14 468 571 737 14 4 11 0 1 0 0.00 0 0.00
1132 1524 1 1.8 1 0 0 10 0.6 174 4 1 154 550 2678 16 5 13 1 0 1 1.97 2 0.03
731 1807 1 2.1 0 2 0 49 0.8 125 1 10 337 1384 1906 17 13 13 0 1 1 1.69 2 0.31
1754 1086 1 1.7 1 0 1 43 0.2 111 6 1 56 1150 3285 11 5 17 1 1 0 2.70 2 0.70
1178 909 1 0.5 1 9 0 30 0.4 97 3 10 290 773 594 12 0 4 1 1 1 0.00 0 0.00
1533 642 1 0.5 0 0 1 38 0.8 86 5 10 887 1775 435 9 2 2 1 1 0 0.08 0 0.08
1303 888 0 2.6 1 2 1 33 0.4 198 2 17 327 1683 3407 12 1 20 1 0 0 2.68 3 0.32
1857 914 1 0.7 0 1 1 60 0.9 198 5 4 740 840 3736 14 8 5 1 0 0 2.84 3 0.16
18 1131 1 0.5 1 11 0 49 0.6 101 5 18 658 878 1835 19 13 16 1 1 0 0.95 1 0.05

## Get the Mean Absolute Error

The MAE tells us how far, on average, a prediction falls from the actual value. Here it means the model's predictions land within about 0.17 of the true result on average (in our case the target `price_range` takes values from 0 to 3).

# Mean absolute error on the held-out validation set.
score = mean_absolute_error(y_true=y_valid, y_pred=predictions)
print('MAE:', score)
MAE: 0.171375


## XGBoost

We'll use the same data as above

df = pd.read_csv(DATA_FILE)

X = df.drop('price_range', axis=1)
y = df.price_range

# FIX: bind the *_full names here. The original split into X_train/X_valid
# but then selected columns from the stale X_train_full/X_valid_full left
# over from the previous section — that only worked because the seed made
# both splits identical, and it breaks if this section runs on its own.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Low-cardinality object columns (none in this dataset).
categorical_cols = [cname for cname in X_train_full.columns
                    if X_train_full[cname].nunique() < 10
                    and X_train_full[cname].dtype == "object"]

# All int/float columns.
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

my_cols = categorical_cols + numerical_cols

# Copies so later mutation never touches the *_full frames.
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Re-create the same transformers and preprocessor used in the first
# section so this part of the notebook stands on its own.
numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

### Using the Default Parameters

#### 1. Create Model Instance

Below we have an example of a model instance created using no parameters, so everything is defaulted, and we can train that like so:

# Baseline XGBoost regressor with every hyperparameter left at its default.
model = XGBRegressor()

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

pipeline.fit(X=X_train, y=y_train)


Some of the params we can set are:

• n_estimators which is essentially how many models we want in the ensemble, this is usually between 100 and 1000 but is impacted by the learning rate
• learning_rate is how much we want the model to retain between passes, by default this is 0.1, but we can choose a lower value which will mean the model retains less, this can help us to prevent overfitting
• early_stopping_rounds is the number of consecutive rounds of deteriorating validation score after which the model stops adding estimators; it needs a set of evaluation data (eval_set) to measure against, and a good value is early_stopping_rounds=5
• objective is a string or function that lets us specify the objective/type of model we would like to build - a list of objectives can be found here
• If using a multi-class (multi:softmax) classifier you also have to state the number of classes as num_class=4

Below we'll use a slightly more complex model configuration

# Multi-class softmax objective over the 4 price_range classes; the large
# n_estimators cap is fine because early stopping (below) trims it.
model = XGBRegressor(objective='multi:softmax', num_class=4,
                     n_estimators=1000, learning_rate=0.1)

#### 2. Add the Model to a Pipeline

# Same two-step shape as before, now wrapping the tuned XGBoost model.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

#### 3. Train the Pipeline

• Note that we need to pre-transform our eval_set data so that it has the preprocessing steps applied and the data structures are aligned
• We also need to prefix any inputs that we want passed on to our model with model__ so that the pipeline passes it to the correct object
# FIX: fit the standalone preprocessor on the TRAINING data, not X_valid.
# pipeline.fit below refits this same preprocessor on X_train, so the
# eval_set must be transformed by a train-fitted preprocessor to guarantee
# matching columns — fitting on X_valid also leaks validation information.
preprocessor.fit(X_train)
X_valid_transformed = preprocessor.transform(X_valid)

# model__ prefixes route these kwargs to the 'model' step of the pipeline.
pipeline.fit(X_train, y_train,
             model__early_stopping_rounds=20,
             model__eval_set=[(X_valid_transformed, y_valid)],
             model__verbose=False)
Pipeline(memory=None,
steps=[('preprocessor',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('num',
copy=True,
fill_value=None,
missing_values=nan,
strategy='constant',
verbose=0),
['battery_power', 'blue',
'clock_speed', 'dual_sim',
'fc', 'four_g', 'int_memory',
'm_dep...
colsample_bylevel=1, colsample_bynode=1,
colsample_bytree=1, gamma=0,
importance_type='gain', learning_rate=0.1,
max_delta_step=0, max_depth=3, min_child_weight=1,
missing=None, n_estimators=1000, n_jobs=1,
objective='multi:softmax', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
seed=None, silent=None, subsample=1,
verbosity=1))],
verbose=False)

#### 4. Predict using the Pipeline

# Predict on the validation split and report the mean absolute error.
predictions = pipeline.predict(X_valid)
mae = mean_absolute_error(predictions, y_valid)
print("Mean Absolute Error: " + str(mae))
Mean Absolute Error: 0.0575


## Cross Validation

We can do cross-validation using the cross_val_score function from sklearn by:

1. Defining the pipeline
2. Defining the number of folds
3. Defining the model
4. Applying the cross-validation to the pipeline

### 1. Define the Pipeline

df = pd.read_csv(DATA_FILE)

X = df.drop('price_range', axis=1)
y = df.price_range

numerical_transformer = SimpleImputer(strategy='constant')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# n_estimators set to the best round found by early stopping above.
model = XGBRegressor(n_estimators=190, learning_rate=0.1,
                     objective='multi:softmax', num_class=4)

# FIX: the model above was previously never used — cross_val_score was
# handed the stale pipeline from the earlier section (n_estimators=1000).
# Build a fresh pipeline around this model so the tuned setting applies.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# 3-fold cross-validation over the full dataset (default regressor
# scoring, i.e. R^2 for each fold).
validation_result = cross_val_score(pipeline, X, y, cv=3)
validation_result