More Tidymodels

Lecture 24

Dr. Colin Rundel

Hotels Data

Original data from Antonio, Almeida, and Nunes (2019), see data dictionary here

hotels = read_csv(
  'https://tidymodels.org/start/case-study/hotels.csv'
) |>
  mutate(
    across(where(is.character), as.factor)
  )

The data

glimpse(hotels)

Rows: 50,000
Columns: 23
$ hotel                          <fct> City_Hotel, City_Hotel, Resort_Hotel, Resort_Hotel, Re…
$ lead_time                      <dbl> 217, 2, 95, 143, 136, 67, 47, 56, 80, 6, 130, 27, 16, …
$ stays_in_weekend_nights        <dbl> 1, 0, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 4, …
$ stays_in_week_nights           <dbl> 3, 1, 5, 6, 4, 2, 2, 3, 4, 2, 2, 1, 2, 2, 1, 1, 2, 7, …
$ adults                         <dbl> 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, …
$ children                       <fct> none, none, none, none, none, none, children, children…
$ meal                           <fct> BB, BB, BB, HB, HB, SC, BB, BB, BB, BB, BB, BB, BB, BB…
$ country                        <fct> DEU, PRT, GBR, ROU, PRT, GBR, ESP, ESP, FRA, FRA, FRA,…
$ market_segment                 <fct> Offline_TA/TO, Direct, Online_TA, Online_TA, Direct, O…
$ distribution_channel           <fct> TA/TO, Direct, TA/TO, TA/TO, Direct, TA/TO, Direct, TA…
$ is_repeated_guest              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_cancellations         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ previous_bookings_not_canceled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reserved_room_type             <fct> A, D, A, A, F, A, C, B, D, A, A, D, A, D, A, A, D, A, …
$ assigned_room_type             <fct> A, K, A, A, F, A, C, A, D, A, D, D, A, D, A, A, D, A, …
$ booking_changes                <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, …
$ deposit_type                   <fct> No_Deposit, No_Deposit, No_Deposit, No_Deposit, No_Dep…
$ days_in_waiting_list           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 236, 0, 0, 0…
$ customer_type                  <fct> Transient-Party, Transient, Transient, Transient, Tran…
$ average_daily_rate             <dbl> 80.75, 170.00, 8.00, 81.00, 157.60, 49.09, 289.00, 82.…
$ required_car_parking_spaces    <fct> none, none, none, none, none, none, none, none, none, …
$ total_of_special_requests      <dbl> 1, 3, 2, 1, 4, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, …
$ arrival_date                   <date> 2016-09-01, 2017-08-25, 2016-11-19, 2016-04-26, 2016-…

Factors

map_int(hotels, ~ length(levels(.x))) |> keep(~ .x > 0) |> sort()

                      hotel                    children required_car_parking_spaces 
                          2                           2                           2 
               deposit_type               customer_type                        meal 
                          3                           4                           5 
       distribution_channel              market_segment          reserved_room_type 
                          5                           7                           9 
         assigned_room_type                     country 
                         10                         155

The model

Our goal is to develop a predictive model that is able to predict whether a booking will include children or not based on the other characteristics of the booking.

hotels |>
  count(children) |>
  mutate(prop = n/sum(n))

# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  4038 0.0808
2 none     45962 0.919

Stratifying the test/train split

set.seed(123)

hotel_split = initial_split(
  hotels, strata = children
)

hotel_train = training(hotel_split)
hotel_test = testing(hotel_split)

dim(hotel_train)

[1] 37500    23

dim(hotel_test)

[1] 12500    23

hotel_train |>
  count(children) |>
  mutate(prop = n/sum(n))

# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  3027 0.0807
2 none     34473 0.919

hotel_test |>
  count(children) |>
  mutate(prop = n/sum(n))

# A tibble: 2 × 3
  children     n   prop
  <fct>    <int>  <dbl>
1 children  1011 0.0809
2 none     11489 0.919

Logistic Regression model

show_engines("logistic_reg")

# A tibble: 7 × 2
  engine    mode          
  <chr>     <chr>         
1 glm       classification
2 glmnet    classification
3 LiblineaR classification
4 spark     classification
5 keras     classification
6 stan      classification
7 brulee    classification

lr_model = logistic_reg() |>
  set_engine("glm")

translate(lr_model)

Logistic Regression Model Specification (classification)

Computational engine: glm 

Model fit template:
stats::glm(formula = missing_arg(), data = missing_arg(), weights = missing_arg(), 
    family = stats::binomial)

Recipe

lr_recipe = recipe(children ~ ., data = hotel_train) |> 
  step_date(arrival_date, features = c("dow", "month"), label=TRUE) |> 
  step_mutate(
    season = case_when(
      arrival_date_month %in% c(12, 1, 2) ~ "Winter",
      arrival_date_month %in% c(3, 4, 5) ~ "Spring",
      arrival_date_month %in% c(6, 7, 8) ~ "Summer",
      arrival_date_month %in% c(9, 10, 11) ~ "Fall"
    ) |>
      factor(levels = c("Winter", "Spring", "Summer", "Fall"))
  ) |>
  step_rm(arrival_date, arrival_date_month) |> 
  step_rm(country) |>
  step_unknown(season) |> 
  step_dummy(all_nominal_predictors()) |> 
  step_zv(all_predictors())

lr_recipe |>
  prep() |>
  bake(new_data = hotel_train)

# A tibble: 37,500 × 56
   lead_time stays_in_weekend_nights stays_in_week_nights adults is_repeated_guest
       <dbl>                   <dbl>                <dbl>  <dbl>             <dbl>
 1         2                       0                    1      2                 0
 2        95                       2                    5      2                 0
 3        67                       2                    2      2                 0
 4        47                       0                    2      2                 0
 5        56                       0                    3      0                 0
 6         6                       2                    2      2                 0
 7       130                       1                    2      2                 0
 8        27                       0                    1      1                 0
 9        46                       0                    2      2                 0
10       423                       1                    1      2                 0
# ℹ 37,490 more rows
# ℹ 51 more variables: previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   booking_changes <dbl>, days_in_waiting_list <dbl>, average_daily_rate <dbl>,
#   total_of_special_requests <dbl>, children <fct>, hotel_Resort_Hotel <dbl>, meal_FB <dbl>,
#   meal_HB <dbl>, meal_SC <dbl>, meal_Undefined <dbl>, market_segment_Complementary <dbl>,
#   market_segment_Corporate <dbl>, market_segment_Direct <dbl>, market_segment_Groups <dbl>,
#   market_segment_Offline_TA.TO <dbl>, market_segment_Online_TA <dbl>, …

Workflow

( lr_work = workflow() |>
    add_model(lr_model) |>
    add_recipe(lr_recipe) 
)

══ Workflow ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────
7 Recipe Steps

• step_date()
• step_mutate()
• step_rm()
• step_rm()
• step_unknown()
• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Computational engine: glm

Fit

( lr_fit = lr_work |>
    fit(data = hotel_train) )

══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────
7 Recipe Steps

• step_date()
• step_mutate()
• step_rm()
• step_rm()
• step_unknown()
• step_dummy()
• step_zv()

── Model ───────────────────────────────────────────────────

Call:  stats::glm(formula = ..y ~ ., family = stats::binomial, data = data)

Coefficients:
                        (Intercept)  
                          17.121622  
                          lead_time  
                          -0.001154  
            stays_in_weekend_nights  
                           0.054898  
               stays_in_week_nights  
                          -0.038010  
                             adults  
                           0.645983  
                  is_repeated_guest  
                           0.429371  
             previous_cancellations  
                           0.245846  
     previous_bookings_not_canceled  
                           0.349532  
                    booking_changes  
                          -0.229274  
               days_in_waiting_list  
                           0.007381  
                 average_daily_rate  
                          -0.010432  
          total_of_special_requests  
                          -0.472896  
                 hotel_Resort_Hotel  
                           0.827708  
                            meal_FB  
                          -0.829468  
                            meal_HB  
                          -0.089316  
                            meal_SC  
                           1.276430  
                     meal_Undefined  
                          -0.290937  
       market_segment_Complementary  
                         -13.343605  
           market_segment_Corporate  
                         -12.231269  
              market_segment_Direct  
                         -13.002333  
              market_segment_Groups  
                         -11.959060  
       market_segment_Offline_TA.TO  
                         -13.358978  
           market_segment_Online_TA  
                         -13.489908  

...
and 70 more lines.

Tidy

lr_fit |>
  broom::tidy() |>
  arrange(p.value) |>
  print(n=100)

# A tibble: 56 × 5
   term                estimate std.error statistic  p.value
   <chr>                  <dbl>     <dbl>     <dbl>    <dbl>
 1 average_daily_rate  -1.04e-2   5.14e-4 -20.3     1.40e-91
 2 total_of_special_r… -4.73e-1   2.64e-2 -17.9     9.93e-72
 3 assigned_room_type… -1.19e+0   7.79e-2 -15.3     9.80e-53
 4 reserved_room_type…  1.22e+0   8.91e-2  13.7     1.34e-42
 5 reserved_room_type… -2.57e+0   1.92e-1 -13.4     1.05e-40
 6 hotel_Resort_Hotel   8.28e-1   6.21e-2  13.3     1.65e-40
 7 adults               6.46e-1   4.97e-2  13.0     1.35e-38
 8 assigned_room_type… -1.77e+0   1.45e-1 -12.2     2.63e-34
 9 meal_SC              1.28e+0   1.26e-1  10.2     2.78e-24
10 reserved_room_type… -2.25e+0   2.27e-1  -9.92    3.45e-23
11 booking_changes     -2.29e-1   2.54e-2  -9.03    1.66e-19
12 reserved_room_type… -1.53e+0   1.79e-1  -8.53    1.42e-17
13 reserved_room_type… -1.55e+0   1.95e-1  -7.96    1.70e-15
14 reserved_room_type… -3.38e+0   4.44e-1  -7.62    2.63e-14
15 assigned_room_type… -2.13e+0   3.01e-1  -7.09    1.30e-12
16 assigned_room_type… -1.01e+0   1.43e-1  -7.09    1.30e-12
17 assigned_room_type… -1.06e+0   1.75e-1  -6.08    1.24e- 9
18 assigned_room_type… -1.20e+0   2.16e-1  -5.54    3.06e- 8
19 lead_time           -1.15e-3   3.07e-4  -3.76    1.73e- 4
20 previous_bookings_…  3.50e-1   1.07e-1   3.26    1.13e- 3
21 customer_type_Tran…  4.68e-1   1.53e-1   3.06    2.25e- 3
22 assigned_room_type… -1.29e+0   4.25e-1  -3.03    2.42e- 3
23 meal_FB             -8.29e-1   2.86e-1  -2.90    3.77e- 3
24 arrival_date_dow_W…  3.24e-1   1.19e-1   2.73    6.33e- 3
25 arrival_date_dow_F…  2.62e-1   9.78e-2   2.68    7.42e- 3
26 customer_type_Tran… -3.56e-1   1.34e-1  -2.65    8.13e- 3
27 assigned_room_type… -4.17e-1   1.74e-1  -2.40    1.65e- 2
28 arrival_date_dow_T…  2.87e-1   1.22e-1   2.36    1.84e- 2
29 reserved_room_type…  3.56e-1   1.52e-1   2.35    1.89e- 2
30 arrival_date_dow_T…  2.28e-1   1.08e-1   2.12    3.39e- 2
31 stays_in_week_nigh… -3.80e-2   1.81e-2  -2.10    3.57e- 2
32 is_repeated_guest    4.29e-1   2.14e-1   2.01    4.47e- 2
33 distribution_chann… -6.98e-1   3.72e-1  -1.88    6.06e- 2
34 required_car_parki… -1.18e-1   7.04e-2  -1.67    9.50e- 2
35 arrival_date_dow_S… -1.43e-1   8.61e-2  -1.66    9.60e- 2
36 stays_in_weekend_n…  5.49e-2   3.88e-2   1.42    1.57e- 1
37 days_in_waiting_li…  7.38e-3   5.34e-3   1.38    1.67e- 1
38 meal_HB             -8.93e-2   7.37e-2  -1.21    2.25e- 1
39 customer_type_Group  4.37e-1   4.18e-1   1.04    2.96e- 1
40 meal_Undefined      -2.91e-1   3.13e-1  -0.929   3.53e- 1
41 arrival_date_dow_M…  6.85e-2   9.27e-2   0.739   4.60e- 1
42 distribution_chann… -2.03e-1   3.29e-1  -0.616   5.38e- 1
43 previous_cancellat…  2.46e-1   6.27e-1   0.392   6.95e- 1
44 assigned_room_type… -3.17e-2   4.64e-1  -0.0683  9.46e- 1
45 (Intercept)          1.71e+1   2.55e+2   0.0671  9.46e- 1
46 distribution_chann…  1.37e+1   2.51e+2   0.0546  9.56e- 1
47 market_segment_Onl… -1.35e+1   2.55e+2  -0.0529  9.58e- 1
48 market_segment_Off… -1.34e+1   2.55e+2  -0.0524  9.58e- 1
49 market_segment_Com… -1.33e+1   2.55e+2  -0.0523  9.58e- 1
50 market_segment_Dir… -1.30e+1   2.55e+2  -0.0510  9.59e- 1
51 market_segment_Cor… -1.22e+1   2.55e+2  -0.0479  9.62e- 1
52 market_segment_Gro… -1.20e+1   2.55e+2  -0.0469  9.63e- 1
53 deposit_type_Refun…  1.10e+1   2.78e+2   0.0396  9.68e- 1
54 deposit_type_Non_R… -2.56e-2   1.26e+0  -0.0204  9.84e- 1
55 distribution_chann… -2.00e+1   2.40e+3  -0.00833 9.93e- 1
56 reserved_room_type…  1.27e+1   2.40e+3   0.00529 9.96e- 1

Logistic regression predictions

( lr_train_perf = lr_fit |>
    augment(new_data = hotel_train) |>
    select(children, starts_with(".pred")) )

# A tibble: 37,500 × 4
   children .pred_class .pred_children .pred_none
   <fct>    <fct>                <dbl>      <dbl>
 1 none     none               0.0709      0.929 
 2 none     none               0.0332      0.967 
 3 none     none               0.00982     0.990 
 4 children children           0.920       0.0798
 5 children none               0.441       0.559 
 6 children none               0.115       0.885 
 7 none     none               0.0656      0.934 
 8 none     none               0.0931      0.907 
 9 none     none               0.0220      0.978 
10 none     none               0.0539      0.946 
# ℹ 37,490 more rows

( lr_test_perf = lr_fit |>
    augment(new_data = hotel_test) |>
    select(children, starts_with(".pred")) )

# A tibble: 12,500 × 4
   children .pred_class .pred_children .pred_none
   <fct>    <fct>                <dbl>      <dbl>
 1 none     none              0.0194        0.981
 2 none     none              0.0221        0.978
 3 none     children          0.723         0.277
 4 none     none              0.0721        0.928
 5 none     none              0.000687      0.999
 6 none     none              0.000665      0.999
 7 none     none              0.111         0.889
 8 none     none              0.0464        0.954
 9 none     none              0.0652        0.935
10 none     none              0.103         0.897
# ℹ 12,490 more rows

Performance metrics (within-sample)

conf_mat(lr_train_perf, children, .pred_class)

          Truth
Prediction children  none
  children     1059   418
  none         1968 34055

accuracy(lr_train_perf, children, .pred_class)

# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.936

precision(lr_train_perf, children, .pred_class)

# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.717

yardstick::roc_curve(lr_train_perf, children, .pred_children) |>
  autoplot()

roc_auc(lr_train_perf, children, .pred_children)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.874

Performance metrics (out-of-sample)

conf_mat(lr_test_perf, children, .pred_class)

          Truth
Prediction children  none
  children      341   134
  none          670 11355

accuracy(lr_test_perf, children, .pred_class)

# A tibble: 1 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.936

precision(lr_test_perf, children, .pred_class)

# A tibble: 1 × 3
  .metric   .estimator .estimate
  <chr>     <chr>          <dbl>
1 precision binary         0.718

yardstick::roc_curve(lr_test_perf, children, .pred_children) |>
  autoplot()

roc_auc(lr_test_perf, children, .pred_children)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.861

Combining ROC curves

bind_rows(
  lr_train_perf |> mutate(name = "logistic - train"),
  lr_test_perf |> mutate(name = "logistic - test")
) |>
  group_by(name) |>
  yardstick::roc_curve(children, .pred_children) |>
  autoplot()

Metric sets

(hotel_metrics = metric_set(accuracy, roc_auc, sensitivity, specificity, brier_class))

A metric set, consisting of:
- `accuracy()`, a class metric          | direction: maximize
- `roc_auc()`, a probability metric     | direction: maximize
- `sensitivity()`, a class metric       | direction: maximize
- `specificity()`, a class metric       | direction: maximize
- `brier_class()`, a probability metric | direction: minimize

hotel_metrics(lr_train_perf, truth = children, estimate = .pred_class, .pred_children)

# A tibble: 5 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary        0.936 
2 sensitivity binary        0.350 
3 specificity binary        0.988 
4 roc_auc     binary        0.874 
5 brier_class binary        0.0519

Lasso

Lasso Model

For this we will be using the glmnet package which supports fitting lasso, ridge and elastic net models.

\[ \min _{\beta_0, \beta} \frac{1}{N} \sum_{i=1}^N w_i l\left(y_i, \beta_0+\beta^T x_i\right)+\lambda\left[(1-\alpha)\|\beta\|{ }_2^2 / 2+\alpha\|\beta\|_1\right], \]

lasso_model = logistic_reg(penalty = tune(), mixture = 1) |>
  set_engine("glmnet")

mixture (\(\alpha\)) determines the type of model fit
- 1 for Lasso,
- 0 for Ridge,
- other for elastic net.
penalty (\(\lambda\)) is the penalty term for coefficient size.

lasso_model |> 
  hardhat::extract_parameter_set_dials()

Collection of 1 parameters for tuning

 identifier    type    object
    penalty penalty nparam[+]

lasso_model |>
  translate()

Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = tune()
  mixture = 1

Computational engine: glmnet 

Model fit template:
glmnet::glmnet(x = missing_arg(), y = missing_arg(), weights = missing_arg(), 
    alpha = 1, family = "binomial")

Lasso Recipe

Lasso (and Ridge) models are sensitive to the scale of the model features. As such it is necessary to normalize all features before fitting the model.

lasso_recipe = lr_recipe |>
  step_normalize(all_predictors())

lasso_recipe |>
  prep() |>
  bake(new_data = hotel_train)

# A tibble: 37,500 × 56
   lead_time stays_in_weekend_nights stays_in_week_nights adults is_repeated_guest
       <dbl>                   <dbl>                <dbl>  <dbl>             <dbl>
 1    -0.858                 -0.938                -0.767  0.337            -0.213
 2     0.160                  1.09                  1.32   0.337            -0.213
 3    -0.146                  1.09                 -0.245  0.337            -0.213
 4    -0.365                 -0.938                -0.245  0.337            -0.213
 5    -0.267                 -0.938                 0.278 -3.59             -0.213
 6    -0.814                  1.09                 -0.245  0.337            -0.213
 7     0.544                  0.0735               -0.245  0.337            -0.213
 8    -0.584                 -0.938                -0.767 -1.63             -0.213
 9    -0.376                 -0.938                -0.245  0.337            -0.213
10     3.75                   0.0735               -0.767  0.337            -0.213
# ℹ 37,490 more rows
# ℹ 51 more variables: previous_cancellations <dbl>, previous_bookings_not_canceled <dbl>,
#   booking_changes <dbl>, days_in_waiting_list <dbl>, average_daily_rate <dbl>,
#   total_of_special_requests <dbl>, children <fct>, hotel_Resort_Hotel <dbl>, meal_FB <dbl>,
#   meal_HB <dbl>, meal_SC <dbl>, meal_Undefined <dbl>, market_segment_Complementary <dbl>,
#   market_segment_Corporate <dbl>, market_segment_Direct <dbl>, market_segment_Groups <dbl>,
#   market_segment_Offline_TA.TO <dbl>, market_segment_Online_TA <dbl>, …

Lasso workflow

( lasso_work = workflow() |>
    add_model(lasso_model) |>
    add_recipe(lasso_recipe)
)

══ Workflow ═════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ─────────────────────────────────────────────────────────────────────
8 Recipe Steps

• step_date()
• step_mutate()
• step_rm()
• step_rm()
• step_unknown()
• step_dummy()
• step_zv()
• step_normalize()

── Model ────────────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Main Arguments:
  penalty = tune()
  mixture = 1

Computational engine: glmnet

v-folds for hyperparameter tuning

( hotel_vf = rsample::vfold_cv(hotel_train, v=5, strata = children) )

#  5-fold cross-validation using stratification 
# A tibble: 5 × 2
  splits               id   
  <list>               <chr>
1 <split [30000/7500]> Fold1
2 <split [30000/7500]> Fold2
3 <split [30000/7500]> Fold3
4 <split [30000/7500]> Fold4
5 <split [30000/7500]> Fold5

grid search

( lasso_grid = lasso_work |>
    tune_grid(
      hotel_vf,
      grid = tibble(
        penalty = 10^seq(-4, -1, length.out = 10)
      ),
      control = control_grid(save_pred = TRUE),
      metrics = metric_set(roc_auc)
    )
)

# Tuning results
# 5-fold cross-validation using stratification 
# A tibble: 5 × 5
  splits               id    .metrics .notes   .predictions
  <list>               <chr> <list>   <list>   <list>      
1 <split [30000/7500]> Fold1 <tibble> <tibble> <tibble>    
2 <split [30000/7500]> Fold2 <tibble> <tibble> <tibble>    
3 <split [30000/7500]> Fold3 <tibble> <tibble> <tibble>    
4 <split [30000/7500]> Fold4 <tibble> <tibble> <tibble>    
5 <split [30000/7500]> Fold5 <tibble> <tibble> <tibble>

Results

lasso_grid |>
  collect_metrics()

# A tibble: 10 × 7
    penalty .metric .estimator  mean     n std_err
      <dbl> <chr>   <chr>      <dbl> <int>   <dbl>
 1 0.0001   roc_auc binary     0.871     5 0.00412
 2 0.000215 roc_auc binary     0.871     5 0.00419
 3 0.000464 roc_auc binary     0.871     5 0.00419
 4 0.001    roc_auc binary     0.871     5 0.00414
 5 0.00215  roc_auc binary     0.871     5 0.00399
 6 0.00464  roc_auc binary     0.865     5 0.00361
 7 0.01     roc_auc binary     0.853     5 0.00459
 8 0.0215   roc_auc binary     0.825     5 0.00684
 9 0.0464   roc_auc binary     0.798     5 0.00670
10 0.1      roc_auc binary     0.5       5 0      
# ℹ 1 more variable: .config <chr>

lasso_grid |> 
  autoplot()

The “Best” models

lasso_grid |>
  show_best(metric = "roc_auc", n=5)

# A tibble: 5 × 7
   penalty .metric .estimator  mean     n std_err .config   
     <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>     
1 0.001    roc_auc binary     0.871     5 0.00414 pre0_mod0…
2 0.00215  roc_auc binary     0.871     5 0.00399 pre0_mod0…
3 0.000464 roc_auc binary     0.871     5 0.00419 pre0_mod0…
4 0.000215 roc_auc binary     0.871     5 0.00419 pre0_mod0…
5 0.0001   roc_auc binary     0.871     5 0.00412 pre0_mod0…

( lasso_best = lasso_grid |>
    select_best() )

# A tibble: 1 × 2
  penalty .config         
    <dbl> <chr>           
1   0.001 pre0_mod04_post0

Extracting predictions

Since we used control_grid(save_pred = TRUE) in tune_grid() we can recover the predictions for the out-of-sample values for each fold

( lasso_train_perf = lasso_grid |>
    collect_predictions(parameters = lasso_best) )

# A tibble: 37,500 × 7
   .pred_children .pred_none id    children  .row penalty
            <dbl>      <dbl> <chr> <fct>    <int>   <dbl>
 1        0.0338       0.966 Fold1 none         2   0.001
 2        0.0116       0.988 Fold1 none         3   0.001
 3        0.0103       0.990 Fold1 none        17   0.001
 4        0.0827       0.917 Fold1 children    22   0.001
 5        0.839        0.161 Fold1 children    27   0.001
 6        0.0357       0.964 Fold1 none        28   0.001
 7        0.0103       0.990 Fold1 none        30   0.001
 8        0.0432       0.957 Fold1 none        38   0.001
 9        0.0204       0.980 Fold1 none        44   0.001
10        0.00831      0.992 Fold1 none        49   0.001
# ℹ 37,490 more rows
# ℹ 1 more variable: .config <chr>

lasso_train_perf |>
  roc_curve(children, .pred_children) |>
  autoplot()

lasso_train_perf |>
  roc_auc(children, .pred_children)

# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.871

Re-fitting

Typically with a tuned model we update the workflow (or model) with the optimal parameter values and then refit using the complete training data,

lasso_work_tuned = finalize_workflow(
  lasso_work,
  lasso_best
)

( lasso_fit = lasso_work_tuned |>
    fit(data=hotel_train) )

══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────
8 Recipe Steps

• step_date()
• step_mutate()
• step_rm()
• step_rm()
• step_unknown()
• step_dummy()
• step_zv()
• step_normalize()

── Model ───────────────────────────────────────────────────

Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial",      alpha = ~1) 

   Df  %Dev   Lambda
1   0  0.00 0.080750
2   1  2.56 0.073580
3   2  5.06 0.067040
4   3  7.45 0.061090
5   3  9.79 0.055660
6   4 12.07 0.050720
7   4 13.94 0.046210
8   5 15.37 0.042110
9   5 16.91 0.038360
10  5 18.09 0.034960
11  5 19.03 0.031850
12  6 19.94 0.029020
13  6 20.86 0.026440
14  6 21.61 0.024090
15  6 22.25 0.021950
16  7 22.95 0.020000
17  7 23.60 0.018230
18  8 24.16 0.016610
19  9 24.71 0.015130
20 10 25.21 0.013790
21 10 25.63 0.012560
22 12 26.00 0.011450
23 13 26.38 0.010430
24 13 26.71 0.009503
25 13 26.99 0.008659
26 15 27.25 0.007890
27 16 27.58 0.007189
28 18 27.93 0.006550
29 19 28.33 0.005968
30 20 28.68 0.005438
31 21 28.99 0.004955
32 23 29.28 0.004515
33 23 29.63 0.004114
34 25 29.95 0.003748
35 26 30.26 0.003415
36 27 30.52 0.003112
37 28 30.75 0.002835
38 31 30.95 0.002584
39 31 31.13 0.002354
40 32 31.29 0.002145
41 33 31.42 0.001954
42 37 31.54 0.001781
43 37 31.64 0.001623
44 38 31.73 0.001478
45 38 31.81 0.001347
46 39 31.87 0.001227

...
and 33 more lines.

Test Performance (out-of-sample)

lasso_test_perf = lasso_fit |>
  augment(new_data = hotel_test) |>
  select(children, starts_with(".pred"))

conf_mat(lasso_test_perf, children, .pred_class)

          Truth
Prediction children  none
  children      331   120
  none          680 11369

hotel_metrics(lasso_test_perf, truth = children, estimate = .pred_class, .pred_children)

# A tibble: 5 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary        0.936 
2 sensitivity binary        0.327 
3 specificity binary        0.990 
4 roc_auc     binary        0.863 
5 brier_class binary        0.0523

yardstick::roc_curve(lasso_test_perf, children, .pred_children) |>
  autoplot()

Comparing models

bind_rows(
  lr_test_perf |> mutate(name = "logistic - test"),
  lasso_test_perf |> mutate(name = "lasso - test")
) |>
  group_by(name) |>
  yardstick::roc_curve(children, .pred_children) |>
  autoplot()

Decision tree

Decision tree models

show_engines("decision_tree")

# A tibble: 5 × 2
  engine mode          
  <chr>  <chr>         
1 rpart  classification
2 rpart  regression    
3 C5.0   classification
4 spark  classification
5 spark  regression

dt_model = decision_tree(
  tree_depth = tune(), 
  min_n = tune(),
  cost_complexity = tune()
) |> 
  set_engine("rpart") |> 
  set_mode("classification")

Recipe & workflow

Same recipe as before but we skip dummy coding as it is not needed by rpart,

dt_recipe = recipe(children ~ ., data = hotel_train) |> 
  step_date(arrival_date, features = c("dow", "month"), label=TRUE) |> 
  step_mutate(
    season = case_when(
      arrival_date_month %in% c(12, 1, 2) ~ "Winter",
      arrival_date_month %in% c(3, 4, 5) ~ "Spring",
      arrival_date_month %in% c(6, 7, 8) ~ "Summer",
      arrival_date_month %in% c(9, 10, 11) ~ "Fall"
    ) |>
      factor(levels = c("Winter", "Spring", "Summer", "Fall"))
  ) |>
  step_rm(arrival_date, arrival_date_month) |> 
  step_rm(country)

dt_work = workflow() |> 
  add_model(dt_model) |> 
  add_recipe(dt_recipe)

Tuning

( dt_grid = grid_regular(
    cost_complexity(), 
    tree_depth(), 
    min_n(), 
    levels = 3
) )

# A tibble: 27 × 3
   cost_complexity tree_depth min_n
             <dbl>      <int> <int>
 1    0.0000000001          1     2
 2    0.00000316            1     2
 3    0.1                   1     2
 4    0.0000000001          8     2
 5    0.00000316            8     2
 6    0.1                   8     2
 7    0.0000000001         15     2
 8    0.00000316           15     2
 9    0.1                  15     2
10    0.0000000001          1    21
# ℹ 17 more rows

dt_tune = dt_work |> 
  tune_grid(
    hotel_vf,
    grid = dt_grid,
    control = control_grid(save_pred = TRUE),
    metrics = hotel_metrics
  )

How many decision tree models were fit by tune_grid()?

Tuning results

dt_tune |>
  collect_metrics() |>
  arrange(desc(mean))

# A tibble: 135 × 9
   cost_complexity tree_depth min_n .metric .estimator  mean
             <dbl>      <int> <int> <chr>   <chr>      <dbl>
 1    0.0000000001          8    40 specif… binary     0.987
 2    0.00000316            8    40 specif… binary     0.987
 3    0.0000000001          8    21 specif… binary     0.987
 4    0.00000316            8    21 specif… binary     0.987
 5    0.0000000001          8     2 specif… binary     0.986
 6    0.00000316            8     2 specif… binary     0.986
 7    0.0000000001         15    40 specif… binary     0.984
 8    0.00000316           15    40 specif… binary     0.984
 9    0.1                   1     2 specif… binary     0.979
10    0.1                   1    21 specif… binary     0.979
# ℹ 125 more rows
# ℹ 3 more variables: n <int>, std_err <dbl>, .config <chr>

“Best” parameters

dt_tune |> 
  show_best(metric = "roc_auc")

# A tibble: 5 × 9
  cost_complexity tree_depth min_n .metric
            <dbl>      <int> <int> <chr>  
1    0.0000000001         15    21 roc_auc
2    0.00000316           15    21 roc_auc
3    0.0000000001         15    40 roc_auc
4    0.00000316           15    40 roc_auc
5    0.0000000001          8     2 roc_auc
# ℹ 5 more variables: .estimator <chr>,
#   mean <dbl>, n <int>, std_err <dbl>,
#   .config <chr>

autoplot(dt_tune)

Re-fitting

(dt_best = dt_tune |>
  select_best(metric = "roc_auc"))

# A tibble: 1 × 4
  cost_complexity tree_depth min_n .config         
            <dbl>      <int> <int> <chr>           
1    0.0000000001         15    21 pre0_mod08_post0

dt_work_tuned = finalize_workflow(
  dt_work,
  dt_best
)

( dt_fit = dt_work_tuned |>
    fit(data=hotel_train))

══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────
4 Recipe Steps

• step_date()
• step_mutate()
• step_rm()
• step_rm()

── Model ───────────────────────────────────────────────────
n= 37500 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

    1) root 37500 3027 none (0.080720000 0.919280000)  
      2) reserved_room_type=C,F,G,H 2147  910 children (0.576152771 0.423847229)  
        4) market_segment=Online_TA 1218  350 children (0.712643678 0.287356322)  
          8) average_daily_rate>=140.715 890  196 children (0.779775281 0.220224719)  
           16) adults< 2.5 769  139 children (0.819245774 0.180754226)  
             32) booking_changes< 0.5 581   77 children (0.867469880 0.132530120)  
               64) hotel=City_Hotel 363   26 children (0.928374656 0.071625344)  
                128) reserved_room_type=F 311   15 children (0.951768489 0.048231511) *
                129) reserved_room_type=G 52   11 children (0.788461538 0.211538462)  
                  258) lead_time>=38 23    0 children (1.000000000 0.000000000) *
                  259) lead_time< 38 29   11 children (0.620689655 0.379310345)  
                    518) arrival_date_dow=Mon,Tue,Wed 9    0 children (1.000000000 0.000000000) *
                    519) arrival_date_dow=Sun,Thu,Fri,Sat 20    9 none (0.450000000 0.550000000) *
               65) hotel=Resort_Hotel 218   51 children (0.766055046 0.233944954)  
                130) assigned_room_type=C,D,G,H,I 183   24 children (0.868852459 0.131147541)  
                  260) average_daily_rate>=253.5 41    0 children (1.000000000 0.000000000) *
                  261) average_daily_rate< 253.5 142   24 children (0.830985915 0.169014085)  
                    522) lead_time>=0.5 135   21 children (0.844444444 0.155555556)  
                     1044) average_daily_rate< 161.93 29    1 children (0.965517241 0.034482759) *
                     1045) average_daily_rate>=161.93 106   20 children (0.811320755 0.188679245)  
                       2090) average_daily_rate>=175.855 81   11 children (0.864197531 0.135802469) *
                       2091) average_daily_rate< 175.855 25    9 children (0.640000000 0.360000000)  
                         4182) stays_in_week_nights< 3.5 14    3 children (0.785714286 0.214285714) *
                         4183) stays_in_week_nights>=3.5 11    5 none (0.454545455 0.545454545) *
                    523) lead_time< 0.5 7    3 children (0.571428571 0.428571429) *
                131) assigned_room_type=B,E,F 35    8 none (0.228571429 0.771428571)  
                  262) average_daily_rate>=172.05 21    7 none (0.333333333 0.666666667)  
                    524) lead_time< 34 9    4 children (0.555555556 0.444444444) *
                    525) lead_time>=34 12    2 none (0.166666667 0.833333333) *
                  263) average_daily_rate< 172.05 14    1 none (0.071428571 0.928571429) *
             33) booking_changes>=0.5 188   62 children (0.670212766 0.329787234)  
               66) lead_time>=159.5 36    2 children (0.944444444 0.055555556) *
               67) lead_time< 159.5 152   60 children (0.605263158 0.394736842)  
                134) reserved_room_type=C,F,H 111   36 children (0.675675676 0.324324324)  
                  268) arrival_date_dow=Mon,Tue,Fri 43    9 children (0.790697674 0.209302326) *
                  269) arrival_date_dow=Sun,Wed,Thu,Sat 68   27 children (0.602941176 0.397058824)  
                    538) assigned_room_type=E,F,G,H 54   18 children (0.666666667 0.333333333)  
                     1076) average_daily_rate< 233.25 38    9 children (0.763157895 0.236842105) *
                     1077) average_daily_rate>=233.25 16    7 none (0.437500000 0.562500000) *
                    539) assigned_room_type=B,C,K 14    5 none (0.357142857 0.642857143) *
                135) reserved_room_type=G 41   17 none (0.414634146 0.585365854)  
                  270) arrival_date_dow=Tue,Wed,Fri,Sat 24   10 children (0.583333333 0.416666667)  
                    540) lead_time< 23.5 8    1 children (0.875000000 0.125000000) *
                    541) lead_time>=23.5 16    7 none (0.437500000 0.562500000) *
                  271) arrival_date_dow=Sun,Mon,Thu 17    3 none (0.176470588 0.823529412) *

...
and 468 more lines.

Model extraction

dt_fit |> 
  hardhat::extract_fit_engine() |> 
  plot()

Test Performance (out-of-sample)

dt_test_perf = dt_fit |>
  augment(new_data = hotel_test) |>
  select(children, starts_with(".pred"))

conf_mat(dt_test_perf, children, .pred_class)

          Truth
Prediction children  none
  children      450   268
  none          561 11221

hotel_metrics(dt_test_perf, truth = children, estimate = .pred_class, .pred_children)

# A tibble: 5 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary        0.934 
2 sensitivity binary        0.445 
3 specificity binary        0.977 
4 roc_auc     binary        0.862 
5 brier_class binary        0.0512

yardstick::roc_curve(dt_test_perf, children, .pred_children) |>
  autoplot()

Comparing models

bind_rows(
  lr_test_perf |> mutate(name = "logistic - test"),
  lasso_test_perf |> mutate(name = "lasso - test"),
  dt_test_perf |> mutate(name = "decision tree - test")
) |>
  group_by(name) |>
  yardstick::roc_curve(children, .pred_children) |>
  autoplot()

Random Forest

Random forest models

show_engines("rand_forest")

# A tibble: 6 × 2
  engine       mode          
  <chr>        <chr>         
1 ranger       classification
2 ranger       regression    
3 randomForest classification
4 randomForest regression    
5 spark        classification
6 spark        regression

rf_model = rand_forest(mtry = tune(), min_n = tune(), trees = 100) |> 
  set_engine("ranger", num.threads = 1) |> 
  set_mode("classification")

Recipe & workflow

We again skip dummy coding in the recipe as it is not needed by ranger,

rf_recipe = dt_recipe

rf_work = workflow() |> 
  add_model(rf_model) |> 
  add_recipe(rf_recipe)

Tuning - automatic grid search

rf_tune = rf_work |> 
  tune_grid(
    hotel_vf,
    grid = 10,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(roc_auc)
  )

rf_tune |> 
  collect_metrics() |>
  arrange(desc(mean))

# A tibble: 10 × 8
    mtry min_n .metric .estimator  mean     n std_err
   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl>
 1     5     2 roc_auc binary     0.912     5 0.00445
 2     3    31 roc_auc binary     0.911     5 0.00446
 3     8    18 roc_auc binary     0.909     5 0.00431
 4    10    35 roc_auc binary     0.907     5 0.00416
 5    15    23 roc_auc binary     0.903     5 0.00394
 6    12     6 roc_auc binary     0.903     5 0.00414
 7    17    40 roc_auc binary     0.903     5 0.00402
 8    19    10 roc_auc binary     0.900     5 0.00345
 9    22    27 roc_auc binary     0.900     5 0.00398
10     1    14 roc_auc binary     0.867     5 0.00384
# ℹ 1 more variable: .config <chr>

“Best” parameters

rf_tune |> 
  show_best(metric = "roc_auc")

# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n
  <int> <int> <chr>   <chr>      <dbl> <int>
1     5     2 roc_auc binary     0.912     5
2     3    31 roc_auc binary     0.911     5
3     8    18 roc_auc binary     0.909     5
4    10    35 roc_auc binary     0.907     5
5    15    23 roc_auc binary     0.903     5
# ℹ 2 more variables: std_err <dbl>,
#   .config <chr>

autoplot(rf_tune)

Re-fitting

rf_best = rf_tune |>
  select_best(metric = "roc_auc")

rf_work_tuned = finalize_workflow(
  rf_work, 
  rf_best
)

( rf_fit = rf_work_tuned |>
    fit(data=hotel_train) )

══ Workflow [trained] ══════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────
4 Recipe Steps

• step_date()
• step_mutate()
• step_rm()
• step_rm()

── Model ───────────────────────────────────────────────────
Ranger result

Call:
 ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L,      x), num.trees = ~100, min.node.size = min_rows(~2L, x), num.threads = ~1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 

Type:                             Probability estimation 
Number of trees:                  100 
Sample size:                      37500 
Number of independent variables:  22 
Mtry:                             5 
Target node size:                 2 
Variable importance mode:         none 
Splitrule:                        gini 
OOB prediction error (Brier s.):  0.04414235

Test Performance (out-of-sample)

rf_test_perf = rf_fit |>
  augment(new_data = hotel_test) |>
  select(children, starts_with(".pred"))

conf_mat(rf_test_perf, children, .pred_class)

          Truth
Prediction children  none
  children      420    88
  none          591 11401

hotel_metrics(rf_test_perf, truth = children, estimate = .pred_class, .pred_children)

# A tibble: 5 × 3
  .metric     .estimator .estimate
  <chr>       <chr>          <dbl>
1 accuracy    binary        0.946 
2 sensitivity binary        0.415 
3 specificity binary        0.992 
4 roc_auc     binary        0.917 
5 brier_class binary        0.0435

yardstick::roc_curve(rf_test_perf, children, .pred_children) |>
  autoplot()

Comparing models

bind_rows(
  lr_test_perf |> mutate(name = "logistic - test"),
  lasso_test_perf |> mutate(name = "lasso - test"),
  dt_test_perf |> mutate(name = "decision tree - test"),
  rf_test_perf |> mutate(name = "random forest - test")
) |>
  group_by(name) |>
  yardstick::roc_curve(children, .pred_children) |>
  autoplot()