Load packages
# Set all of this to get the EXACT SAME results on all platforms
set.seed(2022, "Mersenne-Twister", "Inversion", "Rejection")
library(tidyverse)
library(tidymodels)
library(vip)
library(readr)
library(palmerpenguins)
<- penguins %>% na.omit() penguins
Hands-on Activity
Our goal is to build a model predicting body_mass_g
.
Create an initial split, stratified by the label.
Create a recipe:
- Create dummy variables for categorical features
- Normalize all numeric features
- Remove features with near-zero variance
- Remove features with large absolute correlations with other features
- Remove features that are a linear combination of other features
Specify a linear regression model
Build a workflow
Fit a model on the training data with 10-fold cross-validation, repeated three times, stratified by the label
Examine performance during cross-validation
Fit a final model on the training data and plot variable importance
Evaluate performance in the final, hold-out test set.
Answer key
Click here to view the answer key to the hands-on activity
# 1
<- initial_split(penguins, prop = 0.8, strata = body_mass_g)
bmg_split <- training(bmg_split)
bmg_train <- testing(bmg_split) bmg_test
# 2
<-
bmg_recipe recipe(bmg_train, formula = body_mass_g ~ .) %>%
step_dummy(all_nominal_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors()) %>%
step_lincomb(all_predictors()) %>%
step_normalize(all_numeric_predictors())
# 3
<- linear_reg() %>%
lin_reg set_engine("lm") %>%
set_mode("regression")
# 4
<-
bmg_wflow workflow() %>%
add_model(lin_reg) %>%
add_recipe(bmg_recipe)
#5
<- vfold_cv(
bmg_folds data = bmg_train,
v = 10,
repeats = 3,
strata = body_mass_g
)
<-
bmg_fitr %>%
bmg_wflow fit_resamples(
resamples = bmg_folds
)
# 6
collect_metrics(bmg_fitr)
# 7
<-
bmg_final %>%
bmg_wflow last_fit(bmg_split)
%>% extract_fit_parsnip() %>% vip() bmg_final
# 8.
collect_metrics(bmg_final)
<-
bmg_pred %>%
bmg_final collect_predictions()
%>%
bmg_pred ggplot(aes(x = body_mass_g, y = .pred)) +
geom_point(alpha = .15) +
geom_abline(color = 'darkred') +
coord_obs_pred() +
labs(x = "Observed",
y = "Predicted")