Introduction
In this second part of the “journey through tyecon
” series, I want to
showcase the package’s facilities in simplifying tasks common to the predictive
modelling part of data analysis. I will be using the Food.com dataset as
before.
Setup and Data Import
library(tidyverse)
library(tidyselect)
library(magrittr)
library(vroom)
library(tyecon)
library(rsample)
library(yardstick)
library(glmnet)
library(earth)
library(pls)
knitr::opts_chunk$set(fig.path = "")
knitr::opts_chunk$set(dev = 'svg')
theme_set(theme_light())
set.seed(123)
recipes <- vroom("~/Workspace/foodrecipes/RAW_recipes.csv")
interacts <- vroom("~/Workspace/foodrecipes/RAW_interactions.csv")
Nutrition values, except for calories, are all percent of daily value, a daily quota filled by the amount of the percentage by the consumption of the specific food.
nutrs <- recipes %->% {
ids <- id
select(nutrition)
mutate(nutrition = str_replace_all(nutrition, "[\\[\\]\\']", ""))
separate(nutrition,
into = c(
"calories", "sat_fat", "sugar", "sodium", "protein", "tot_fat", "carbs"
),
sep = ", ", convert = TRUE
)
# Center and scale too
mutate(across(everything(), \(col) scale(col)[, 1]))
set_attr("ids", ids)
}
nutrs
## # A tibble: 231,637 × 7
## calories sat_fat sugar sodium protein tot_fat carbs
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.355 -0.464 -0.0891 -0.228 -0.559 -0.464 -0.141
## 2 -0.253 -0.232 -0.105 -0.0996 -0.217 -0.108 -0.178
## 3 -0.172 -0.181 -0.0654 0.135 0.0738 -0.189 -0.129
## 4 -0.0890 -0.245 -0.0929 -0.213 -0.354 -0.383 0.0543
## 5 -0.102 -0.451 0.316 -0.0542 -0.542 -0.464 0.152
## 6 -0.264 -0.335 -0.0366 -0.206 -0.439 -0.260 -0.105
## 7 -0.0784 0.217 -0.0966 -0.0466 -0.491 -0.220 -0.117
## 8 0.534 0.603 0.367 1.86 1.05 0.411 0.250
## 9 3.19 2.80 1.53 0.613 1.58 3.92 2.50
## 10 1.85 1.59 1.11 0.582 0.467 2.69 1.50
## # … with 231,627 more rows
And splitting into training and validation:
nutrs_train:nutrs_test %<-%
(initial_split(nutrs) %>% {
list(training(.), testing(.))
})
Predicting Protein from PDV
Of course, since this is an index, it should just work well.
Linear model
lm_model <- lm(protein ~ .,
data = nutrs_train) %$>% {
model <- .
summary(.) %$>% {
coefs <- as_tibble(coefficients, rownames = "term")
stats <-
tibble(r.squared, sigma, df = fstatistic[2], fstat = fstatistic[1])
}
pred_metrics <- {
pred_tibble <- tibble(
truth = fitted.values(.) + residuals(.), estimate = fitted.values(.)
)
map_dfr(c(rmse, rsq, rsq_trad, msd, mae), ~ .(pred_tibble, truth, estimate))
}
}
lm_model
## $model
##
## Call:
## lm(formula = protein ~ ., data = nutrs_train)
##
## Coefficients:
## (Intercept) calories sat_fat sugar sodium tot_fat carbs
## -0.0002895 7.3769225 -2.7190916 -0.1166868 0.0264789 0.0542644 -5.7393293
##
##
## $coefs
## # A tibble: 7 × 5
## term Estimate `Std. Error` `t value` `Pr(>|t|)`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.000290 0.000907 -0.319 7.50e- 1
## 2 calories 7.38 0.00933 791. 0
## 3 sat_fat -2.72 0.00412 -660. 0
## 4 sugar -0.117 0.00520 -22.5 1.62e-111
## 5 sodium 0.0265 0.000861 30.8 2.90e-207
## 6 tot_fat 0.0543 0.00181 30.0 2.59e-197
## 7 carbs -5.74 0.00987 -582. 0
##
## $stats
## # A tibble: 1 × 4
## r.squared sigma df fstat
## <dbl> <dbl> <dbl> <dbl>
## 1 0.849 0.378 6 163409.
##
## $pred_metrics
## # A tibble: 5 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 3.78e- 1
## 2 rsq standard 8.49e- 1
## 3 rsq_trad standard 8.49e- 1
## 4 msd standard 1.62e-18
## 5 mae standard 1.24e- 1
Predictions on the test data:
map_dfr(c(rmse, rsq, rsq_trad, msd, mae), ~ .(
tibble(
truth = nutrs_test$calories, estimate = predict(lm_model$model, nutrs_test)
),
truth, estimate
))
## # A tibble: 5 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.717
## 2 rsq standard 0.467
## 3 rsq_trad standard -0.142
## 4 msd standard -0.00258
## 5 mae standard 0.338
Multiple models together
We first build the general interface function for our loop over models:
lasso <- partial(glmnet, alpha = 1)
ridge <- partial(glmnet, alpha = 0)
# prefix with dot dot to avoid name collision
regression <- convoke(
list(formula, data),
lasso(
x = model.matrix(formula, data)[, -1],
y = data[[all.vars(formula)[1]]]
),
ridge(
x = model.matrix(formula, data)[, -1],
y = data[[all.vars(formula)[1]]]
),
earth(formula, data),
plsr(formula, data = data)
)
regression
## convoke function
## interfaces: lasso(), ridge(), earth(), plsr()
## args: formula, data, interface = lasso, interface.args
And also the predict function:
pred_vec <- function(x, newdata, formula, ...) UseMethod("pred_vec")
pred_vec.elnet <- function(x, newdata, formula, ...) {
predict(x, newx = model.matrix(formula, newdata)[, -1], ...)[, 1]
}
pred_vec.earth <- function(x, newdata, formula, ...) {
predict(x, newdata = newdata, ...)[, 1]
}
pred_vec.mvr <- function(x, newdata, formula, ...) {
retval <- predict(x, newdata = newdata, ...)
retval[, , dim(retval)[[3]]]
}
pred_vec_regs <- conflate(pred_vec(x, newdata, formula))
pred_vec_regs
## conflate function for pred_vec
## args: x, newdata, formula, object.args
And the main loop:
models_preds <- control(
{
holdout <- assessment(fold$splits)
model <- regression(protein ~ .,
analysis(fold$splits),
interface = interface, lasso.lambda = lambda,
earth.degree = degree, plsr.ncomp = 6
)
holdout$.fit <- pred_vec_regs(model, holdout, protein ~ .)
list(model = model, rmse = rmse(holdout, protein, .fit)[[".estimate"]])
},
fold = transpose(vfold_cv(nutrs_train, 5)) ~ 1,
interface = c("lasso", "earth", "plsr") ~ 2,
lambda = c(0.1, 0.2) ~ 3,
degree = c(1, 2) ~ 4,
ncomp = c(5, 6) ~ 5,
.refiner = ~ confine(.,
lambda = interface %in% c("lasso", "ridge"), degree = interface == "earth",
ncomp = interface == "plsr"
),
.unnest_value = TRUE
)
models_preds
## # A tibble: 30 × 7
## fold interface lambda degree ncomp model rmse
## <list> <chr> <dbl> <dbl> <dbl> <list> <dbl>
## 1 <named list [2]> lasso 0.1 NA NA <elnet> 0.870
## 2 <named list [2]> lasso 0.2 NA NA <elnet> 0.911
## 3 <named list [2]> earth NA 1 NA <earth> 0.351
## 4 <named list [2]> earth NA 2 NA <earth> 0.341
## 5 <named list [2]> plsr NA NA 5 <mvr> 0.330
## 6 <named list [2]> plsr NA NA 6 <mvr> 0.330
## 7 <named list [2]> lasso 0.1 NA NA <elnet> 1.02
## 8 <named list [2]> lasso 0.2 NA NA <elnet> 1.09
## 9 <named list [2]> earth NA 1 NA <earth> 2.50
## 10 <named list [2]> earth NA 2 NA <earth> 4.84
## # … with 20 more rows
Now we can choose the appropriate model by testing the data on the validation set:
models_chosen <- group_by(models_preds, across(-c(fold, model, rmse))) %>%
summarise(model = list(first(model)), rmse = mean(rmse), .groups = "drop")
models_chosen
## # A tibble: 6 × 6
## interface lambda degree ncomp model rmse
## <chr> <dbl> <dbl> <dbl> <list> <dbl>
## 1 earth NA 1 NA <earth> 0.827
## 2 earth NA 2 NA <earth> 1.60
## 3 lasso 0.1 NA NA <elnet> 0.868
## 4 lasso 0.2 NA NA <elnet> 0.895
## 5 plsr NA NA 5 <mvr> 0.394
## 6 plsr NA NA 6 <mvr> 0.394
And the validation set:
mutate(rowwise(models_chosen), valid_rmse =
{
valid_set <- nutrs_test
valid_set$.fit <- pred_vec_regs(model, valid_set, protein ~ .)
rmse(valid_set, protein, .fit)[[".estimate"]]
}
)
## # A tibble: 6 × 7
## # Rowwise:
## interface lambda degree ncomp model rmse valid_rmse
## <chr> <dbl> <dbl> <dbl> <list> <dbl> <dbl>
## 1 earth NA 1 NA <earth> 0.827 0.397
## 2 earth NA 2 NA <earth> 1.60 0.335
## 3 lasso 0.1 NA NA <elnet> 0.868 0.903
## 4 lasso 0.2 NA NA <elnet> 0.895 0.944
## 5 plsr NA NA 5 <mvr> 0.394 0.403
## 6 plsr NA NA 6 <mvr> 0.394 0.403