Multiple linear regression

Model fitting and selection

Author

Edimer David Jaramillo

Published

October 3, 2024

Libraries

Code
# Reference packages for exploring interactions and marginal effects:
# - https://interactions.jacob-long.com/
# - https://marginaleffects.com/
# - https://strengejacke.github.io/ggeffects/

library(tidyverse)
library(ggpubr)
library(qqplotr)
library(janitor)

library(corrplot)
library(corrr)
library(DataExplorer)

library(tidymodels)

Data

Code
datos <- read_csv("datos/Discovery Farms Dataset.csv") |>
  select(Crop,
         Soil_Condition,
         Soil_pH_6,
         Precip_Frozen,
         `Precip_Non-frozen`,
         Load_DP) |>
  clean_names() |>
  filter(load_dp > 0)

datos |> head()

Exploration

Correlation matrix

Code
datos |>
  select(where(is.numeric)) |>
  cor(use = "pairwise.complete.obs") |>
  corrplot(
    diag = FALSE,
    type = "lower",
    method = "pie",
    tl.col = "black",
    tl.srt = 45
  )

Code
mtx_correlacion <- datos |> 
  select(where(is.numeric)) |> 
  correlate()

network_plot(mtx_correlacion)

Code
plot_correlation(datos, type = "continuous")

Scatter plots

Code
datos |> 
  select(where(is.numeric)) |> 
  pivot_longer(cols = -load_dp) |> 
  ggplot(aes(x = value, y = load_dp)) +
  facet_wrap(~name, scales = "free_x") +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Code
datos |> 
  select(where(is.numeric)) |> 
  pivot_longer(cols = -load_dp) |> 
  ggplot(aes(x = value, y = load_dp)) +
  facet_wrap(~name, scales = "free_x") +
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm", se = FALSE)

Code
datos |> 
  select(where(is.numeric), soil_condition) |> 
  pivot_longer(cols = -c(load_dp, soil_condition)) |> 
  ggplot(aes(x = value, y = load_dp, color = soil_condition)) +
  facet_wrap(~name, scales = "free_x") +
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm", se = FALSE)

Code
datos |> 
  pivot_longer(cols = -c(load_dp, soil_condition, crop)) |> 
  ggplot(aes(x = value, y = load_dp, color = soil_condition)) +
  facet_wrap(~name~crop, scales = "free", ncol = 5) + 
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm", se = FALSE) +
  theme(legend.position = "top")

Interactions

  • Scatter plot with three numeric variables:
Code
datos |> 
  ggplot(aes(x = precip_frozen, y = soil_p_h_6, color = log(load_dp))) +
  facet_wrap(~soil_condition) +
  geom_point(size = 2) +
  scale_x_log10() +
  scale_color_viridis_c()

  • Checking the interaction between precip_frozen and soil_p_h_6:
Code
datos |> 
  mutate(inter1 = precip_frozen * soil_p_h_6) |> 
  ggplot(aes(x = inter1, y = load_dp)) +
  geom_point() +
  scale_y_log10() +
  geom_smooth(method = "lm")

  • Another scatter plot with three numeric variables:
Code
datos |> 
  ggplot(aes(x = precip_non_frozen, y = soil_p_h_6, color = log(load_dp))) +
  facet_wrap(~soil_condition) +
  geom_point(size = 2) +
  scale_x_log10() +
  scale_color_viridis_c()

  • Checking the interaction between precip_non_frozen and soil_p_h_6:
Code
datos |> 
  mutate(inter2 = precip_non_frozen * soil_p_h_6) |> 
  ggplot(aes(x = inter2, y = load_dp)) +
  geom_point() +
  scale_y_log10() +
  geom_smooth(method = "lm")

  • Another scatter plot with three numeric variables:
Code
datos |> 
  ggplot(aes(x = precip_non_frozen, y = precip_frozen, color = log(load_dp))) +
  facet_wrap(~soil_condition) +
  geom_point(size = 2) +
  scale_x_log10() +
  scale_color_viridis_c()

  • Checking the interaction between precip_non_frozen and precip_frozen (a formal F-test sketch for these interactions appears after the profile plots below):
Code
datos |> 
  mutate(inter3 = precip_non_frozen * precip_frozen) |> 
  ggplot(aes(x = inter3, y = load_dp)) +
  geom_point() +
  scale_y_log10() +
  geom_smooth(method = "lm")

  • Profile plot with the original variables:
Code
datos |> 
  group_by(crop, soil_condition) |> 
  reframe(promedio = mean(load_dp)) |> 
  ggplot(aes(x = crop, y = promedio, color = soil_condition)) +
  geom_point() +
  geom_line(aes(group = soil_condition))

  • Profile plot with the transformed variable (means computed on the log scale and back-transformed):
Code
datos |> 
  mutate(log_load_dp = log(load_dp)) |> 
  group_by(crop, soil_condition) |> 
  reframe(promedio = mean(log_load_dp)) |> 
  mutate(promedio = exp(promedio)) |> 
  ggplot(aes(x = crop, y = promedio, color = soil_condition)) +
  geom_point() +
  geom_line(aes(group = soil_condition))
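
  • The visual checks above can be complemented with a formal test: fit the model with and without the interaction term and compare the two fits with an F test. A minimal sketch for precip_frozen and soil_p_h_6, using the log-transformed response as in the plots (the same idea applies to the other pairs):
Code
# F test: does the precip_frozen:soil_p_h_6 interaction improve the fit?
mod_sin_inter <- lm(log(load_dp) ~ precip_frozen + soil_p_h_6, data = datos)
mod_con_inter <- lm(log(load_dp) ~ precip_frozen * soil_p_h_6, data = datos)
anova(mod_sin_inter, mod_con_inter)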

Model training

rsample: resampling

Code
# Declare the initial train/test split
set.seed(2024)
particion <- initial_split(data = datos, prop = 0.70, strata = "crop")

# Extract the train and test sets
datos_train <- training(particion)
datos_test <- testing(particion)
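
  • To check that the stratified split keeps the crop distribution similar in both sets, we can compare the class proportions (a minimal sketch, reusing the objects created above):
Code
# Compare crop proportions between the train and test sets
bind_rows(
  datos_train |> count(crop) |> mutate(conjunto = "train"),
  datos_test |> count(crop) |> mutate(conjunto = "test")
) |>
  group_by(conjunto) |>
  mutate(prop = n / sum(n)) |>
  ungroup()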

recipes: preprocessing

  • In this case we impute missing values in the numeric predictors with the median.
Code
receta <- recipe(load_dp ~ ., data = datos_train) |> 
  step_impute_median(all_numeric_predictors())
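
  • A quick check of how many missing values the imputation step has to fill (a sketch, not part of the original pipeline):
Code
# Count missing values per numeric column in the training set
datos_train |>
  summarise(across(where(is.numeric), ~ sum(is.na(.x))))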

parsnip: model

Code
modelo <- linear_reg(engine = "lm")
  • Now we build the complete modeling workflow:
Code
flujo_modelo <- workflow() |> 
  add_recipe(receta) |> 
  add_model(modelo)

flujo_modelo
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step

• step_impute_median()

── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: lm 
  • Fit the model:
Code
ajuste_modelo <- flujo_modelo |> 
  fit(datos_train)

ajuste_modelo
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step

• step_impute_median()

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
             (Intercept)                  cropcorn                cropforage  
              -1513.3977                  189.8052                  222.3163  
             croppasture                   cropsoy  soil_conditionnon-frozen  
                405.7085                  208.1419                 -214.8147  
              soil_p_h_6             precip_frozen         precip_non_frozen  
                167.2510                    3.2144                    0.3542  
  • Model summary:
Code
ajuste_modelo$fit$fit$fit |> summary()

Call:
stats::lm(formula = ..y ~ ., data = data)

Residuals:
   Min     1Q Median     3Q    Max 
-925.1 -352.1 -123.0  136.5 4942.7 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)   
(Intercept)              -1513.3977  1121.1508  -1.350  0.17904   
cropcorn                   189.8052   204.3047   0.929  0.35433   
cropforage                 222.3163   237.7420   0.935  0.35119   
croppasture                405.7085   228.7697   1.773  0.07813 . 
cropsoy                    208.1419   237.0716   0.878  0.38133   
soil_conditionnon-frozen  -214.8147   110.6297  -1.942  0.05399 . 
soil_p_h_6                 167.2510   149.1623   1.121  0.26392   
precip_frozen                3.2144     0.9706   3.312  0.00115 **
precip_non_frozen            0.3542     0.3413   1.038  0.30108   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 701.8 on 154 degrees of freedom
Multiple R-squared:  0.135, Adjusted R-squared:  0.09007 
F-statistic: 3.004 on 8 and 154 DF,  p-value: 0.003683
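  • The coefficients and overall fit statistics can also be extracted as tidy data frames with broom, which is often easier to reuse than the printed summary (a minimal sketch):
Code
# Pull the underlying lm object out of the fitted workflow
modelo_lm <- ajuste_modelo |> extract_fit_engine()

tidy(modelo_lm)    # coefficient table
glance(modelo_lm)  # R-squared, adjusted R-squared, sigma, F-statistic, ...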
  • We can now generate predictions with the fitted model on the new data (datos_test):
Code
predicciones_test <- predict(object = ajuste_modelo, new_data = datos_test)
predicciones_test

yardstick: evaluation

  • Train:
Code
predicciones_train <- predict(object = ajuste_modelo, new_data = datos_train)

rmse_vec(truth = datos_train$load_dp, 
         estimate = predicciones_train$.pred)
[1] 682.1324
  • Test:
Code
rmse_vec(truth = datos_test$load_dp, 
         estimate = predicciones_test$.pred)
[1] 799.5653
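
  • Beyond RMSE, several regression metrics can be computed in one pass with a yardstick metric set (a minimal sketch, reusing the test predictions above):
Code
# Evaluate RMSE, MAE, and R-squared on the test set in a single call
metricas <- metric_set(rmse, mae, rsq)

datos_test |>
  bind_cols(predicciones_test) |>
  metricas(truth = load_dp, estimate = .pred)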

Model comparison

  • We apply the recipe to the data so that no missing values remain:
Code
datos_modelos_train <- receta |> 
  prep() |> 
  bake(new_data = datos_train)

datos_modelos_test <- receta |> 
  prep() |> 
  bake(new_data = datos_test)
  • We fit the candidate models:
Code
modelo1 <- lm(load_dp ~ ., data = datos_modelos_train)

modelo2 <- lm(load_dp ~ soil_condition * soil_p_h_6
              + precip_frozen + precip_non_frozen,
              data = datos_modelos_train)

modelo3 <- lm(load_dp ~ soil_condition + soil_p_h_6
              + precip_frozen * precip_non_frozen,
              data = datos_modelos_train)

modelo4 <- lm(load_dp ~ soil_condition * soil_p_h_6
              + precip_frozen * precip_non_frozen,
              data = datos_modelos_train)
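
  • Before comparing predictive performance, the nested pairs can also be compared in-sample with an F test; modelo2 and modelo3 are both nested in modelo4 (a minimal sketch):
Code
# Does each interaction term add explanatory power over the simpler model?
anova(modelo2, modelo4)  # tests precip_frozen:precip_non_frozen
anova(modelo3, modelo4)  # tests soil_condition:soil_p_h_6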
  • RMSE on the training set:
Code
pred_train_mod1 <- predict(object = modelo1, newdata = datos_modelos_train)
pred_train_mod2 <- predict(object = modelo2, newdata = datos_modelos_train)
pred_train_mod3 <- predict(object = modelo3, newdata = datos_modelos_train)
pred_train_mod4 <- predict(object = modelo4, newdata = datos_modelos_train)

rmse_vec(truth = datos_modelos_train$load_dp, estimate = pred_train_mod1)
[1] 682.1324
Code
rmse_vec(truth = datos_modelos_train$load_dp, estimate = pred_train_mod2)
[1] 689.3234
Code
rmse_vec(truth = datos_modelos_train$load_dp, estimate = pred_train_mod3)
[1] 689.221
Code
rmse_vec(truth = datos_modelos_train$load_dp, estimate = pred_train_mod4)
[1] 689.1835
  • RMSE on the test set:
Code
pred_test_mod1 <- predict(object = modelo1, newdata = datos_modelos_test)
pred_test_mod2 <- predict(object = modelo2, newdata = datos_modelos_test)
pred_test_mod3 <- predict(object = modelo3, newdata = datos_modelos_test)
pred_test_mod4 <- predict(object = modelo4, newdata = datos_modelos_test)

rmse_vec(truth = datos_modelos_test$load_dp, estimate = pred_test_mod1)
[1] 799.5653
Code
rmse_vec(truth = datos_modelos_test$load_dp, estimate = pred_test_mod2)
[1] 833.0432
Code
rmse_vec(truth = datos_modelos_test$load_dp, estimate = pred_test_mod3)
[1] 833.0686
Code
rmse_vec(truth = datos_modelos_test$load_dp, estimate = pred_test_mod4)
[1] 832.8929
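
  • To make the comparison easier to read, the train and test RMSE values can be collected into a single table (a minimal sketch, reusing the prediction vectors above):
Code
# Summarize train and test RMSE for the four candidate models
tibble(
  modelo = paste0("modelo", 1:4),
  rmse_train = c(
    rmse_vec(datos_modelos_train$load_dp, pred_train_mod1),
    rmse_vec(datos_modelos_train$load_dp, pred_train_mod2),
    rmse_vec(datos_modelos_train$load_dp, pred_train_mod3),
    rmse_vec(datos_modelos_train$load_dp, pred_train_mod4)
  ),
  rmse_test = c(
    rmse_vec(datos_modelos_test$load_dp, pred_test_mod1),
    rmse_vec(datos_modelos_test$load_dp, pred_test_mod2),
    rmse_vec(datos_modelos_test$load_dp, pred_test_mod3),
    rmse_vec(datos_modelos_test$load_dp, pred_test_mod4)
  )
) |>
  arrange(rmse_test)

  • Based on the values reported above, the additive model (modelo1) has the lowest test RMSE, so the interaction terms do not appear to improve out-of-sample performance on this split.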