Regresión lineal simple

Transformaciones y predictoras categóricas

Author

Edimer David Jaramillo

Published

September 19, 2024

Bibliotecas

Code
library(tidyverse)
library(ggpubr)
library(qqplotr)
library(janitor)

Datos

Code
datos <- read_csv("datos/Discovery Farms Dataset.csv") |> 
  select(Crop, Soil_Condition, Soil_pH_6, Precip_Frozen, `Precip_Non-frozen`, Load_DP) |> 
  clean_names() |> 
  filter(load_dp > 0)

datos |> head()

Validando normalidad

Code
datos$load_dp |> 
  ggqqplot()

Code
datos$load_dp |> 
  log() |> 
  ggqqplot()

Diagrama de dispersión

Code
datos |> 
  ggplot(aes(x = soil_p_h_6, y = load_dp)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

Code
datos |> 
  ggplot(aes(x = soil_p_h_6, y = load_dp)) +
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  geom_smooth(method = "lm", se = FALSE)

Boxplot individual

Code
datos |> 
  ggplot(aes(x = crop, y = load_dp)) +
  geom_boxplot()

Code
datos |> 
  ggplot(aes(x = soil_condition, y = load_dp)) +
  geom_boxplot()

Code
datos |> 
  ggplot(aes(x = crop, y = load_dp)) +
  geom_boxplot() +
  scale_y_log10()

Code
datos |> 
  ggplot(aes(x = soil_condition, y = load_dp)) +
  geom_boxplot() +
  scale_y_log10()

Boxplot compuesto

Code
datos |> 
  ggplot(aes(x = crop, y = load_dp)) +
  facet_wrap(~soil_condition) +
  geom_boxplot()

Code
datos |> 
  ggplot(aes(x = crop, y = load_dp)) +
  facet_wrap(~soil_condition) +
  geom_boxplot() +
  scale_y_log10()

Resumen numérico

Code
datos |> 
  group_by(crop) |> 
  reframe(promedio = mean(load_dp),
          minimo = min(load_dp),
          maximo = max(load_dp),
          desviacion = sd(load_dp),
          coef_var = (desviacion / promedio) * 100)
Code
datos |>
  mutate(log_load_dp = log(load_dp)) |>
  group_by(crop) |>
  reframe(promedio = mean(log_load_dp, na.rm = TRUE),
          mediana =  median(log_load_dp, na.rm = TRUE),
          desviacion = sd(log_load_dp)) |>
  mutate(
    promedio = exp(promedio),
    desviacion = exp(desviacion),
    mediana = exp(mediana),
    coef_var = (desviacion / promedio) * 100
  )

Modelo con variable categórica

Code
modelo_ori <- lm(load_dp ~ crop, data = datos)
modelo_ori |> summary()

Call:
lm(formula = load_dp ~ crop, data = datos)

Residuals:
   Min     1Q Median     3Q    Max 
-839.2 -346.6 -218.3   26.4 5288.2 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)   227.13     156.94   1.447  0.14918   
cropcorn      189.67     174.26   1.088  0.27754   
cropforage     82.87     206.25   0.402  0.68820   
croppasture   614.04     201.51   3.047  0.00258 **
cropsoy       221.44     199.46   1.110  0.26807   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 768.8 on 231 degrees of freedom
Multiple R-squared:  0.05279,   Adjusted R-squared:  0.03639 
F-statistic: 3.218 on 4 and 231 DF,  p-value: 0.01351
Code
modelo_log <- lm(log(load_dp) ~ crop, data = datos)
modelo_log |> summary()

Call:
lm(formula = log(load_dp) ~ crop, data = datos)

Residuals:
   Min     1Q Median     3Q    Max 
-5.115 -0.863  0.137  1.151  3.865 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.84728    0.35338  13.717   <2e-16 ***
cropcorn    -0.06289    0.39240  -0.160   0.8728    
cropforage  -0.23586    0.46444  -0.508   0.6121    
croppasture  0.96111    0.45375   2.118   0.0352 *  
cropsoy      0.23965    0.44914   0.534   0.5942    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.731 on 231 degrees of freedom
Multiple R-squared:  0.0477,    Adjusted R-squared:  0.03121 
F-statistic: 2.893 on 4 and 231 DF,  p-value: 0.023

Matriz de diseño

Code
matriz_dis <-
  model.matrix(load_dp ~ crop, data = datos)

matriz_dis |> head()
  (Intercept) cropcorn cropforage croppasture cropsoy
1           1        1          0           0       0
2           1        0          0           0       0
3           1        0          0           0       1
4           1        1          0           0       0
5           1        1          0           0       0
6           1        0          0           0       1

Ecuación final

  • Ejemplo de predicció en maíz:

\[\hat{y} = 227.13 + (\beta1 \times corn_i) + (\beta2 \times forage_i) + (\beta3 \times pasture_i) + (\beta4 \times soy_i) + \epsilon_i\]

  • Predicción con R:
Code
ejemplo_prediccion <- datos |> 
  slice(73)

predict(object = modelo_ori, newdata = ejemplo_prediccion)
  1 
310 
  • Predicción manual:

\[\hat{y} = 227.13 + (189.67 \times 0) + (82.87 \times 1) + (614.04 \times 0) + (221.44 \times 0)\]

Code
227.13 + (82.87 * 1)
[1] 310

Sobreajuste de modelos

Otra forma de ajustar la regresión

rsample: remuestreo

Code
library(rsample)

# Declarar la partición inicial
set.seed(2024)
particion <- initial_split(data = datos, prop = 0.70, strata = "crop")

# Extrear los conjuntos de train-test
datos_train <- training(particion)
datos_test <- testing(particion)

recipes: preprocesado

  • En este caso usaremos la imputación a través de la mediana.
Code
library(recipes)
receta <- recipe(load_dp ~ soil_p_h_6, data = datos_train) |> 
  step_impute_median(soil_p_h_6)

# Verificamos que no haya datos con NA en train
receta |> 
  prep() |> 
  bake(new_data = datos_train)
  • Ahora podemos aplicar nuestra receta a nuevos datos (datos_test):
Code
receta |> 
  prep() |> 
  bake(new_data = datos_test)

parsnip: modelo

Code
library(parsnip)
modelo <- linear_reg(engine = "lm")
  • Ahora creamos el flujo completo de la modelación:
Code
library(workflows)
flujo_modelo <- workflow() |> 
  add_recipe(receta) |> 
  add_model(modelo)

flujo_modelo
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step

• step_impute_median()

── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: lm 
  • Ajustar el modelo:
Code
ajuste_modelo <- flujo_modelo |> 
  fit(datos_train)

ajuste_modelo
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step

• step_impute_median()

── Model ───────────────────────────────────────────────────────────────────────

Call:
stats::lm(formula = ..y ~ ., data = data)

Coefficients:
(Intercept)   soil_p_h_6  
    -1532.4        285.9  
  • Ahora podemos realizar predicciones con el modelo sobre los nuevos datos (datos_test):
Code
predicciones_test <- predict(object = ajuste_modelo, new_data = datos_test)
predicciones_test

yardstick: evaluación

  • Train:
Code
library(yardstick)
predicciones_train <- predict(object = ajuste_modelo, new_data = datos_train)

rmse_vec(truth = datos_train$load_dp, 
         estimate = predicciones_train$.pred)
[1] 722.0435
  • Test:
Code
rmse_vec(truth = datos_test$load_dp, 
         estimate = predicciones_test$.pred)
[1] 859.2727