Code
library(tidyverse)
library(ggpubr)
library(qqplotr)
library(janitor)
Transformaciones y predictoras categóricas
library(tidyverse)
library(ggpubr)
library(qqplotr)
library(janitor)
<- read_csv("datos/Discovery Farms Dataset.csv") |>
datos select(Crop, Soil_Condition, Soil_pH_6, Precip_Frozen, `Precip_Non-frozen`, Load_DP) |>
clean_names() |>
filter(load_dp > 0)
|> head() datos
$load_dp |>
datosggqqplot()
$load_dp |>
datoslog() |>
ggqqplot()
|>
datos ggplot(aes(x = soil_p_h_6, y = load_dp)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
|>
datos ggplot(aes(x = soil_p_h_6, y = load_dp)) +
geom_point() +
scale_y_log10() +
scale_x_log10() +
geom_smooth(method = "lm", se = FALSE)
|>
datos ggplot(aes(x = crop, y = load_dp)) +
geom_boxplot()
|>
datos ggplot(aes(x = soil_condition, y = load_dp)) +
geom_boxplot()
|>
datos ggplot(aes(x = crop, y = load_dp)) +
geom_boxplot() +
scale_y_log10()
|>
datos ggplot(aes(x = soil_condition, y = load_dp)) +
geom_boxplot() +
scale_y_log10()
|>
datos ggplot(aes(x = crop, y = load_dp)) +
facet_wrap(~soil_condition) +
geom_boxplot()
|>
datos ggplot(aes(x = crop, y = load_dp)) +
facet_wrap(~soil_condition) +
geom_boxplot() +
scale_y_log10()
|>
datos group_by(crop) |>
reframe(promedio = mean(load_dp),
minimo = min(load_dp),
maximo = max(load_dp),
desviacion = sd(load_dp),
coef_var = (desviacion / promedio) * 100)
|>
datos mutate(log_load_dp = log(load_dp)) |>
group_by(crop) |>
reframe(promedio = mean(log_load_dp, na.rm = TRUE),
mediana = median(log_load_dp, na.rm = TRUE),
desviacion = sd(log_load_dp)) |>
mutate(
promedio = exp(promedio),
desviacion = exp(desviacion),
mediana = exp(mediana),
coef_var = (desviacion / promedio) * 100
)
<- lm(load_dp ~ crop, data = datos)
modelo_ori |> summary() modelo_ori
Call:
lm(formula = load_dp ~ crop, data = datos)
Residuals:
Min 1Q Median 3Q Max
-839.2 -346.6 -218.3 26.4 5288.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 227.13 156.94 1.447 0.14918
cropcorn 189.67 174.26 1.088 0.27754
cropforage 82.87 206.25 0.402 0.68820
croppasture 614.04 201.51 3.047 0.00258 **
cropsoy 221.44 199.46 1.110 0.26807
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 768.8 on 231 degrees of freedom
Multiple R-squared: 0.05279, Adjusted R-squared: 0.03639
F-statistic: 3.218 on 4 and 231 DF, p-value: 0.01351
<- lm(log(load_dp) ~ crop, data = datos)
modelo_log |> summary() modelo_log
Call:
lm(formula = log(load_dp) ~ crop, data = datos)
Residuals:
Min 1Q Median 3Q Max
-5.115 -0.863 0.137 1.151 3.865
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.84728 0.35338 13.717 <2e-16 ***
cropcorn -0.06289 0.39240 -0.160 0.8728
cropforage -0.23586 0.46444 -0.508 0.6121
croppasture 0.96111 0.45375 2.118 0.0352 *
cropsoy 0.23965 0.44914 0.534 0.5942
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.731 on 231 degrees of freedom
Multiple R-squared: 0.0477, Adjusted R-squared: 0.03121
F-statistic: 2.893 on 4 and 231 DF, p-value: 0.023
<-
matriz_dis model.matrix(load_dp ~ crop, data = datos)
|> head() matriz_dis
(Intercept) cropcorn cropforage croppasture cropsoy
1 1 1 0 0 0
2 1 0 0 0 0
3 1 0 0 0 1
4 1 1 0 0 0
5 1 1 0 0 0
6 1 0 0 0 1
\[\hat{y} = 227.13 + (\beta1 \times corn_i) + (\beta2 \times forage_i) + (\beta3 \times pasture_i) + (\beta4 \times soy_i) + \epsilon_i\]
<- datos |>
ejemplo_prediccion slice(73)
predict(object = modelo_ori, newdata = ejemplo_prediccion)
1
310
\[\hat{y} = 227.13 + (189.67 \times 0) + (82.87 \times 1) + (614.04 \times 0) + (221.44 \times 0)\]
227.13 + (82.87 * 1)
[1] 310
rsample
: remuestreolibrary(rsample)
# Declarar la partición inicial
set.seed(2024)
<- initial_split(data = datos, prop = 0.70, strata = "crop")
particion
# Extrear los conjuntos de train-test
<- training(particion)
datos_train <- testing(particion) datos_test
recipes
: preprocesadolibrary(recipes)
<- recipe(load_dp ~ soil_p_h_6, data = datos_train) |>
receta step_impute_median(soil_p_h_6)
# Verificamos que no haya datos con NA en train
|>
receta prep() |>
bake(new_data = datos_train)
|>
receta prep() |>
bake(new_data = datos_test)
parsnip
: modelolibrary(parsnip)
<- linear_reg(engine = "lm") modelo
library(workflows)
<- workflow() |>
flujo_modelo add_recipe(receta) |>
add_model(modelo)
flujo_modelo
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step
• step_impute_median()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Computational engine: lm
<- flujo_modelo |>
ajuste_modelo fit(datos_train)
ajuste_modelo
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
1 Recipe Step
• step_impute_median()
── Model ───────────────────────────────────────────────────────────────────────
Call:
stats::lm(formula = ..y ~ ., data = data)
Coefficients:
(Intercept) soil_p_h_6
-1532.4 285.9
<- predict(object = ajuste_modelo, new_data = datos_test)
predicciones_test predicciones_test
yardstick
: evaluaciónlibrary(yardstick)
<- predict(object = ajuste_modelo, new_data = datos_train)
predicciones_train
rmse_vec(truth = datos_train$load_dp,
estimate = predicciones_train$.pred)
[1] 722.0435
rmse_vec(truth = datos_test$load_dp,
estimate = predicciones_test$.pred)
[1] 859.2727