Code
library(tidyverse)
library(ggpubr)
library(qqplotr)
Diseño experimental
library(tidyverse)
library(ggpubr)
library(qqplotr)
<- read_csv("datos/lecheria_eeuu.csv")
datos |> head() datos
ggqqplot(datos$avg_price_milk)
|>
datos select(-year) |>
pivot_longer(cols = everything()) |>
ggplot(aes(sample = value)) +
facet_wrap(~name, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()
\[\rho_{(X,Y)} = \frac{Cov_{(X,Y)}}{\sigma_X\times\sigma_Y} = \frac{\sum_{i=1}^{n}(X_i-\mu_X)(Y_i-\mu_Y)}{\sigma_X\times\sigma_Y}\]
<- cov(x = datos$dairy_ration, y = datos$avg_price_milk)
covarianza <- sd(datos$dairy_ration)
desv_x <- sd(datos$avg_price_milk)
desv_y
/ (desv_x * desv_y) covarianza
[1] 0.8172358
cor(x = datos$dairy_ration,
y = datos$avg_price_milk)
[1] 0.8172358
\[\rho = 1 - \frac{6\sum D^2}{N (N^2 - 1)}\]
cor(x = datos$dairy_ration,
y = datos$avg_price_milk,
method = "spearman")
[1] 0.63335
|>
datos ggplot(aes(x = dairy_ration, y = avg_price_milk)) +
geom_point()
|>
datos ggplot(aes(x = dairy_ration, y = avg_price_milk)) +
geom_point() +
geom_smooth(method = "lm")
|>
datos ggplot(aes(x = dairy_ration, y = avg_price_milk)) +
geom_point() +
geom_smooth()
\[Y_i = \beta_0\ +\ \beta_1X\] \[\hat{Y_i} = \hat{\beta_0}\ + \hat{\beta_1}X_i\ + \hat{\epsilon_i}\]
\[Y_i = E(Y|X_i)\ +\ \epsilon_i\\\]
\[Y_i = E(Y|X_i)\ +\ \epsilon_i\\ Y_i = \beta_0 +\ \beta_1X_i\ +\ \epsilon_i\]
<- lm(avg_price_milk ~ dairy_ration, data = datos)
modelo1 |> summary() modelo1
Call:
lm(formula = avg_price_milk ~ dairy_ration, data = datos)
Residuals:
Min 1Q Median 3Q Max
-0.033635 -0.008394 -0.002785 0.003106 0.055094
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.086176 0.007885 10.930 1.67e-12 ***
dairy_ration 1.038172 0.127443 8.146 2.10e-09 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.01655 on 33 degrees of freedom
Multiple R-squared: 0.6679, Adjusted R-squared: 0.6578
F-statistic: 66.36 on 1 and 33 DF, p-value: 2.102e-09
\[\hat{y} = 0.086176 + (1.038172 \times X)\]
0.086176 + (1.038172 * 0.082)
[1] 0.1713061
<-
datos_test |>
datos sample_n(size = 10) |>
select(dairy_ration)
datos_test
predict(object = modelo1,
newdata = datos_test)
1 2 3 4 5 6 7 8
0.2077383 0.1315572 0.1384699 0.1373187 0.1978416 0.2123142 0.1384508 0.1360335
9 10
0.1849064 0.1445377
\[R^2 = \rho^2\]
<- 0.8172358
correlacion_pearson <- correlacion_pearson ^ 2
r2 r2
[1] 0.6678744
\[R^2_a = 1 - \frac{N-1}{N-k-1} (1 - R^2)\]
<- nrow(datos) # número de datos
N <- 1 # pendiente (parámetro Beta1)
k
1 - ((N - 1) / (N - k - 1)) * (1 - r2)
[1] 0.6578099
\[RMSE = \sqrt{\frac{\sum_{i = 1}^{n} (y - \hat{y})^2}{N}}\]
<- datos$avg_price_milk # variable y
valores_reales <- modelo1$fitted.values
predichos_m1
<- (valores_reales - predichos_m1)^2
diferencia_cuadrado <- sum(diferencia_cuadrado)
suma_cuadrados <- sqrt(suma_cuadrados / N)
metrica_rmse metrica_rmse
[1] 0.01606818
library(yardstick)
rmse_vec(truth = valores_reales, estimate = predichos_m1)
[1] 0.01606818
\[AIC = -2ln(L) + k\times n_{par}\]
<- logLik(modelo1) # log verosimilitud
log_l <- length(coefficients(modelo1)) # número de parámetros
n_par
-2 * log_l + 2 * n_par
'log Lik.' -185.8383 (df=3)
AIC(modelo1)
[1] -183.8383
<- modelo1$residuals
residuales residuales
1 2 3 4 5
-0.0063963730 -0.0004507765 0.0039146871 -0.0045249485 -0.0066020776
6 7 8 9 10
-0.0058987810 -0.0003646907 0.0030543723 -0.0131944037 -0.0027847996
11 12 13 14 15
0.0004026023 -0.0107404425 -0.0005572382 -0.0059314158 -0.0073187107
16 17 18 19 20
-0.0101901072 -0.0010347143 -0.0115376727 0.0197303289 0.0161825420
21 22 23 24 25
-0.0051450271 0.0180051551 -0.0137001647 -0.0110334541 0.0208318798
26 27 28 29 30
0.0162116339 -0.0094699308 0.0342720367 0.0016622358 -0.0336354964
31 32 33 34 35
0.0020442379 0.0031583847 -0.0273141708 -0.0067382644 0.0550935636
- predichos_m1 valores_reales
1 2 3 4 5
-0.0063963730 -0.0004507765 0.0039146871 -0.0045249485 -0.0066020776
6 7 8 9 10
-0.0058987810 -0.0003646907 0.0030543723 -0.0131944037 -0.0027847996
11 12 13 14 15
0.0004026023 -0.0107404425 -0.0005572382 -0.0059314158 -0.0073187107
16 17 18 19 20
-0.0101901072 -0.0010347143 -0.0115376727 0.0197303289 0.0161825420
21 22 23 24 25
-0.0051450271 0.0180051551 -0.0137001647 -0.0110334541 0.0208318798
26 27 28 29 30
0.0162116339 -0.0094699308 0.0342720367 0.0016622358 -0.0336354964
31 32 33 34 35
0.0020442379 0.0031583847 -0.0273141708 -0.0067382644 0.0550935636
# residuales |> hist()
|>
residuales enframe() |>
ggplot(aes(x = value)) +
geom_histogram(bins = 7, color = "black")
ggqqplot(residuales)
par(mfrow = c(2, 2)) # cuatro gráficos
|>
modelo1 plot()
<-
data_residuales data.frame(predichos = predichos_m1, residuales = residuales)
|>
data_residuales ggplot(aes(x = predichos, y = residuales)) +
geom_point() +
geom_hline(yintercept = 0, color = "red", linetype = 2) +
geom_smooth(se = FALSE)
|>
datos pivot_longer(cols = -avg_price_milk) |>
ggplot(aes(x = value, y = avg_price_milk)) +
facet_wrap(~name, ncol = 3, scales = "free_x") +
geom_point() +
geom_smooth(method = "lm")
<- lm(avg_price_milk ~ alfalfa_hay_price, data = datos)
modelo2 <- lm(avg_price_milk ~ milk_feed_price_ratio, data = datos)
modelo3 <- lm(avg_price_milk ~ milk_per_cow, data = datos) modelo4
AIC(modelo1, modelo2, modelo3, modelo4)
<- modelo2$fitted.values
predichos_m2 <- modelo3$fitted.values
predichos_m3 <- modelo4$fitted.values
predichos_m4
rmse_vec(truth = valores_reales, estimate = predichos_m1)
[1] 0.01606818
rmse_vec(truth = valores_reales, estimate = predichos_m2)
[1] 0.01409574
rmse_vec(truth = valores_reales, estimate = predichos_m3)
[1] 0.02553965
rmse_vec(truth = valores_reales, estimate = predichos_m4)
[1] 0.0207514
ggqqplot(modelo1$residuals)
ggqqplot(modelo2$residuals)
ggqqplot(modelo3$residuals)
ggqqplot(modelo4$residuals)
data.frame(residuales = modelo1$residuals,
predichos = modelo1$fitted.values) |>
ggplot(aes(x = predichos, y = residuales)) +
geom_point() +
geom_hline(yintercept = 0,
color = "red",
linetype = 2) +
geom_smooth(se = FALSE)
data.frame(residuales = modelo2$residuals,
predichos = modelo2$fitted.values) |>
ggplot(aes(x = predichos, y = residuales)) +
geom_point() +
geom_hline(yintercept = 0,
color = "red",
linetype = 2) +
geom_smooth(se = FALSE)
data.frame(residuales = modelo3$residuals,
predichos = modelo3$fitted.values) |>
ggplot(aes(x = predichos, y = residuales)) +
geom_point() +
geom_hline(yintercept = 0,
color = "red",
linetype = 2) +
geom_smooth(se = FALSE)
data.frame(residuales = modelo4$residuals,
predichos = modelo4$fitted.values) |>
ggplot(aes(x = predichos, y = residuales)) +
geom_point() +
geom_hline(yintercept = 0,
color = "red",
linetype = 2) +
geom_smooth(se = FALSE)